From a115250b5fcf9187248d11f68e6a43ae2959dabf Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 8 May 2024 14:39:54 +0200 Subject: [PATCH 001/341] Re-integrate HPU after upstream refactors (#20) * Fix setup.py for HPU * Fix vllm._C import ops -> vllm.hpu import ops * more of the same thing * re-add hpex rmsnorm and rope; but rope is crashing * remove unnecessary comments * add vllm/hpu files * add hpu autodetection * Add HabanaAttention stub * revert accidental changes * revert non-habana backend attention changes * add habana attention/worker/executor, sampling fails now * Restore unnecessarily changed files * enable HabanaMemoryProfiler * Make sampler pass * restore habana fused rope * prefill is now working!!! * fix prefill padding; decode is now working!!!!! * revert accidental changes * remove unused stuff in habana_paged_attn.py * remove diagnostic stuff from llm_engine.py * use HabanaExecutorAsync in async_llm_engine.py * add habana copyright headers to habana_*.py files * fix prefill attention conformance * minor naming fixes * remove naive attention from habana_attn (it never worked anyway) * re-enable profile run * Add fake HPUGraph support * add more metrics * indentation fix * ~~recipe cache metrics don't work lalalala~~ * i'm done with metrics for now * fix corner case in which hl-smi is not available but synapse is * FIXME: temporary setup.py workaround * WIP: add tensor parallelism stubs * habana worker cleanup * tensor parallelism is now working * remove unused files * remove unused func * add hpugraphrunner * improve hpu layernorm * Port pipelined PA * Port context length bucketing * remove cudagraphrunner from hpu runner * restore HPUGraphRunner back from FakeHPUGraphRunner * handle rotary embeddings properly on gaudi3 * oopsie! captured_block_counts was incorrect! 
* captured_block_counts.append doesn't do anything * Restore habana_main KV cache memory layout * fix memory profiler * overhaul hpugraph capture * memory profiling overhaul * format memory properly in model warmup * add graph compilation profiler for graph capture phase * adroll back log lvl on graph capture message * Remove unnecessary view on residual connection in RMSNorm (#25) --------- Co-authored-by: madamczykhabana <110973826+madamczykhabana@users.noreply.github.com> --- pyproject.toml | 57 - requirements-hpu.txt | 15 + setup.py | 40 +- vllm/attention/backends/habana_attn.py | 352 +++++ vllm/attention/ops/habana_paged_attn.py | 150 +++ vllm/attention/selector.py | 10 +- vllm/config.py | 4 +- vllm/engine/arg_utils.py | 2 +- vllm/engine/async_llm_engine.py | 8 + vllm/engine/llm_engine.py | 15 +- vllm/engine/ray_utils.py | 26 +- vllm/entrypoints/openai/api_server.py | 2 +- vllm/executor/habana_executor.py | 190 +++ vllm/executor/ray_habana_executor.py | 419 ++++++ vllm/hpu/__init__.py | 6 + vllm/hpu/attn_bias.py | 764 +++++++++++ vllm/hpu/cache_ops.py | 82 ++ vllm/hpu/ops.py | 115 ++ vllm/hpu/rotary_embed.py | 119 ++ vllm/hpu/utils.py | 99 ++ vllm/hpu/xops.py | 66 + vllm/model_executor/layers/activation.py | 6 +- vllm/model_executor/layers/layernorm.py | 23 +- .../model_executor/layers/logits_processor.py | 10 +- .../model_executor/layers/quantization/awq.py | 7 +- .../layers/quantization/gptq.py | 7 +- .../layers/quantization/marlin.py | 7 +- .../layers/quantization/squeezellm.py | 7 +- .../model_executor/layers/rotary_embedding.py | 15 +- vllm/model_executor/models/llama.py | 1 - .../parallel_utils/communication_op.py | 11 +- vllm/model_executor/sampling_metadata.py | 12 +- vllm/utils.py | 53 + vllm/worker/cache_engine.py | 27 +- vllm/worker/habana_model_runner.py | 1168 +++++++++++++++++ vllm/worker/habana_worker.py | 263 ++++ 36 files changed, 4045 insertions(+), 113 deletions(-) delete mode 100644 pyproject.toml create mode 100644 requirements-hpu.txt create mode 100644 vllm/attention/backends/habana_attn.py create mode 100644 vllm/attention/ops/habana_paged_attn.py create mode 100644 vllm/executor/habana_executor.py create mode 100644 vllm/executor/ray_habana_executor.py create mode 100644 vllm/hpu/__init__.py create mode 100644 vllm/hpu/attn_bias.py create mode 100644 vllm/hpu/cache_ops.py create mode 100644 vllm/hpu/ops.py create mode 100644 vllm/hpu/rotary_embed.py create mode 100644 vllm/hpu/utils.py create mode 100644 vllm/hpu/xops.py create mode 100644 vllm/worker/habana_model_runner.py create mode 100644 vllm/worker/habana_worker.py diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 4d6fb5a362fc7..0000000000000 --- a/pyproject.toml +++ /dev/null @@ -1,57 +0,0 @@ -[build-system] -# Should be mirrored in requirements-build.txt -requires = [ - "cmake>=3.21", - "ninja", - "packaging", - "setuptools >= 49.4.0", - "torch == 2.1.2", - "wheel", -] -build-backend = "setuptools.build_meta" - -[tool.ruff] -# Allow lines to be as long as 80. -line-length = 80 - -[tool.ruff.lint] -select = [ - # pycodestyle - "E", - # Pyflakes - "F", - # pyupgrade - # "UP", - # flake8-bugbear - "B", - # flake8-simplify - "SIM", - # isort - # "I", -] -ignore = [ - # star imports - "F405", "F403", - # lambda expression assignment - "E731", - # Loop control variable not used within loop body - "B007", -] - -[tool.mypy] -python_version = "3.8" - -ignore_missing_imports = true - -files = "vllm" -# TODO(woosuk): Include the code from Megatron and HuggingFace. 
-exclude = "vllm/model_executor/parallel_utils/|vllm/model_executor/models/" - - -[tool.codespell] -ignore-words-list = "dout, te, indicies" -skip = "./tests/prompts" - -[tool.isort] -use_parentheses = true -skip_gitignore = true diff --git a/requirements-hpu.txt b/requirements-hpu.txt new file mode 100644 index 0000000000000..26fd05eb42d2a --- /dev/null +++ b/requirements-hpu.txt @@ -0,0 +1,15 @@ +cmake>=3.21 +ninja # For faster builds. +psutil +ray == 2.9.3 +sentencepiece # Required for LLaMA tokenizer. +numpy +fastapi +uvicorn[standard] +pydantic >= 2.0 # Required for OpenAI server. +prometheus_client >= 0.18.0 +pynvml == 11.5.0 +triton >= 2.1.0 +outlines == 0.0.34 +pandas +tabulate \ No newline at end of file diff --git a/setup.py b/setup.py index 9c9a428f94683..673c6e709a8f6 100644 --- a/setup.py +++ b/setup.py @@ -174,8 +174,19 @@ def build_extensions(self) -> None: subprocess.check_call(['cmake', *build_args], cwd=self.build_temp) +def _is_hpu() -> bool: + return True + is_hpu_available = True + try: + subprocess.run(["hl-smi"], capture_output=True, check=True) + except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): + if not os.path.exists('/dev/hl0') and not os.path.exists('/dev/hl_controlD0'): + is_hpu_available = False + return is_hpu_available + + def _is_cuda() -> bool: - return torch.version.cuda is not None and not _is_neuron() + return torch.version.cuda is not None and not _is_neuron() and not _is_hpu() def _is_hip() -> bool: @@ -190,7 +201,6 @@ def _is_neuron() -> bool: torch_neuronx_installed = False return torch_neuronx_installed - def _install_punica() -> bool: return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) @@ -265,6 +275,17 @@ def find_version(filepath: str) -> str: return version_match.group(1) raise RuntimeError("Unable to find version string.") +def get_gaudi_sw_version(): + """ + Returns the driver version. 
+ """ + # Enable console printing for `hl-smi` check + output = subprocess.run( + "hl-smi", shell=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env={"ENABLE_CONSOLE": "true"} + ) + if output.returncode == 0 and output.stdout: + return output.stdout.split("\n")[2].replace(" ", "").split(":")[1][:-1].split("-")[0] + return "0.0.0" # when hl-smi is not available def get_vllm_version() -> str: version = find_version(get_path("vllm", "__init__.py")) @@ -286,6 +307,12 @@ def get_vllm_version() -> str: if neuron_version != MAIN_CUDA_VERSION: neuron_version_str = neuron_version.replace(".", "")[:3] version += f"+neuron{neuron_version_str}" + elif _is_hpu(): + # Get the Intel Gaudi Software Suite version + gaudi_sw_version = str(get_gaudi_sw_version()) + if gaudi_sw_version != MAIN_CUDA_VERSION: + gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3] + version += f"+gaudi{gaudi_sw_version}" else: raise RuntimeError("Unknown runtime environment") @@ -318,9 +345,12 @@ def get_requirements() -> List[str]: elif _is_neuron(): with open(get_path("requirements-neuron.txt")) as f: requirements = f.read().strip().split("\n") + elif _is_hpu(): + with open(get_path("requirements-hpu.txt")) as f: + requirements = f.read().strip().split("\n") else: raise ValueError( - "Unsupported platform, please use CUDA, ROCM or Neuron.") + "Unsupported platform, please use CUDA, ROCM, Neuron or HPU.") return requirements @@ -333,7 +363,7 @@ def get_requirements() -> List[str]: if _install_punica(): ext_modules.append(CMakeExtension(name="vllm._punica_C")) -if not _is_neuron(): +if not (_is_neuron() or _is_hpu()): ext_modules.append(CMakeExtension(name="vllm._C")) package_data = { @@ -369,6 +399,6 @@ def get_requirements() -> List[str]: python_requires=">=3.8", install_requires=get_requirements(), ext_modules=ext_modules, - cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {}, + cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() or _is_hpu() else {}, package_data=package_data, ) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py new file mode 100644 index 0000000000000..844dc92b315ac --- /dev/null +++ b/vllm/attention/backends/habana_attn.py @@ -0,0 +1,352 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company +############################################################################### + +import importlib +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple, Type + +import torch +import vllm.hpu.xops as xops +from vllm.hpu.attn_bias import (AttentionBias, + BlockDiagonalCausalMask, + LowerTriangularMaskWithTensorBias) + +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata) +from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention, + HabanaPagedAttentionMetadata) +from vllm.logger import init_logger +from vllm.utils import is_hip + +logger = init_logger(__name__) + + +class HabanaAttentionBackend(AttentionBackend): + + @staticmethod + def get_impl_cls() -> Type["HabanaAttentionImpl"]: + return HabanaAttentionImpl + + @staticmethod + def make_metadata(*args, **kwargs) -> "HabanaAttentionMetadata": + return HabanaAttentionMetadata(*args, **kwargs) + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return HabanaPagedAttention.get_kv_cache_shape(num_blocks, block_size, + num_kv_heads, head_size) + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: Dict[int, int], + ) -> None: + HabanaPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: Dict[int, List[int]], + ) -> None: + HabanaPagedAttention.copy_blocks(kv_caches, src_to_dists) + + +@dataclass +class HabanaAttentionMetadata(AttentionMetadata, HabanaPagedAttentionMetadata): + """Metadata for HabanaAttentionbackend. + + NOTE: Any python object stored here is not updated when it is + cuda-graph replayed. If you have values that need to be changed + dynamically, it should be stored in tensor. The tensor has to be + updated from `CUDAGraphRunner.forward` API. + """ + # Currently, input sequences can only contain all prompts + # or all decoding. True if all sequences are prompts. + is_prompt: bool + # (num_tokens,). The indices of the token slots that input tokens will be + # stored into. E.g., if `slot_mapping` is [35, 2, 17] and the block size + # is 16, the three tokens are stored in the 3rd slot in block 2, 2nd slot + # in block 0, and 1st slot in block 1, respectively. + slot_mapping: torch.Tensor + # (batch_size,). The prompt length per sequence. None if it is a decoding. + prompt_lens: Optional[List[int]] + # prompt_lens stored as a tensor. + prompt_lens_tensor: Optional[torch.Tensor] + # The number of prompt tokens. Doesn't include padding. + num_prompt_tokens: int + # The number of generation tokens. Doesn't include padding. + num_generation_tokens: int + + # NOTE(sang): Definition of context_len, subquery_len, and seqlen. + # |---------- N-1 iteration --------| + # |---------------- N iteration ---------------------| + # |- tokenA -|......................|-- newTokens ---| + # |---------- context_len ----------| + # |-------------------- seqlen ----------------------| + # |- subquery_len -| + + # WARNING(sang): context_len has different definition depending on if it is + # prefill vs decoding. When it is prefill, it doesn't include new tokens. + # When it is for decoding, it includes a new token. + + # Maximum subquery length in the batch. + max_subquery_len: Optional[int] + # FIXME: It is for flash attn. + # Maximum prompt length in the batch. 
+ max_prompt_len: Optional[int] + # (batch_size + 1,). The cumulative subquery lengths of the sequences in + # the batch, used to index into subquery. E.g., if the subquery length + # is [4, 6], it is [0, 4, 10]. + subquery_start_loc: Optional[torch.Tensor] + # FIXME: It is for flash attn. + # (batch_size + 1,). The cumulative sequence lengths of the sequences in + # the batch, used to index into sequence. E.g., if the sequence length is + # [4, 6], it is [0, 4, 10]. + seq_start_loc: Optional[torch.Tensor] + + # Whether or not if cuda graph is enabled. + # Cuda-graph is currently enabled for decoding only. + # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. + use_cuda_graph: bool + + def __post_init__(self): + # Set during the execution of the first attention op. + # It is a list because it is needed to set per prompt + # when alibi slopes is used. It is because of the limitation + # from xformer API. + # will not appear in the __repr__ and __init__ + self.attn_bias: Optional[List[AttentionBias]] = None + + +class HabanaAttentionImpl(AttentionImpl): + """ + If the input tensors contain prompt tokens, the layout is as follows: + |<--------------- num_prompt_tokens --------------->| + |<--prompt_0-->|<--prompt_1-->|...|<--prompt_N-1--->| + + Otherwise, the layout is as follows: + |<------------------ num_generation_tokens (M) ----------------->| + |<--generation_0-->|..........|<--generation_M-1-->|<--padding-->| + + Generation tokens can contain padding when cuda-graph is used. + Currently, prompt tokens don't contain any padding. + + The prompts might have different lengths, while the generation tokens + always have length 1. + """ + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: Optional[int] = None, + alibi_slopes: Optional[List[float]] = None, + sliding_window: Optional[int] = None, + ) -> None: + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + self.sliding_window = sliding_window + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.alibi_slopes = alibi_slopes + + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + + suppored_head_sizes = HabanaPagedAttention.get_supported_head_sizes() + if head_size not in suppored_head_sizes: + raise ValueError( + f"Head size {head_size} is not supported by PagedAttention. " + f"Supported head sizes are: {suppored_head_sizes}.") + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: Optional[torch.Tensor], + attn_metadata: HabanaAttentionMetadata, + ) -> torch.Tensor: + """Forward pass with xFormers and PagedAttention. + + Args: + query: shape = [num_tokens, num_heads * head_size] + key: shape = [num_tokens, num_kv_heads * head_size] + value: shape = [num_tokens, num_kv_heads * head_size] + kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + attn_metadata: Metadata for attention. 
+ Returns: + shape = [num_tokens, num_heads * head_size] + """ + batch_size, seq_len, hidden_size = query.shape + _, seq_len_kv, _ = key.shape + + + query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + if kv_cache is not None: + key_cache, value_cache = HabanaPagedAttention.split_kv_cache( + kv_cache, self.num_kv_heads, self.head_size) + + # Reshape the input keys and values and store them in the cache. + # If kv_cache is not provided, the new key and value tensors are + # not cached. This happens during the initial memory profiling run. + HabanaPagedAttention.write_to_paged_cache(key, value, key_cache, + value_cache, + attn_metadata.slot_mapping, + attn_metadata.kv_cache_dtype, + attn_metadata.is_prompt) + + if attn_metadata.is_prompt: + # Prompt run. + if kv_cache is None or attn_metadata.block_tables.numel() == 0: + # normal attention. + # block tables are empty if the prompt does not have a cached + # prefix. + if self.num_kv_heads != self.num_heads: + # As of Nov 2023, xformers only supports MHA. For MQA/GQA, + # project the key and value tensors to the desired number of + # heads. + # TODO(woosuk): Use MQA/GQA kernels for higher performance. + query = query.view(query.shape[0], self.num_kv_heads, + self.num_queries_per_kv, + query.shape[-1]) + key = key[:, :, + None, :].expand(key.shape[0], self.num_kv_heads, + self.num_queries_per_kv, + key.shape[-1]) + value = value[:, :, + None, :].expand(value.shape[0], + self.num_kv_heads, + self.num_queries_per_kv, + value.shape[-1]) + + if attn_metadata.attn_bias is None: + if self.alibi_slopes is None: + attn_bias = BlockDiagonalCausalMask.from_seqlens( + [seq_len] * batch_size) + if self.sliding_window is not None: + attn_bias = attn_bias.make_local_attention( + self.sliding_window) + attn_metadata.attn_bias = attn_bias + else: + attn_metadata.attn_bias = _make_alibi_bias( + self.alibi_slopes, self.num_kv_heads, batch_size, + seq_len, query.dtype) + query_shape = (batch_size, seq_len, self.num_kv_heads, self.num_queries_per_kv, self.head_size) if self.num_kv_heads != self.num_heads else (batch_size, seq_len, self.num_heads, self.head_size) + kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, self.num_queries_per_kv, self.head_size) if self.num_kv_heads != self.num_heads else (batch_size, seq_len_kv, self.num_kv_heads, self.head_size) + out = xops.memory_efficient_attention_forward( + query.view(query_shape), + key.view(kv_shape), + value.view(kv_shape), + attn_bias=attn_metadata.attn_bias, + p=0.0, + scale=self.scale, + ) + output = out.reshape(batch_size, seq_len, hidden_size) + else: + # prefix-enabled attention + output = HabanaPagedAttention.forward_prefix( + query, + key, + value, + key_cache, + value_cache, + attn_metadata.block_tables, + attn_metadata.subquery_start_loc, + attn_metadata.prompt_lens_tensor, + attn_metadata.context_lens, + attn_metadata.max_subquery_len, + self.alibi_slopes, + ) + else: + # Decoding run. + output = HabanaPagedAttention.forward_decode( + query, + key_cache, + value_cache, + attn_metadata.block_tables, + attn_metadata.context_lens, + attn_metadata.max_context_len, + attn_metadata.kv_cache_dtype, + self.num_kv_heads, + self.scale, + self.alibi_slopes, + ) + + # Reshape the output tensor. 
+ return output.view(batch_size, seq_len, hidden_size) + + +def _make_alibi_bias( + alibi_slopes: torch.Tensor, + num_kv_heads: int, + dtype: torch.dtype, + prompt_lens: List[int], +) -> LowerTriangularMaskWithTensorBias: + attn_biases = [] + for prompt_len in prompt_lens: + bias = torch.arange(prompt_len, dtype=dtype) + # NOTE(zhuohan): HF uses + # `bias = bias[None, :].repeat(prompt_len, 1)` + # here. We find that both biases give the same results, but + # the bias below more accurately follows the original ALiBi + # paper. + # Calculate a matrix where each element represents ith element- jth + # element. + bias = bias[None, :] - bias[:, None] + + padded_len = (prompt_len + 7) // 8 * 8 + num_heads = alibi_slopes.shape[0] + bias = torch.empty( + 1, # batch size + num_heads, + prompt_len, + padded_len, + device=alibi_slopes.device, + dtype=dtype, + )[:, :, :, :prompt_len].copy_(bias) + bias.mul_(alibi_slopes[:, None, None]) + if num_heads != num_kv_heads: + bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads)) + attn_biases.append(LowerTriangularMaskWithTensorBias(bias)) + + return attn_biases + + +def _naive_masked_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + num_heads: int, + num_kv_heads: int, + head_size: int, + scale: float, +) -> torch.Tensor: + query = query.view(-1, num_heads, head_size) + key = key.view(-1, num_kv_heads, head_size) + value = value.view(-1, num_kv_heads, head_size) + seq_len, _, _ = query.shape + attn_mask = torch.triu(torch.ones(seq_len, + seq_len, + dtype=query.dtype, + device=query.device), + diagonal=1) + attn_mask = attn_mask * torch.finfo(query.dtype).min + + attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float() + attn_weights = attn_weights + attn_mask.float() + attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) + out = torch.einsum("hqk,khd->qhd", attn_weights, value) + return out diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py new file mode 100644 index 0000000000000..03027bb01565c --- /dev/null +++ b/vllm/attention/ops/habana_paged_attn.py @@ -0,0 +1,150 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +############################################################################### + +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch + +from vllm.hpu import cache_ops, ops + +# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. +_PARTITION_SIZE = 512 + + +@dataclass +class HabanaPagedAttentionMetadata: + """Metadata for PagedAttention.""" + # (num_tokens,). The indices of the token slots that input tokens will be + # stored into. E.g., if `slot_mapping` is [35, 2, 17] and the block size + # is 16, the three tokens are stored in the 3rd slot in block 2, 2nd slot + # in block 0, and 1st slot in block 1, respectively. + slot_mapping: torch.Tensor + # (batch_size,). The length of context (tokens stored in KV cache) per + # sequence. WARNING: When it is a prefill request, it doesn't include new + # tokens. When it is for decoding, it includes a new token. + context_lens: Optional[torch.Tensor] + # Maximum context length in the batch. + max_context_len: Optional[int] + # (batch_size, max_blocks_per_seq). + # Block addresses per sequence. (Seq id -> list of physical block) + # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks + # in the kv cache. 
Each block can contain up to block_size tokens. + # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph + # captured. + block_tables: Optional[torch.Tensor] + kv_cache_dtype: str + + +class HabanaPagedAttention: + + @staticmethod + def get_supported_head_sizes() -> List[int]: + return [64, 80, 96, 112, 128, 256] + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return (num_blocks, num_kv_heads, head_size, block_size) + + @staticmethod + def split_kv_cache( + kv_cache: torch.Tensor, + num_kv_heads: int, + head_size: int, + ) -> Tuple[torch.Tensor, torch.Tensor]: + key_cache = kv_cache[0] + value_cache = kv_cache[1] + return key_cache, value_cache + + @staticmethod + def write_to_paged_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, + kv_cache_dtype: str, + is_prompt: bool + ) -> None: + cache_ops.reshape_and_cache( + key, + value, + key_cache, + value_cache, + slot_mapping, + kv_cache_dtype, + is_prompt + ) + + @staticmethod + def forward_decode( + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_tables: torch.Tensor, + context_lens: torch.Tensor, + max_context_len: int, + kv_cache_dtype: str, + num_kv_heads: int, + scale: float, + alibi_slopes: Optional[torch.Tensor], + ) -> torch.Tensor: + block_size = value_cache.shape[3] + return ops.paged_attention_v1( + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + context_lens, + block_size, + max_context_len, + alibi_slopes, + kv_cache_dtype, + ) + + @staticmethod + def forward_prefix( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_tables: torch.Tensor, + subquery_start_loc: torch.Tensor, + prompt_lens_tensor: torch.Tensor, + context_lens: torch.Tensor, + max_subquery_len: int, + alibi_slopes: Optional[torch.Tensor], + ) -> torch.Tensor: + raise NotImplementedError("forward_prefix is not implemented for HabanaPagedAttention") + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: Dict[int, int], + ) -> None: + src_key_cache = src_kv_cache[0] + dst_key_cache = dst_kv_cache[0] + cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) + + src_value_cache = src_kv_cache[1] + dst_value_cache = dst_kv_cache[1] + cache_ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst) + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: Dict[int, List[int]], + ) -> None: + key_caches = [kv_cache[0] for kv_cache in kv_caches] + value_caches = [kv_cache[1] for kv_cache in kv_caches] + cache_ops.copy_blocks(key_caches, value_caches, src_to_dists) diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 90fce1a0349b2..1f68e0aad7b59 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -4,7 +4,7 @@ from vllm.attention.backends.abstract import AttentionBackend from vllm.logger import init_logger -from vllm.utils import is_hip +from vllm.utils import is_hip, is_hpu logger = init_logger(__name__) @@ -16,6 +16,11 @@ def get_attn_backend(dtype: torch.dtype) -> AttentionBackend: from vllm.attention.backends.flash_attn import ( # noqa: F401 FlashAttentionBackend) return FlashAttentionBackend + elif is_hpu(): + logger.info("Using HabanaAttention backend.") + from 
vllm.attention.backends.habana_attn import ( # noqa: F401 + HabanaAttentionBackend) + return HabanaAttentionBackend else: logger.info("Using XFormers backend.") from vllm.attention.backends.xformers import ( # noqa: F401 @@ -28,6 +33,9 @@ def _can_use_flash_attn(dtype: torch.dtype) -> bool: # AMD GPUs. logger.info("Cannot use FlashAttention backend for AMD GPUs.") return False + if is_hpu(): + logger.info("Cannot use FlashAttention backend for HPUs.") + return False if torch.cuda.get_device_capability()[0] < 8: # Volta and Turing NVIDIA GPUs. logger.info("Cannot use FlashAttention backend for Volta and Turing " diff --git a/vllm/config.py b/vllm/config.py index 6070d9d9e50f1..17e25fa2fc00f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -9,7 +9,7 @@ from vllm.logger import init_logger from vllm.transformers_utils.config import get_config -from vllm.utils import get_cpu_memory, get_nvcc_cuda_version, is_hip, is_neuron +from vllm.utils import get_cpu_memory, get_nvcc_cuda_version, is_hip, is_neuron, is_hpu if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -563,6 +563,8 @@ def __init__(self, device: str = "auto") -> None: # Automated device type detection if is_neuron(): self.device_type = "neuron" + elif is_hpu(): + self.device_type = "hpu" else: # We don't call torch.cuda.is_available() here to # avoid initializing CUDA before workers are forked diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index fc6665dbe64bc..be36db2176d05 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -303,7 +303,7 @@ def add_cli_args( parser.add_argument("--device", type=str, default=EngineArgs.device, - choices=["auto", "cuda", "neuron"], + choices=["auto", "cuda", "neuron", 'hpu'], help='Device type for vLLM execution.') parser.add_argument( '--scheduler-delay-factor', diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index d642915aee192..6b1b9ea32ff76 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -330,6 +330,14 @@ def from_engine_args(cls, if device_config.device_type == "neuron": raise NotImplementedError("Neuron is not supported for " "async engine yet.") + elif device_config.device_type == "hpu": + if parallel_config.worker_use_ray or engine_args.engine_use_ray: + initialize_ray_cluster(parallel_config) + from vllm.executor.ray_habana_executor import RayHabanaExecutorAsync + executor_class = RayHabanaExecutorAsync + else: + from vllm.executor.habana_executor import HabanaExecutorAsync + executor_class = HabanaExecutorAsync elif parallel_config.worker_use_ray or engine_args.engine_use_ray: initialize_ray_cluster(parallel_config) from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f9638d1101906..56941f876a233 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -66,7 +66,7 @@ def __init__( log_stats: bool, ) -> None: logger.info( - f"Initializing an LLM engine (v{vllm.__version__}) with config: " + f"Initializing an LLM engine with config: " f"model={model_config.model!r}, " f"tokenizer={model_config.tokenizer!r}, " f"tokenizer_mode={model_config.tokenizer_mode}, " @@ -132,6 +132,14 @@ def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine": if device_config.device_type == "neuron": from vllm.executor.neuron_executor import NeuronExecutor executor_class = NeuronExecutor + elif device_config.device_type == "hpu": + if parallel_config.worker_use_ray: + 
initialize_ray_cluster(parallel_config) + from vllm.executor.ray_habana_executor import RayHabanaExecutor + executor_class = RayHabanaExecutor + else: + from vllm.executor.habana_executor import HabanaExecutor + executor_class = HabanaExecutor elif parallel_config.worker_use_ray: initialize_ray_cluster(parallel_config) from vllm.executor.ray_gpu_executor import RayGPUExecutor @@ -141,7 +149,6 @@ def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine": "Ray is required if parallel_config.world_size > 1.") from vllm.executor.gpu_executor import GPUExecutor executor_class = GPUExecutor - # Create the LLM engine. engine = cls(*engine_configs, executor_class=executor_class, @@ -419,7 +426,9 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, self.detokenizer.decode_sequence_inplace(seq, seq_group.sampling_params) self._check_stop(seq, seq_group.sampling_params) - + #emitted_token = seq.tokens[seq.prefix_offset:] if not seq.status == SequenceStatus.FINISHED_STOPPED else '' + #print(f'[{seq.status}] Emitted token: {emitted_token} ({seq.get_token_ids()[-1]}) ({seq.output_text!r})') + # Non-beam search case if not seq_group.sampling_params.use_beam_search: # For newly created child sequences, add them to the sequence group diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py index 70d5c9b1fae05..991f1b9e443f1 100644 --- a/vllm/engine/ray_utils.py +++ b/vllm/engine/ray_utils.py @@ -1,9 +1,9 @@ import pickle from typing import List, Optional, Tuple - +import os from vllm.config import ParallelConfig from vllm.logger import init_logger -from vllm.utils import get_ip, is_hip, set_cuda_visible_devices +from vllm.utils import get_ip, is_hip, is_hpu, set_cuda_visible_devices logger = init_logger(__name__) @@ -100,8 +100,10 @@ def initialize_ray_cluster( ignore_reinit_error=True, num_gpus=parallel_config.world_size) else: - ray.init(address=ray_address, ignore_reinit_error=True) - + ray.init(address=ray_address, ignore_reinit_error=True, + log_to_driver=not os.environ.get('VLLM_RAY_DISABLE_LOG_TO_DRIVER', '0') != '0') + ray_accel_name = "HPU" if is_hpu() else "GPU" + if parallel_config.placement_group: # Placement group is already set. return @@ -114,24 +116,24 @@ def initialize_ray_cluster( # Verify that we can use the placement group. 
gpu_bundles = 0 for bundle in bundles: - bundle_gpus = bundle.get("GPU", 0) + bundle_gpus = bundle.get(ray_accel_name, 0) if bundle_gpus > 1: raise ValueError( - "Placement group bundle cannot have more than 1 GPU.") + f"Placement group bundle cannot have more than 1 {ray_accel_name}.") if bundle_gpus: gpu_bundles += 1 if parallel_config.world_size > gpu_bundles: raise ValueError( - "The number of required GPUs exceeds the total number of " - "available GPUs in the placement group.") + f"The number of required {ray_accel_name}s exceeds the total number of " + f"available {ray_accel_name}s in the placement group.") else: - num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0) + num_gpus_in_cluster = ray.cluster_resources().get(ray_accel_name, 0) if parallel_config.world_size > num_gpus_in_cluster: raise ValueError( - "The number of required GPUs exceeds the total number of " - "available GPUs in the cluster.") + f"The number of required {ray_accel_name}s exceeds the total number of " + f"available {ray_accel_name}s in the cluster.") # Create a new placement group - placement_group_specs = ([{"GPU": 1}] * parallel_config.world_size) + placement_group_specs = ([{ray_accel_name: 1}] * parallel_config.world_size) current_placement_group = ray.util.placement_group( placement_group_specs) # Wait until PG is ready - this will block until all diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 06e8bdf11abd3..9eb9a654d2b41 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -144,7 +144,7 @@ async def authentication(request: Request, call_next): raise ValueError(f"Invalid middleware {middleware}. " f"Must be a function or a class.") - logger.info(f"vLLM API server version {vllm.__version__}") + logger.info(f"vLLM API server version") logger.info(f"args: {args}") if args.served_model_name is not None: diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py new file mode 100644 index 0000000000000..dd211eadbea78 --- /dev/null +++ b/vllm/executor/habana_executor.py @@ -0,0 +1,190 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +############################################################################### + +from typing import Dict, List, Optional + +from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig) +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.utils import check_block_size_valid +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + make_async, HabanaMemoryProfiler, format_bytes) +import os +import contextlib +logger = init_logger(__name__) + + +class HabanaExecutor(ExecutorBase): + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + ) -> None: + self.model_config = model_config + self.cache_config = cache_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + + # Instantiate the worker and load the model to GPU. 
+ self._init_worker() + + # Profile the memory usage and initialize the cache. + self._init_cache() + + def _init_worker(self): + # Lazy import the Worker to avoid importing torch.cuda/xformers + # before CUDA_VISIBLE_DEVICES is set in the Worker + from vllm.worker.habana_worker import HabanaWorker + + assert self.parallel_config.world_size == 1, ( + "HabanaExecutor only supports single GPU.") + + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + self.driver_worker = HabanaWorker( + self.model_config, + self.parallel_config, + self.scheduler_config, + self.device_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=True, + ) + self.driver_worker.init_device() + self.driver_worker.load_model() + + def _init_cache(self) -> None: + """Profiles the memory usage and initializes the KV cache. + + The engine first profiles the existing memory usage. + Then, it allocates the remaining memory for KV blocks. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + """ + # Get the maximum number of blocks that can be allocated on GPU and CPU. + num_gpu_blocks, num_cpu_blocks = ( + self.driver_worker.profile_num_available_blocks( + block_size=self.cache_config.block_size, + hpu_memory_utilization=self.cache_config. + gpu_memory_utilization, + cpu_swap_space=self.cache_config.swap_space_bytes, + cache_dtype=self.cache_config.cache_dtype, + )) + + logger.info(f"# HPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") + + check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + # Initialize the cache. + with HabanaMemoryProfiler() as cache_init_m: + self.driver_worker.init_cache_engine(cache_config=self.cache_config) + logger.info(f"init_cache_engine took " + f"{format_bytes(cache_init_m.consumed_memory)} ({cache_init_m.consumed_memory/HabanaMemoryProfiler.total_memory():.2%} of total memory, gpu_memory_utilization: {self.cache_config.gpu_memory_utilization}, {format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") + + # Warm up the model. This includes capturing the model into CUDA graph + # if enforce_eager is False. + with HabanaMemoryProfiler() as warmup_m: + self.driver_worker.warm_up_model() + logger.info(f"Model warmup took " + f"{format_bytes(warmup_m.consumed_memory)} ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") + + def execute_model(self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + + # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! 
+ # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none + # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any + # VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL - will log cpu fallbacks per engine step, always, even if there were none + log_graph_compilation_all = os.environ.get('VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0' + log_graph_compilation = os.environ.get('VLLM_HPU_LOG_STEP_GRAPH_COMPILATION', '0') != '0' or log_graph_compilation_all + log_cpu_fallbacks_all = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0' + log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS', '0') != '0' or log_cpu_fallbacks_all + if log_graph_compilation or log_cpu_fallbacks: + from habana_frameworks.torch.hpu.metrics import metric_localcontext + is_prompt = any([seq_group_metadata.is_prompt for seq_group_metadata in seq_group_metadata_list]) + max_context_len = max([max([len(v.prompt_token_ids) + len(v.output_token_ids) for v in seq_group_metadata.seq_data.values()]) for seq_group_metadata in seq_group_metadata_list]) # whoa, that's some spicy stuff right here + max_num_blocks = ((max_context_len - 1) // self.cache_config.block_size) + 1 + input_stats = f'is_prompt: {is_prompt}, num_seqs: {len(seq_group_metadata_list)} max_context_len: {max_context_len}, max_num_blocks {max_num_blocks}' + gc_ctx = metric_localcontext("graph_compilation") if log_graph_compilation else contextlib.nullcontext() + cpu_fallback_ctx = metric_localcontext("cpu_fallback") if log_cpu_fallbacks else contextlib.nullcontext() + with gc_ctx as gc_local_metric, cpu_fallback_ctx as cpu_fallback_local_metric: + output = self.driver_worker.execute_model( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0) or log_graph_compilation_all: + logger.warning(f"VLLM_HPU_STEP_GRAPH_COMPILATION: {gc_local_metric.stats()}, {input_stats}") + if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] > 0) or log_cpu_fallbacks_all: + logger.warning(f"VLLM_HPU_STEP_CPU_FALLBACK: {cpu_fallback_local_metric.stats()}, {input_stats}") + + return output + + output = self.driver_worker.execute_model( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + def remove_lora(self, lora_id: int) -> bool: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + def list_loras(self) -> List[int]: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + def check_health(self) -> None: + # GPUExecutor will always be healthy as long as + # it's running. 
+ return + + +class HabanaExecutorAsync(HabanaExecutor, ExecutorAsyncBase): + + async def execute_model_async( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> SamplerOutput: + output = await make_async(self.driver_worker.execute_model)( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy) + return output + + async def check_health_async(self) -> None: + # GPUExecutor will always be healthy as long as + # it's running. + return diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py new file mode 100644 index 0000000000000..dac8eefb18adc --- /dev/null +++ b/vllm/executor/ray_habana_executor.py @@ -0,0 +1,419 @@ +import asyncio +import copy +import os +import pickle +from collections import defaultdict +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig) +from vllm.engine.ray_utils import RayWorkerVllm, ray +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.utils import check_block_size_valid +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + make_async) + +if ray is not None: + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + +logger = init_logger(__name__) + +# If the env var is set, it uses the Ray's compiled DAG API +# which optimizes the control plane overhead. +# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. +USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)) + + +class RayHabanaExecutor(ExecutorBase): + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + ) -> None: + self.model_config = model_config + self.cache_config = cache_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + + assert self.parallel_config.worker_use_ray + placement_group = self.parallel_config.placement_group + + # Disable Ray usage stats collection. + ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") + if ray_usage != "1": + os.environ["RAY_USAGE_STATS_ENABLED"] = "0" + + # Create the parallel GPU workers. + self._init_workers_ray(placement_group) + + # Profile the memory usage and initialize the cache. + self._init_cache() + + self.forward_dag = None + if USE_RAY_COMPILED_DAG: + self.forward_dag = self._compiled_ray_dag() + + def _init_workers_ray(self, placement_group: "PlacementGroup", + **ray_remote_kwargs): + if self.parallel_config.tensor_parallel_size == 1: + # For single GPU case, we use a ray worker with constrained memory. + num_gpus = self.cache_config.gpu_memory_utilization + else: + # Otherwise, the ray workers are allocated with a full GPU. + num_gpus = 1 + + # The driver dummy worker does not actually use any resources. + # It holds the resource for the driver worker. 
+ self.driver_dummy_worker: RayWorkerVllm = None + # The remaining workers are the actual ray actors. + self.workers: List[RayWorkerVllm] = [] + + # Create the workers. + driver_ip = get_ip() + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if not bundle.get("HPU", 0): + continue + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_id, + ) + worker = ray.remote( + num_cpus=0, + num_gpus=0, + resources={'HPU': num_gpus}, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerVllm).remote(self.model_config.trust_remote_code) + + worker_ip = ray.get(worker.get_node_ip.remote()) + if worker_ip == driver_ip and self.driver_dummy_worker is None: + # If the worker is on the same node as the driver, we use it + # as the resource holder for the driver process. + self.driver_dummy_worker = worker + else: + # Else, added to the list of workers. + self.workers.append(worker) + + if self.driver_dummy_worker is None: + raise ValueError( + "Ray does not allocate any GPUs on the driver node. Consider " + "adjusting the Ray placement group or running the driver on a " + "GPU node.") + + # Get the set of GPU IDs used on each node. + driver_node_id, driver_gpu_ids = ray.get( + self.driver_dummy_worker.get_node_and_gpu_ids.remote()) + worker_node_and_gpu_ids = ray.get( + [worker.get_node_and_gpu_ids.remote() for worker in self.workers]) + + node_workers = defaultdict(list) + node_gpus = defaultdict(list) + + node_workers[driver_node_id].append(0) + node_gpus[driver_node_id].extend(driver_gpu_ids) + for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids, + start=1): + node_workers[node_id].append(i) + node_gpus[node_id].extend(gpu_ids) + for node_id, gpu_ids in node_gpus.items(): + node_gpus[node_id] = sorted(gpu_ids) + + distributed_init_method = get_distributed_init_method( + driver_ip, get_open_port()) + + # Lazy import the Worker to avoid importing torch.cuda/xformers + # before CUDA_VISIBLE_DEVICES is set in the Worker + from vllm.worker.habana_worker import HabanaWorker + + model_config = copy.deepcopy(self.model_config) + parallel_config = copy.deepcopy(self.parallel_config) + scheduler_config = copy.deepcopy(self.scheduler_config) + device_config = copy.deepcopy(self.device_config) + lora_config = copy.deepcopy(self.lora_config) + kv_cache_dtype = self.cache_config.cache_dtype + + # Initialize the actual workers with the Worker class. + for rank, (worker, (node_id, _)) in enumerate( + zip(self.workers, worker_node_and_gpu_ids), + start=1, + ): + local_rank = node_workers[node_id].index(rank) + worker.init_worker.remote( + lambda rank=rank, local_rank=local_rank: HabanaWorker( + model_config, + parallel_config, + scheduler_config, + device_config, + local_rank, + rank, + distributed_init_method, + lora_config=lora_config, + kv_cache_dtype=kv_cache_dtype, + )) + + # Initialize the driver worker with the Worker class. + driver_rank = 0 + driver_local_rank = node_workers[driver_node_id].index(driver_rank) + self.driver_worker = HabanaWorker( + self.model_config, + self.parallel_config, + self.scheduler_config, + self.device_config, + driver_local_rank, + driver_rank, + distributed_init_method, + lora_config=self.lora_config, + kv_cache_dtype=kv_cache_dtype, + is_driver_worker=True, + ) + + self._run_workers("init_device") + self._run_workers( + "load_model", + max_concurrent_workers=self.parallel_config. 
+ max_parallel_loading_workers, + ) + + def _init_cache(self) -> None: + """Profiles the memory usage and initializes the KV cache. + + The engine will first conduct a profiling of the existing memory usage. + Then, it calculate the maximum possible number of GPU and CPU blocks + that can be allocated with the remaining free memory. + More details can be found in the + :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method + from class :class:`~vllm.worker.Worker`. + + Afterwards, as there may be multiple workers, + we take the minimum number of blocks across all workers + to ensure this can be applied to all of them. + + Finally, the engine will initialize the KV cache + with the calculated number of blocks. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + """ + # Get the maximum number of blocks that can be allocated on GPU and CPU. + num_blocks = self._run_workers( + "profile_num_available_blocks", + block_size=self.cache_config.block_size, + hpu_memory_utilization=self.cache_config.gpu_memory_utilization, + cpu_swap_space=self.cache_config.swap_space_bytes, + cache_dtype=self.cache_config.cache_dtype, + ) + + # Since we use a shared centralized controller, we take the minimum + # number of blocks across all workers to make sure all the memory + # operators can be applied to all workers. + num_gpu_blocks = min(b[0] for b in num_blocks) + num_cpu_blocks = min(b[1] for b in num_blocks) + logger.info(f"# HPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") + + check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + # Initialize the cache. + self._run_workers("init_cache_engine", cache_config=self.cache_config) + # Warm up the model. This includes capturing the model into CUDA graph + # if enforce_eager is False. + self._run_workers("warm_up_model") + + def execute_model(self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + all_outputs = self._run_workers( + "execute_model", + driver_kwargs={ + "seq_group_metadata_list": seq_group_metadata_list, + "blocks_to_swap_in": blocks_to_swap_in, + "blocks_to_swap_out": blocks_to_swap_out, + "blocks_to_copy": blocks_to_copy, + }, + use_ray_compiled_dag=USE_RAY_COMPILED_DAG) + + # Only the driver worker returns the sampling results. + output = all_outputs[0] + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." + return self._run_workers( + "add_lora", + lora_request=lora_request, + ) + + def remove_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." 
+ return self._run_workers( + "remove_lora", + lora_id=lora_id, + ) + + def list_loras(self) -> List[int]: + return self._run_workers("list_loras") + + def _run_workers( + self, + method: str, + *args, + driver_args: Optional[List[Any]] = None, + driver_kwargs: Optional[Dict[str, Any]] = None, + max_concurrent_workers: Optional[int] = None, + use_ray_compiled_dag: bool = False, + **kwargs, + ) -> Any: + """Runs the given method on all workers.""" + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + if use_ray_compiled_dag: + # Right now, compiled DAG can only accept a single + # input. TODO(sang): Fix it. + output_channels = self.forward_dag.execute(1) + else: + # Start the ray workers first. + ray_worker_outputs = [ + worker.execute_method.remote(method, *args, **kwargs) + for worker in self.workers + ] + + if driver_args is None: + driver_args = args + if driver_kwargs is None: + driver_kwargs = kwargs + + # Start the driver worker after all the ray workers. + driver_worker_output = getattr(self.driver_worker, + method)(*driver_args, **driver_kwargs) + + # Get the results of the ray workers. + if self.workers: + if use_ray_compiled_dag: + try: + ray_worker_outputs = [ + pickle.loads(chan.begin_read()) + for chan in output_channels + ] + finally: + # Has to call end_read in order to reuse the DAG. + for chan in output_channels: + chan.end_read() + else: + ray_worker_outputs = ray.get(ray_worker_outputs) + + return [driver_worker_output] + ray_worker_outputs + + def _compiled_ray_dag(self): + import pkg_resources + required_version = "2.9" + current_version = pkg_resources.get_distribution("ray").version + if current_version < required_version: + raise ValueError(f"Ray version {required_version} or greater is " + f"required, but found {current_version}") + + from ray.dag import InputNode, MultiOutputNode + assert self.parallel_config.worker_use_ray + + # Right now, compiled DAG requires at least 1 arg. We send + # a dummy value for now. It will be fixed soon. + with InputNode() as input_data: + forward_dag = MultiOutputNode([ + worker.execute_model_compiled_dag_remote.bind(input_data) + for worker in self.workers + ]) + return forward_dag.experimental_compile() + + def check_health(self) -> None: + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() + + def _check_if_any_actor_is_dead(self): + if not self.workers: + return + + dead_actors = [] + for actor in self.workers: + actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access + if actor_state["State"] == "DEAD": + dead_actors.append(actor) + if dead_actors: + raise RuntimeError("At least one Worker is dead. " + f"Dead Workers: {dead_actors}. ") + + +class RayHabanaExecutorAsync(RayHabanaExecutor, ExecutorAsyncBase): + + async def _run_workers_async( + self, + method: str, + *args, + driver_args: Optional[List[Any]] = None, + driver_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> Any: + """Runs the given method on all workers.""" + coros = [] + + if driver_args is None: + driver_args = args + if driver_kwargs is None: + driver_kwargs = kwargs + + # Run the driver worker asynchronously. + driver_executor = make_async(getattr(self.driver_worker, method)) + coros.append(driver_executor(*driver_args, **driver_kwargs)) + + # Run the ray workers asynchronously. 
+ for worker in self.workers: + coros.append(worker.execute_method.remote(method, *args, **kwargs)) + + all_outputs = await asyncio.gather(*coros) + return all_outputs + + async def execute_model_async( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> SamplerOutput: + all_outputs = await self._run_workers_async( + "execute_model", + driver_kwargs={ + "seq_group_metadata_list": seq_group_metadata_list, + "blocks_to_swap_in": blocks_to_swap_in, + "blocks_to_swap_out": blocks_to_swap_out, + "blocks_to_copy": blocks_to_copy, + }) + + # Only the driver worker returns the sampling results. + output = all_outputs[0] + return output + + async def check_health_async(self) -> None: + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() diff --git a/vllm/hpu/__init__.py b/vllm/hpu/__init__.py new file mode 100644 index 0000000000000..b8e4d3aac98a7 --- /dev/null +++ b/vllm/hpu/__init__.py @@ -0,0 +1,6 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. +############################################################################### diff --git a/vllm/hpu/attn_bias.py b/vllm/hpu/attn_bias.py new file mode 100644 index 0000000000000..ff508a59cc56a --- /dev/null +++ b/vllm/hpu/attn_bias.py @@ -0,0 +1,764 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. + + +import math +from dataclasses import dataclass +from typing import Any, Iterable, List, Optional, Sequence, Tuple, Union + +import torch + + +class AttentionBias: + """Base class for a custom bias that can be applied \ + as the attn_bias argument in + :attr:`xformers.ops.memory_efficient_attention`. + + That function has the ability to add a tensor, the + attention bias, to the QK^T matrix before it is used + in the softmax part of the attention calculation. + The attention bias tensor with shape + (B or 1, n_queries, number of keys) + can be given as the attn_bias input. + The most common use case is for an attention bias is + to contain only zeros and negative infinities, which forms + a mask so that some queries only attend to some keys. + + Children of this class define alternative things which can + be used as the attn_bias input to define an attention bias which + forms such a mask, for some common cases. + + When using an :attr:`xformers.ops.AttentionBias` + instead of a :attr:`torch.Tensor`, the mask matrix does + not need to be materialized, and can be + hardcoded into some kernels for better performance. + + See: + + - :attr:`xformers.ops.fmha.attn_bias.LowerTriangularMask` + - :attr:`xformers.ops.fmha.attn_bias.LowerTriangularMaskWithTensorBias` + - :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask` + - :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask` + + """ + + def materialize( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + """ + Materializes the bias as a `torch.Tensor`. This is very slow + and we don't attempt to make it fast. Only use for debugging/testing. 
+ + Shape should be like `[*, q_seqlen, k_seqlen]` + """ + raise NotImplementedError() + + +class LowerTriangularMask(AttentionBias): + """ + A lower-triangular (aka causal) mask + + A query Q cannot attend to a key which is farther from the + initial key than Q is from the initial query. + """ + + def __init__(self, *tensor_args, **tensor_kwargs) -> None: + # NOTE: Unused arguments, we keep them for backward compatibility + super().__init__() + + def materialize( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + create_as = dtype if dtype is not torch.bfloat16 else torch.float32 + tensor = torch.full( # type: ignore + shape, + dtype=create_as, + fill_value=float("-inf"), + device=device, + ) + return torch.triu(tensor, diagonal=1).to(dtype) # type: ignore + + def add_bias(self, bias: torch.Tensor) -> "LowerTriangularMaskWithTensorBias": + return LowerTriangularMaskWithTensorBias(bias) + + +class LowerTriangularMaskWithTensorBias(LowerTriangularMask): + """A lower-triangular (aka causal) mask with an additive bias""" + + def __init__(self, bias: torch.Tensor) -> None: + self._bias = bias + + def materialize( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + return super().materialize(shape, dtype=dtype, device=device) + self._bias + + +@dataclass +class _SeqLenInfo: + """ + (Internal) Represents the division of a dimension into blocks. + + For example, to represents a dimension of length 7 divided into + three blocks of lengths 2, 3 and 2, use `from_seqlength([2, 3, 2])`. + The members will be: + max_seqlen: 3 + min_seqlen: 2 + seqstart_py: [0, 2, 5, 7] + seqstart: torch.IntTensor([0, 2, 5, 7]) + """ + + seqstart: torch.Tensor + max_seqlen: int + min_seqlen: int + seqstart_py: List[int] + + def to(self, device: torch.device) -> None: + self.seqstart = self.seqstart.to(device, non_blocking=True) + + def intervals(self) -> Iterable[Tuple[int, int]]: + yield from zip(self.seqstart_py, self.seqstart_py[1:]) + + @classmethod + def from_seqlens(cls, seqlens: Iterable[int]) -> "_SeqLenInfo": + """ + Input tensors are assumed to be in shape [B, M, *] + """ + assert not isinstance(seqlens, torch.Tensor) + seqstart_py = [0] + max_seqlen = -1 + min_seqlen = -1 + for seqlen in seqlens: + min_seqlen = min(min_seqlen, seqlen) if min_seqlen != -1 else seqlen + max_seqlen = max(max_seqlen, seqlen) + seqstart_py.append(seqstart_py[len(seqstart_py) - 1] + seqlen) + seqstart = torch.tensor(seqstart_py, dtype=torch.int32) + return cls( + max_seqlen=max_seqlen, + min_seqlen=min_seqlen, + seqstart=seqstart, + seqstart_py=seqstart_py, + ) + + def split( + self, x: torch.Tensor, batch_sizes: Optional[Sequence[int]] = None + ) -> List[torch.Tensor]: + if self.seqstart_py[-1] != x.shape[1] or x.shape[0] != 1: + raise ValueError( + f"Invalid `torch.Tensor` of shape {x.shape}, expected format " + f"(B, M, *) with B=1 and M={self.seqstart_py[-1]}\n" + f" seqstart: {self.seqstart_py}" + ) + if batch_sizes is None: + batch_sizes = [1] * (len(self.seqstart_py) - 1) + split_chunks = [] + it = 0 + for batch_size in batch_sizes: + split_chunks.append( + self.seqstart_py[it + batch_size] - self.seqstart_py[it] + ) + it += batch_size + return [ + tensor.reshape([bs, -1, *tensor.shape[2:]]) + for bs, tensor in zip(batch_sizes, x.split(split_chunks, dim=1)) + ] + + +@dataclass +class _PaddedSeqLenInfo(_SeqLenInfo): + """ + (Internal) Represents the 
division of a dimension into blocks which are + padded out to the same total length. + + For example, to represent a dimension of length 12 with space for + three blocks of length 4, but where the occupied lengths are + 2, 3 and 2, use `from_seqlens_padded([2, 3, 2], 4)`. + + The layout along the dimension is + + 0 ─► block 0 + block 0 + + + 4 ─► block 1 + block 1 + block 1 + + 8 ─► block 2 + block 2 + + + 12 ─► + + The members will be: + max_seqlen: 3 + min_seqlen: 2 + seqstart_py: [0, 4, 8, 12] + seqstart: torch.IntTensor([0, 4, 8, 12]) + seqlen_py: [2, 3, 2] + seqlen: torch.IntTensor([2, 3, 2]) + padding: 4 + """ + + seqlen: torch.Tensor + seqlen_py: Sequence[int] + padding: int + # From parent: seqstart[i] contains the start position + # of the i-th sequence + # seqstart: torch.Tensor + + def __post_init__(self) -> None: + assert len(self.seqstart_py) == len(self.seqlen_py) + 1 + + def to(self, device: torch.device) -> None: + self.seqlen = self.seqlen.to(device, non_blocking=True) + super().to(device) + + def intervals(self) -> Iterable[Tuple[int, int]]: + for (start, _), length in zip(super().intervals(), self.seqlen_py): + yield start, start + length + + @classmethod + def from_seqlens(cls, seqlens: Iterable[int]) -> "_SeqLenInfo": + raise RuntimeError( + "Use either `_SeqLenInfo.from_seqlens` or `_PaddedSeqLenInfo.from_seqlens_padded`" + ) + + @classmethod + def from_seqlens_padded( + cls, seqlens: Sequence[int], padding: int + ) -> "_PaddedSeqLenInfo": + """ + Input tensors are assumed to be in shape [B, M, *] + seqstart = padding * torch.arange(batch_size) + """ + assert not isinstance(seqlens, torch.Tensor) + assert all(seqlen <= padding for seqlen in seqlens) + seqstart_py = list(range(0, len(seqlens) * padding + 1, padding)) + return cls( + seqlen=torch.tensor(seqlens, dtype=torch.int32), + seqlen_py=seqlens, + max_seqlen=max(seqlens), + min_seqlen=min(seqlens), + seqstart=torch.tensor(seqstart_py, dtype=torch.int32), + seqstart_py=seqstart_py, + padding=padding, + ) + + def split( + self, x: torch.Tensor, batch_sizes: Optional[Sequence[int]] = None + ) -> List[torch.Tensor]: + raise NotImplementedError("_PaddedSeqLenInfo.split") + + +@dataclass +class BlockDiagonalMask(AttentionBias): + """ + A block-diagonal mask that can be passed as ``attn_bias`` + argument to :attr:`xformers.ops.memory_efficient_attention`. + + Queries and Keys are each divided into the same number of blocks. + Queries in block i only attend to keys in block i. + + .. figure:: /_static/block_diag_bias.png + + This bias can be used to handle a batch of sequences of + different lengths, via :attr:`BlockDiagonalMask.from_tensor_list` + + :Example: + + .. 
code-block:: python + + import torch + from xformers.ops import fmha + + K = 16 + dtype = torch.float16 + device = "cuda" + list_x = [ + torch.randn([1, 3, 1, K], dtype=dtype, device=device), + torch.randn([1, 6, 1, K], dtype=dtype, device=device), + torch.randn([1, 2, 1, K], dtype=dtype, device=device), + ] + attn_bias, x = fmha.BlockDiagonalMask.from_tensor_list(list_x) + linear = torch.nn.Linear(K, K * 3).to(device=device, dtype=dtype) + + q, k, v = linear(x).reshape([1, -1, 1, 3, K]).unbind(-2) + out = fmha.memory_efficient_attention(q, k, v, attn_bias=attn_bias) + list_out = attn_bias.split(out) + print(list_out[0].shape) # [1, 3, 1, K] + assert tuple(list_out[0].shape) == (1, 3, 1, K) + + """ + + q_seqinfo: _SeqLenInfo + k_seqinfo: _SeqLenInfo + _batch_sizes: Optional[Sequence[int]] = None + + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + return torch.zeros( + shape, + dtype=dtype, + device=device, + ) + + def materialize( + self, + shape: Optional[Tuple[int, ...]] = None, + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + """Materialize the attention bias - for debugging & testing""" + if shape is None: + shape = (self.q_seqinfo.seqstart_py[-1], + self.k_seqinfo.seqstart_py[-1]) + assert shape[-1] == self.k_seqinfo.seqstart_py[-1], ( + shape[-1], + self.k_seqinfo.seqstart_py[-1], + ) + assert shape[-2] == self.q_seqinfo.seqstart_py[-1], ( + shape[-2], + self.q_seqinfo.seqstart_py[-1], + ) + mask = torch.empty(shape[-2:], dtype=dtype, device=device) + mask.fill_(-math.inf) + for i, ((q_start, q_end), (k_start, k_end)) in enumerate( + zip( + self.q_seqinfo.intervals(), + self.k_seqinfo.intervals(), + ) + ): + mask[q_start:q_end, k_start:k_end] = self._create_block_mask( + (q_end - q_start, k_end - k_start), + dtype=dtype, + device=device, + ) + for _ in range(len(shape) - 2): + mask = mask.unsqueeze(0) + return mask.expand(shape) + + @classmethod + def from_seqlens( + cls, + q_seqlen: Sequence[int], + kv_seqlen: Optional[Sequence[int]] = None, + ) -> "BlockDiagonalMask": + """Creates a :attr:`BlockDiagonalMask` from a list of tensors lengths for query and key/value. + + Args: + q_seqlen (Union[Sequence[int], torch.Tensor]): List or tensor of sequence lengths for query tensors + kv_seqlen (Union[Sequence[int], torch.Tensor], optional): List or tensor of sequence lengths for key/value. + (Defaults to ``q_seqlen``.) + Returns: + BlockDiagonalMask + """ + assert kv_seqlen is None or len(q_seqlen) == len(kv_seqlen) + q_seqinfo = _SeqLenInfo.from_seqlens(q_seqlen) + if kv_seqlen is None or q_seqlen == kv_seqlen: + k_seqinfo = q_seqinfo + else: + k_seqinfo = _SeqLenInfo.from_seqlens(kv_seqlen) + return cls(q_seqinfo=q_seqinfo, k_seqinfo=k_seqinfo) + + @classmethod + def from_tensor_list( + cls, + tensors: Sequence[torch.Tensor], + ) -> Tuple["BlockDiagonalMask", torch.Tensor]: + """Creates a :attr:`BlockDiagonalMask` from a list of tensors, and returns the tensors + concatenated on the sequence length dimension + + .. figure:: /_static/block_diag_cat_split.png + + See also :attr:`BlockDiagonalMask.split` to split the returned + :attr:`torch.Tensor` back to a list of tensors of varying sequence length + + Args: + tensors (Sequence[torch.Tensor]): A list of tensors of shape ``[B, M_i, *]``. + All tensors should have the same dimension and the same batch size ``B``, but + they can have different sequence length ``M``. 
+ + Returns: + Tuple[BlockDiagonalMask, torch.Tensor]: The corresponding bias for the attention + along with `tensors` concatenated on the sequence length dimension, with shape ``[1, sum_i{M_i}, *]`` + """ + batch_sizes = [tensor.shape[0] for tensor in tensors] + seqlens = [] + for x in tensors: + for _ in range(x.shape[0]): + seqlens.append(x.shape[1]) + block_diag = cls.from_seqlens(seqlens) + block_diag._batch_sizes = batch_sizes + tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in tensors) + concat_tensors = torch.cat(tensors_bs1, dim=1) + return block_diag, concat_tensors + + @classmethod + def from_tensor_lists_qkv( + cls, + tensors_q: Sequence[torch.Tensor], + tensors_k: Sequence[torch.Tensor], + tensors_v: Optional[Sequence[torch.Tensor]] = None, + ) -> Tuple["BlockDiagonalMask", torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + assert len(tensors_q) == len(tensors_k) + assert tensors_v is None or len(tensors_v) == len(tensors_q) + batch_sizes = [tensor.shape[0] for tensor in tensors_q] + q_seqlens, kv_seqlens = [], [] + for i, (q, k) in enumerate(zip(tensors_q, tensors_k)): + assert q.shape[0] == k.shape[0] + q_seqlens += [q.shape[1]] * q.shape[0] + kv_seqlens += [k.shape[1]] * k.shape[0] + assert tensors_v is None or tensors_v[i].shape[:2] == k.shape[:2] + block_diag = cls.from_seqlens(q_seqlens, kv_seqlens) + block_diag._batch_sizes = batch_sizes + return ( + block_diag, + torch.cat([x.reshape([1, -1, *x.shape[2:]]) for x in tensors_q], dim=1), + torch.cat([x.reshape([1, -1, *x.shape[2:]]) for x in tensors_k], dim=1), + torch.cat([x.reshape([1, -1, *x.shape[2:]]) for x in tensors_v], dim=1) + if tensors_v is not None + else None, + ) + + def split_queries(self, tensor: torch.Tensor) -> Sequence[torch.Tensor]: + return self.q_seqinfo.split(tensor, self._batch_sizes) + + def split_kv(self, tensor: torch.Tensor) -> Sequence[torch.Tensor]: + return self.k_seqinfo.split(tensor, self._batch_sizes) + + def split(self, tensor: torch.Tensor) -> Sequence[torch.Tensor]: + """The inverse operation of :attr:`BlockDiagonalCausalMask.from_tensor_list` + + Args: + tensor (torch.Tensor): Tensor of tokens of shape ``[1, sum_i{M_i}, *]`` + + Returns: + Sequence[torch.Tensor]: A list of tokens with possibly different sequence lengths + """ + assert self.q_seqinfo is self.k_seqinfo + return self.q_seqinfo.split(tensor, self._batch_sizes) + + def make_causal(self) -> "BlockDiagonalCausalMask": + """Makes each block causal""" + return BlockDiagonalCausalMask( + q_seqinfo=self.q_seqinfo, + k_seqinfo=self.k_seqinfo, + _batch_sizes=self._batch_sizes, + ) + + def make_causal_from_bottomright(self) -> "BlockDiagonalCausalFromBottomRightMask": + """Makes each block causal with a possible non-causal prefix""" + return BlockDiagonalCausalFromBottomRightMask( + q_seqinfo=self.q_seqinfo, + k_seqinfo=self.k_seqinfo, + _batch_sizes=self._batch_sizes, + ) + + def make_local_attention( + self, window_size: int + ) -> "BlockDiagonalCausalLocalAttentionMask": + """Experimental: Makes each block causal with local attention""" + return BlockDiagonalCausalLocalAttentionMask( + q_seqinfo=self.q_seqinfo, + k_seqinfo=self.k_seqinfo, + _batch_sizes=self._batch_sizes, + _window_size=window_size, + ) + + def make_local_attention_from_bottomright( + self, window_size: int + ) -> "BlockDiagonalCausalLocalAttentionFromBottomRightMask": + """Experimental: Makes each block causal with local attention, start from bottom right""" + return BlockDiagonalCausalLocalAttentionFromBottomRightMask( + 
q_seqinfo=self.q_seqinfo, + k_seqinfo=self.k_seqinfo, + _batch_sizes=self._batch_sizes, + _window_size=window_size, + ) + + +@dataclass +class BlockDiagonalCausalMask(BlockDiagonalMask): + """ + Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask`, except that each block is causal. + + Queries and Keys are each divided into the same number of blocks. + A query Q in block i cannot attend to a key which is not in block i, + nor one which is farther from the initial key in block i than Q + is from the initial query in block i. + """ + + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + return LowerTriangularMask().materialize( + shape, + dtype=dtype, + device=device, + ) + + +@dataclass +class BlockDiagonalCausalFromBottomRightMask(BlockDiagonalMask): + """ + Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask`, except that each block is causal. + This mask allows for a non-causal prefix + NOTE: Each block should have `num_keys >= num_queries` otherwise the forward pass is not + defined (softmax of vector of `-inf` in the attention) + + Queries and keys are each divided into the same number of blocks. + A query Q in block i cannot attend to a key which is not in block i, + nor one which nearer the final key in block i than Q is to the + final query in block i. + """ + + def __post_init__(self) -> None: + for i, ((q_start, q_end), (k_start, k_end)) in enumerate( + zip( + self.q_seqinfo.intervals(), + self.k_seqinfo.intervals(), + ) + ): + num_queries = q_end - q_start + num_keys = k_end - k_start + if num_keys < num_queries: + raise ValueError( + f"Block #{i} has num_keys={num_keys} and num_queries={num_queries}." + " Expected `num_keys >= num_queries`" + ) + + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + create_as = dtype if dtype is not torch.bfloat16 else torch.float32 + tensor = torch.full( # type: ignore + shape, + dtype=create_as, + fill_value=float("-inf"), + device=device, + ) + num_queries, num_keys = shape[-2:] + return torch.triu(tensor, diagonal=num_keys - num_queries + 1).to(dtype) # type: ignore + + +@dataclass +class BlockDiagonalCausalWithOffsetPaddedKeysMask(AttentionBias): + """ + Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask`, + except an offset on causality is allowed for each block and we support padding for k/v + + The keys and values are divided into blocks which are padded out to + the same total length. + For example, if there is space for 12 keys, for three blocks of + max length 4, but we only want to use the first 2, 3 and 2 + of each block, use `kv_padding=4` and `kv_seqlens=[2, 3, 2]`. + The queries are divided into blocks, without padding, of lengths given by + q_seqlen. + + A query Q in block i cannot attend to a key which is not in block i, + nor one which is not in use (i.e. in the padded area), + nor one which is nearer to the final key in block i + than Q is to the final query in block i. + """ + + q_seqinfo: _SeqLenInfo + k_seqinfo: _PaddedSeqLenInfo + causal_diagonal: Any = None # unused. Exists for BC only. 
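+    # Illustration only: a minimal usage sketch mirroring the padding example
+    # in the docstring above (not part of the original code).
+    #
+    #   mask = BlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
+    #       q_seqlen=[2, 3, 2], kv_padding=4, kv_seqlen=[2, 3, 2])
+    #   bias = mask.materialize((7, 12))  # 7 = 2+3+2 queries, 12 = 3 blocks * 4
+    #   # Within each block the mask is causal (aligned to the bottom right);
+    #   # padded and out-of-block key slots are filled with -inf.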
+ + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + create_as = dtype if dtype is not torch.bfloat16 else torch.float32 + tensor = torch.full( # type: ignore + shape, + dtype=create_as, + fill_value=float("-inf"), + device=device, + ) + num_queries, num_keys = shape[-2:] + return torch.triu(tensor, diagonal=1 + num_keys - num_queries).to(dtype) # type: ignore + + def materialize( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + """Materialize the attention bias - for debugging & testing""" + if shape[-1] != self.k_seqinfo.seqstart_py[-1]: + raise ValueError("k shapes wrong") + if shape[-2] != self.q_seqinfo.seqstart_py[-1]: + raise ValueError("q shapes wrong") + mask = torch.empty(shape[-2:], dtype=dtype, device=device) + mask.fill_(-math.inf) + for i, ((q_start, q_end), (k_start, k_end)) in enumerate( + zip( + self.q_seqinfo.intervals(), + self.k_seqinfo.intervals(), + ) + ): + mask[q_start:q_end, k_start:k_end] = self._create_block_mask( + (q_end - q_start, k_end - k_start), + dtype=dtype, + device=device, + ) + for _ in range(len(shape) - 2): + mask = mask.unsqueeze(0) + return mask.expand(shape) + + @classmethod + def from_seqlens( + cls, + q_seqlen: Sequence[int], + kv_padding: int, + kv_seqlen: Sequence[int], + causal_diagonal: Any = None, + ) -> "BlockDiagonalCausalWithOffsetPaddedKeysMask": + """Creates a :attr:`BlockDiagonalCausalWithOffsetPaddedKeysMask` from a list of tensor + lengths for query and key/value. + + Args: + q_seqlen (Sequence[int]): List or tensor of sequence lengths for query tensors + kv_padding (int): Padding for k/v - also an upperbound on each individual key length + kv_seqlen (Sequence[int]): List or tensor of sequence lengths for key/value. + causal_diagonal: unused, for BC only + Returns: + BlockDiagonalCausalWithOffsetPaddedKeysMask + """ + assert kv_seqlen is None or len(q_seqlen) == len(kv_seqlen), ( + q_seqlen, + kv_seqlen, + ) + q_seqinfo = _SeqLenInfo.from_seqlens(q_seqlen) + k_seqinfo = _PaddedSeqLenInfo.from_seqlens_padded(kv_seqlen, kv_padding) + return cls(q_seqinfo=q_seqinfo, k_seqinfo=k_seqinfo) + + +@dataclass +class BlockDiagonalCausalLocalAttentionMask(BlockDiagonalCausalMask): + """ + (Experimental feature) + Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask`. + This makes the mask "local" and the attention pattern banded. + + Query i only attends to keys in its block and cannot attend keys further than "window_size" + from it. + """ + + _window_size: int = 0 # forced due to inheritance and default arguments + + def __post_init__(self): + if self._window_size <= 0: + raise ValueError( + f"Expected `window_size > 0`, but window_size={self._window_size}" + ) + q_seqlen = [ + y - x + for x, y in zip( + self.q_seqinfo.seqstart_py[:-1], self.q_seqinfo.seqstart_py[1:] + ) + ] + kv_seqlen = [ + y - x + for x, y in zip( + self.k_seqinfo.seqstart_py[:-1], self.k_seqinfo.seqstart_py[1:] + ) + ] + for q, k in zip(q_seqlen, kv_seqlen): + if q - self._window_size >= k: + # Each query only attends to keys no further than window_size back. + # When q > k + window_size, there will be a query for which the window doesn't reach any key. 
+ raise RuntimeError( + f"No keys are attended in q_seqlen {q} k_seqlen {k} with sliding window {self._window_size}" + ) + + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + create_as = dtype if dtype is not torch.bfloat16 else torch.float32 + tensor = torch.full( # type: ignore + shape, + dtype=create_as, + fill_value=1, + device=device, + ) + + num_queries, num_keys = shape[-2:] + mask = torch.tril(tensor, diagonal=0).to(dtype) # type: ignore + if self._window_size is not None and self._window_size > 0: + mask = torch.triu(mask, diagonal=-self._window_size + 1) + mask = torch.log(mask) + return mask.to(dtype) + + +@dataclass +class BlockDiagonalCausalLocalAttentionFromBottomRightMask( + BlockDiagonalCausalFromBottomRightMask +): + """ + (Experimental feature) + Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask`. + This makes the mask "local" and the attention pattern banded. + + Query i only attends to keys in its block and cannot attend keys further than "window_size" + from it. + """ + + _window_size: int = 0 # forced due to inheritance and default arguments + + def __post_init__(self): + super().__post_init__() + if self._window_size <= 0: + raise ValueError( + f"Expected `window_size > 0`, but window_size={self._window_size}" + ) + + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + create_as = dtype if dtype is not torch.bfloat16 else torch.float32 + tensor = torch.full( # type: ignore + shape, + dtype=create_as, + fill_value=1, + device=device, + ) + num_queries, num_keys = shape[-2:] + mask = torch.tril(tensor, diagonal=num_keys - num_queries).to(dtype) # type: ignore + if self._window_size is not None: + mask = torch.triu( + mask, diagonal=num_keys - num_queries - self._window_size + 1 + ) + mask = torch.log(mask) + return mask.to(dtype) diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py new file mode 100644 index 0000000000000..759fdb65e08ed --- /dev/null +++ b/vllm/hpu/cache_ops.py @@ -0,0 +1,82 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. 
+###############################################################################
+
+from typing import Tuple
+import torch
+import habana_frameworks.torch as htorch
+
+
+def pad_to_full_block(data, block_size, pad_value):
+    seq_dim = 1
+    pad_shape = list(data.shape)
+    remainder = pad_shape[seq_dim] % block_size
+    if remainder == 0:
+        return data
+    pad_shape[seq_dim] = block_size - remainder
+    pad = torch.full(pad_shape, pad_value, dtype=data.dtype, device=data.device)
+    return torch.cat([data, pad], dim=seq_dim)
+
+
+def initialize_cache(data, indices, cache):
+    block_size = cache.size(-1)
+    data = data.unflatten(0, (-1, block_size)).permute(0, 2, 3, 1)
+    indices = indices.unflatten(0, (-1, block_size))[:,0]
+    cache.index_copy_(0, indices, data)
+
+
+def update_cache(data, indices, offsets, cache):
+    prev = cache.index_select(0, indices)
+    idx = offsets.view(-1, 1, 1, 1).expand(-1, data.size(1), data.size(2), -1)
+    prev.scatter_(-1, idx, data.unsqueeze(-1))
+    cache.index_copy_(0, indices, prev)
+
+
+def reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, dtype, is_prompt):
+    block_size = key_cache.size(-1)
+    assert slot_mapping.dim() == 2, 'This implementation requires unflattened slot_mapping!'
+
+    if is_prompt:
+        block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
+        batch_size, seq_length = block_indices.shape
+        key = pad_to_full_block(key.unflatten(0, (batch_size, seq_length)), block_size, 0).flatten(0, 1)
+        value = pad_to_full_block(value.unflatten(0, (batch_size, seq_length)), block_size, 0).flatten(0, 1)
+        block_indices = pad_to_full_block(block_indices, block_size, -1).flatten(0, 1)
+        initialize_cache(key, block_indices, key_cache)
+        initialize_cache(value, block_indices, value_cache)
+    else:
+        slot_mapping = slot_mapping.flatten()
+        block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
+        block_offsets = torch.fmod(slot_mapping, block_size)
+        update_cache(key, block_indices, block_offsets, key_cache)
+        update_cache(value, block_indices, block_offsets, value_cache)
+
+
+def swap_blocks(src, dst, block_mapping):
+    index_src = torch.zeros((1,), dtype=torch.int32, device=src.device)
+    index_dst = torch.zeros((1,), dtype=torch.int32, device=dst.device)
+    for src_idx, dst_idx in block_mapping.items():
+        index_src[0] = src_idx
+        index_dst[0] = dst_idx
+        dst.index_put_([index_dst], src.index_select(0, index_src))
+    if dst.device.type == 'hpu':
+        htorch.core.mark_step()
+        torch.hpu.synchronize()
+
+
+def copy_blocks(key_caches, value_caches, block_mapping):
+    index_src = torch.zeros((1,), dtype=torch.int32, device=key_caches[0].device)
+    index_dst = torch.zeros((1,), dtype=torch.int32, device=key_caches[0].device)
+    for src, dsts in block_mapping.items():
+        index_src[0] = src
+        for dst in dsts:
+            index_dst[0] = dst
+            for key_cache in key_caches:
+                key_cache.index_copy_(0, index_dst, key_cache.index_select(0, index_src))
+            for value_cache in value_caches:
+                value_cache.index_copy_(0, index_dst, value_cache.index_select(0, index_src))
+        if key_caches[0].device.type == 'hpu':
+            htorch.core.mark_step()
diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py
new file mode 100644
index 0000000000000..d95b301697cea
--- /dev/null
+++ b/vllm/hpu/ops.py
@@ -0,0 +1,115 @@
+###############################################################################
+# Copyright (C) 2024 Habana Labs, Ltd.
an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. +############################################################################### +import os +import torch +import torch.nn as nn +import torch.nn.functional as F +import habana_frameworks.torch as htorch +from typing import List, Optional, Tuple + +import vllm.hpu.utils as hpu_utils + +PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '0') == '1') + + +def silu_and_mul(output, input): + d = input.shape[-1] // 2 + silu = torch.nn.SiLU().to(input.device) + x, y = torch.split(input, d, dim=-1) + output.copy_(silu(x) * y) + + +def gelu_new(output, input): + raise NotImplementedError + + +def gelu_fast(output, input): + raise NotImplementedError + + +def fetch_from_cache(cache, blocks): + return [cache.index_select(0, blocks[:, i]) for i in range(blocks.size(1))] + + +@hpu_utils.with_mark_steps +def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block_tables, context_lens, block_size, max_context_len, alibi_slopes, kv_cache_dtype=None) -> None: + seq_len = block_tables.size(1) + batch_size, query_heads, _ = query.shape + _, kv_heads, _, _ = key_cache.shape + min_inf = torch.finfo(query.dtype).min + mask = (torch.arange(0, seq_len * block_size, dtype=torch.int32, device=key_cache.device) + .view(1, -1) + .expand(batch_size, -1) + .ge(context_lens.view(-1, 1)) + .view(batch_size, 1, 1, -1)) + query = query.unsqueeze(-2) + keys = fetch_from_cache(key_cache, block_tables) + if query_heads != kv_heads: + query = query.unflatten(1, (kv_heads, -1)) + keys = [k.unflatten(1, (kv_heads, 1)) for k in keys] + mask = mask.unsqueeze(2) + + attn_weights = [torch.matmul(query, k) for k in keys] + attn_weights = (torch.cat(attn_weights, dim=-1) + .mul_(scale) + .masked_fill(mask, min_inf) + .softmax(dim=-1)) + + values = fetch_from_cache(value_cache, block_tables) + if PA_SPLIT_VALUE: + attn_weights = attn_weights.split(block_size, dim=-1) + else: + values = [torch.cat(values, dim=-1)] + attn_weights = [attn_weights] + if query_heads != kv_heads: + values = [v.unflatten(1, (kv_heads, 1)) for v in values] + attn_weights = [torch.matmul(a, v.transpose(-1, -2)).squeeze(-2) for a, v in zip(attn_weights, values)] + if query_heads != kv_heads: + attn_weights = [a.flatten(1, 2) for a in attn_weights] + attn_weights = sum(attn_weights) + + return attn_weights + + +def rms_norm(out, hidden_states, weight, eps): + htorch.core.mark_step() + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + eps) + out.copy_(weight * hidden_states.to(input_dtype)) + htorch.core.mark_step() + + +def rotate_neox(x: torch.Tensor) -> torch.Tensor: + x1 = x[..., :x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + + +def rotate_gptj(x: torch.Tensor) -> torch.Tensor: + x1 = x[..., ::2] + x2 = x[..., 1::2] + x = torch.stack((-x2, x1), dim=-1) + return x.flatten(-2) + + +def apply_rope( + q: torch.Tensor, + k: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + is_neox_style: bool, +) -> Tuple[torch.Tensor, torch.Tensor]: + rotate_fn = rotate_neox if is_neox_style else rotate_gptj + q_embed = (q * cos) + (rotate_fn(q) * sin) + k_embed = (k * cos) + (rotate_fn(k) * sin) + return q_embed, k_embed + + +def awq_gemm(*args): + raise NotImplementedError diff --git a/vllm/hpu/rotary_embed.py 
b/vllm/hpu/rotary_embed.py new file mode 100644 index 0000000000000..30f96153cd4a2 --- /dev/null +++ b/vllm/hpu/rotary_embed.py @@ -0,0 +1,119 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. +############################################################################### + +import torch +import torch.nn as nn +import habana_frameworks.torch.utils.experimental as htexp + +def get_device_type(): + return htexp._get_device_type() + +def is_gaudi1(): + return get_device_type() == htexp.synDeviceType.synDeviceGaudi + +def is_gaudi2(): + return get_device_type() == htexp.synDeviceType.synDeviceGaudi2 + +def is_gaudi3(): + return get_device_type() == htexp.synDeviceType.synDeviceGaudi3 + +# TODO: remove this workaround when FusedRoPE properly works on Gaudi +if not is_gaudi1() and (is_gaudi2() or is_gaudi3()): + try: + from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV1 as FusedRoPE + except ImportError: + print("Not using HPU fused kernel for apply_rotary_pos_emb") + FusedRoPE = None +else: + FusedRoPE = None + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos[position_ids]#.unsqueeze(unsqueeze_dim) + sin = sin[position_ids]#.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class HpuRotaryEmbedding(nn.Module): + def __init__(self, head_size, rotary_dim, max_position_embeddings=2048, base=10000, is_neox_style=None, device='hpu'): + super().__init__() + + self.head_size = head_size + self.dim = rotary_dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor): + if query.dim() == 2: + query = query.unsqueeze(0) + if key.dim() == 2: + key = key.unsqueeze(0) + if positions.dim() == 1: + positions = positions.unsqueeze(0) + seq_len = key.shape[-2] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=query.device, dtype=query.dtype) + + cos, sin = self.cos_cached[:seq_len].to(dtype=query.dtype), self.sin_cached[:seq_len].to(dtype=query.dtype) + query = query.reshape((query.shape[0], query.shape[1], query.shape[2] // self.head_size, self.head_size)) + key = key.reshape((key.shape[0], key.shape[1], key.shape[2] // self.head_size, self.head_size)) + if query.device.type == "hpu" and FusedRoPE: + if len(positions[0]) == 1: + cos = self.cos_cached[positions].unsqueeze(2).to(dtype=query.dtype) + sin = self.sin_cached[positions].unsqueeze(2).to(dtype=query.dtype) + else: + cos = cos[positions].unsqueeze(2) + sin = sin[positions].unsqueeze(2) + query, key = FusedRoPE.apply(query, cos, sin, 0), FusedRoPE.apply(key, cos, sin, 0) + else: + query, key = apply_rotary_pos_emb(query, key, cos, sin, positions) + return query.reshape((query.shape[0], query.shape[1], query.shape[2] * query.shape[3])), key.reshape((key.shape[0], key.shape[1], key.shape[2] * key.shape[3])) diff --git a/vllm/hpu/utils.py b/vllm/hpu/utils.py new file mode 100644 index 0000000000000..8d7f388cf262a --- /dev/null +++ b/vllm/hpu/utils.py @@ -0,0 +1,99 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. 
+############################################################################### + +import habana_frameworks.torch as htorch + +def with_mark_steps(fn): + def wrapped(*args, **kwargs): + htorch.core.mark_step() + result = fn(*args, **kwargs) + del args + del kwargs + htorch.core.mark_step() + return result + return wrapped + + +def profile_reicpes(recipe_names): + from pathlib import Path + import numpy as np + import matplotlib.pyplot as plt + from sklearn.metrics import ConfusionMatrixDisplay + import tqdm + recipe_names_short = [name.replace('.graph_dumps/HabanaFusedOpLazy_', '') for name in recipe_names] + recipes = [Path(Path.cwd().joinpath(name + '-PostGraph-symbol.pbtxt')).open('r').read() for name in recipe_names] + + def generic_similarity_backend(recipes, similarity_func, backend_name=''): + num_recipes = len(recipes) + sim_tri = np.zeros((num_recipes, num_recipes)) + total = (num_recipes * (num_recipes + 1)) // 2 - num_recipes + backend_txt = f' with {backend_name}' if backend_name != '' else '' + with tqdm.tqdm(total=total, desc=f" computing similarity matrix{backend_txt}") as pbar: + for i in range(num_recipes): + for j in range(i): + sim_tri[i,j] = similarity_func(recipes[i], recipes[j]) + pbar.update(1) + sim = sim_tri.T + sim_tri + sim_idx = np.arange(sim_tri.shape[0]) + sim[sim_idx,sim_idx] = 1 + return sim + + def cosine_similarity_rad_backend(recipes): + from strsimpy.cosine import Cosine + s = Cosine(2) + return generic_similarity_backend(recipes, s.similarity, "Cosine (rad)"), "cosine similarity, 1 = max similarity" + + def cosine_similarity_deg_backend(recipes): + from strsimpy.cosine import Cosine + s = Cosine(2) + rad = generic_similarity_backend(recipes, s.similarity, "cosine similarity") + deg = np.degrees(np.arccos(rad)) + return deg, "cosine similarity (deviation in deg, 0 = max similarity)" + + def overlap_coefficient_backend(recipes): + from strsimpy.overlap_coefficient import OverlapCoefficient + s = OverlapCoefficient(2) + return generic_similarity_backend(recipes, s.similarity, OverlapCoefficient.__name__), OverlapCoefficient.__name__ + + def normalized_levenshtein_backend(recipes): + from strsimpy.normalized_levenshtein import NormalizedLevenshtein + s = NormalizedLevenshtein() + return generic_similarity_backend(recipes, s.similarity, NormalizedLevenshtein.__name__), NormalizedLevenshtein.__name__ + + def jaro_winkler_backend(recipes): + from strsimpy.jaro_winkler import JaroWinkler + s = JaroWinkler() + return generic_similarity_backend(recipes, s.similarity, JaroWinkler.__name__), JaroWinkler.__name__ + + def tfidf_weird_backend(recipes): + def tfidf_single_elem(x,y): + from sklearn.feature_extraction.text import TfidfVectorizer + vect = TfidfVectorizer() + tfidf = vect.fit_transform([x,y]) + sim_sparse = tfidf * tfidf.T + sim = sim_sparse.toarray() + return sim[0,1] + return generic_similarity_backend(recipes, tfidf_single_elem, 'TfidfVectorizer (weird)'), 'TfidfVectorizer (weird)' + + def tfidf_backend(recipes): + from sklearn.feature_extraction.text import TfidfVectorizer + vect = TfidfVectorizer() + tfidf = vect.fit_transform(recipes) + sim_sparse = tfidf * tfidf.T + sim = sim_sparse.toarray() + return sim, 'TfidfVectorizer' + + sim, backend_name = tfidf_backend(recipes) + plt.rcParams["figure.figsize"] = [16,16] + plt.rcParams["figure.dpi"] = 300 + cm = ConfusionMatrixDisplay(sim, display_labels=recipe_names_short) + cm.plot(xticks_rotation='vertical', text_kw={"fontsize":5}) + cm.ax_.set_xlabel("Target recipe number") + cm.ax_.set_ylabel("Source 
recipe number") + plt.title(f'Recipe similarity ({backend_name})') + return plt +# plt.savefig('similarity.png') \ No newline at end of file diff --git a/vllm/hpu/xops.py b/vllm/hpu/xops.py new file mode 100644 index 0000000000000..c9d237744a917 --- /dev/null +++ b/vllm/hpu/xops.py @@ -0,0 +1,66 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. +############################################################################### + +import habana_frameworks.torch as htorch +import torch +import torch.nn.functional as F +from typing import List, Optional, Tuple, Union +from .attn_bias import AttentionBias, BlockDiagonalCausalMask + +try: + from habana_frameworks.torch.hpex.kernels import FusedSDPA +except ImportError: + print("Not using HPU fused scaled dot-product attention kernel.") + FusedSDPA = None + +def memory_efficient_attention_forward( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_bias: Optional[torch.Tensor] = None, + p: float = 0.0, + scale: Optional[float] = None, +) -> torch.Tensor: + assert attn_bias is not None, "Attention mask is required for prompt processing" + dim = query.dim() + is_causal = isinstance(attn_bias, BlockDiagonalCausalMask) + if FusedSDPA and (is_causal or attn_bias is None): + bs = query.shape[0] + seq_len_q = query.shape[1] + seq_len_kv = key.shape[1] + heads = query.shape[-2] if dim != 5 else query.shape[-3] + attn_groups = 1 if dim != 5 else query.shape[-2] + head_dim = query.shape[-1] + if dim == 4: + # [bs, seq_len, 1, heads, head_dim] -> [bs, heads, seq_len, head_dim] + query = query.reshape(bs, seq_len_q, heads, head_dim).permute(0, 2, 1, 3) + key = key.reshape(bs, seq_len_kv, heads, head_dim).permute(0, 2, 1, 3) + value = value.reshape(bs, seq_len_kv, heads, head_dim).permute(0, 2, 1, 3) + elif dim == 5: + # [bs, seq_len, heads, attn_groups, head_dim] -> [bs, heads, attn_groups, seq_len, head_dim] + query = query.reshape(bs, seq_len_q, heads, attn_groups, head_dim).permute(0, 2, 3, 1, 4) + key = key.reshape(bs, seq_len_kv, heads, attn_groups, head_dim).permute(0, 2, 3, 1, 4) + value = value.reshape(bs, seq_len_kv, heads, attn_groups, head_dim).permute(0, 2, 3, 1, 4) + else: + raise ValueError(f"Unsupported attention dimension: {dim}") + + import habana_frameworks.torch.hpu as ht + with ht.sdp_kernel(enable_recompute=False): # (flash_attention_recompute and q_len == 1)): + out = FusedSDPA.apply( + query, key, value, None, p, is_causal, scale + ) + htorch.core.mark_step() + if dim == 4: + # [bs, heads, seq_len, head_dim] -> [bs, seq_len, heads, head_dim] + out = out.permute(0, 2, 1, 3).reshape(bs, seq_len_q, heads, head_dim) + elif dim == 5: + # [bs, heads, attn_groups, seq_len, head_dim] -> [bs, seq_len, heads, attn_groups, head_dim] + out = out.permute(0, 3, 1, 2, 4).reshape(bs, seq_len_q, heads, attn_groups, head_dim) + else: + raise NotImplementedError(f'Only FusedSDPA causal or non-masked attention is supported.\nFusedSDPA support: {FusedSDPA is not None}\nis_causal: {is_causal}\nmask_present: {attn_bias is not None}') + + return out diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index f569a5a49cbdf..4435748899af2 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -6,7 +6,11 @@ import torch.nn as nn import 
torch.nn.functional as F -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.parallel_utils.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index cb3cee2bad5ad..e194905f68770 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -4,8 +4,18 @@ import torch import torch.nn as nn -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops +if is_hpu(): + try: + from habana_frameworks.torch.hpex.normalization import FusedRMSNorm as FusedRMSNorm + except ImportError: + print("Not using HPU fused kernel for RMSNorm") + FusedRMSNorm = None class RMSNorm(nn.Module): """Root mean square normalization. @@ -49,6 +59,13 @@ def forward( residual: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: if residual is not None: + if x.device.type == "hpu" and FusedRMSNorm: + orig_dtype = x.dtype + orig_shape = x.shape + residual += x.view(residual.shape) + # Note: FusedRMSNorm requires 3D tensors as inputs + x = FusedRMSNorm.apply(residual.float(), self.weight.float(), self.variance_epsilon) + return x.to(orig_dtype).view(orig_shape), residual ops.fused_add_rms_norm( x, residual, @@ -56,6 +73,10 @@ def forward( self.variance_epsilon, ) return x, residual + if x.device.type == "hpu" and FusedRMSNorm: + orig_dtype = x.dtype + x = FusedRMSNorm.apply(x.float(), self.weight.float(), self.variance_epsilon) + return x.to(orig_dtype) out = torch.empty_like(x) ops.rms_norm( out, diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 28e8f6bb7e638..6661164b0b53f 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -5,9 +5,9 @@ import torch.nn as nn from vllm.model_executor.parallel_utils.communication_op import ( - tensor_model_parallel_gather) + tensor_model_parallel_gather, tensor_model_parallel_all_gather) from vllm.model_executor.sampling_metadata import SamplingMetadata - +from vllm.utils import is_hpu class LogitsProcessor(nn.Module): """Process logits and apply logits processors from sampling metadata. @@ -51,7 +51,7 @@ def forward( # Get the logits for the next tokens. logits = self._get_logits(hidden_states, embedding, embedding_bias) - if logits is not None: + if logits is not None and sampling_metadata.perform_sampling: logits *= self.scale # Apply logits processors (if any). @@ -65,7 +65,9 @@ def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor, logits = torch.matmul(hidden_states, embedding.t()) if embedding_bias is not None: logits += embedding_bias - logits = tensor_model_parallel_gather(logits) + # NOTE(kzawora): HPU PT bridge is missing support for single-rank gather. We'll use all-gather on Gaudi for now. + gather_op = tensor_model_parallel_all_gather if is_hpu() else tensor_model_parallel_gather + logits = gather_op(logits) # Remove paddings in vocab (if any). 
if logits is not None: logits = logits[:, :self.org_vocab_size] diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 2caef5f1ebf50..633b094903a8f 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -3,7 +3,12 @@ import torch from torch.nn.parameter import Parameter -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops + from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 53baf710ed811..07471a0856c7a 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -6,7 +6,12 @@ import torch from torch.nn.parameter import Parameter -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops + from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index 784229878edf4..9803e693b7484 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -3,7 +3,12 @@ import torch from torch.nn.parameter import Parameter -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops + from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py index ed25455e6ec1f..d2813e8d31265 100644 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -3,7 +3,12 @@ import torch from torch.nn.parameter import Parameter -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops + from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index d80e73bbe39e9..9cd243ea47a77 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -27,7 +27,12 @@ import torch import torch.nn as nn -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops + from vllm.hpu.rotary_embed import HpuRotaryEmbedding +else: + from vllm._C import ops def _rotate_neox(x: torch.Tensor) -> torch.Tensor: @@ -353,8 +358,12 @@ def get_rope( return _ROPE_DICT[key] if rope_scaling is None: - rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base, - is_neox_style) + if is_hpu(): + rotary_emb = HpuRotaryEmbedding(head_size, rotary_dim, max_position, base, + is_neox_style) + else: + rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base, + is_neox_style) else: scaling_type = rope_scaling["type"] scaling_factor = rope_scaling["factor"] 
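
For illustration, here is a minimal CPU-only sketch of the unfused rotary path that the HPU modules above fall back to when the Habana FusedRoPE kernel is unavailable. The helper names mirror vllm/hpu/ops.py and the cos/sin cache construction follows HpuRotaryEmbedding._set_cos_sin_cache; this is a sketch of the underlying math, not the fused kernel itself.

import torch

def rotate_neox(x: torch.Tensor) -> torch.Tensor:
    # "Rotate half": swap and negate the two halves of the last dimension.
    x1 = x[..., :x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2:]
    return torch.cat((-x2, x1), dim=-1)

def apply_rope(q, k, cos, sin):
    # Reference (non-fused) neox-style rotary position embedding.
    return (q * cos) + (rotate_neox(q) * sin), (k * cos) + (rotate_neox(k) * sin)

head_dim, seq_len = 8, 4
inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
freqs = torch.einsum("i,j->ij", torch.arange(seq_len).float(), inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)   # [seq_len, head_dim]
cos, sin = emb.cos(), emb.sin()

q = torch.randn(seq_len, head_dim)
k = torch.randn(seq_len, head_dim)
q_rot, k_rot = apply_rope(q, k, cos, sin)
print(q_rot.shape, k_rot.shape)           # torch.Size([4, 8]) torch.Size([4, 8])

On Gaudi2/Gaudi3 the fused RotaryPosEmbeddingHelperV1 kernel replaces this math; the Python path above is the fallback kept for other setups.
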
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 2cd56f0ce59d8..5d637e1585f55 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -216,7 +216,6 @@ def forward( kv_cache=kv_cache, attn_metadata=attn_metadata, ) - # Fully Connected hidden_states, residual = self.post_attention_layernorm( hidden_states, residual) diff --git a/vllm/model_executor/parallel_utils/communication_op.py b/vllm/model_executor/parallel_utils/communication_op.py index 04b30b4d093d7..382d884f0315e 100644 --- a/vllm/model_executor/parallel_utils/communication_op.py +++ b/vllm/model_executor/parallel_utils/communication_op.py @@ -10,7 +10,7 @@ from vllm.model_executor.parallel_utils.parallel_state import ( get_tensor_model_parallel_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, is_cupy_nccl_enabled_for_all_reduce) - +from vllm.utils import is_hpu def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: """All-reduce the input tensor across model parallel group. @@ -161,9 +161,10 @@ def broadcast_tensor_dict( metadata_list = [] for key, value in tensor_dict.items(): if isinstance(value, torch.Tensor): - assert value.is_cuda, ( - f"Tensor {key}: {value} is not on cuda. Currently we only " - f"support broadcasting tensors on cuda.") + if not is_hpu(): + assert value.is_cuda, ( + f"Tensor {key}: {value} is not on cuda or HPU. Currently we only " + f"support broadcasting tensors on cuda or HPU.") metadata_list.append( (key, TensorMetadata(value.dtype, value.size()))) else: @@ -187,7 +188,7 @@ def broadcast_tensor_dict( if isinstance(value, TensorMetadata): tensor = torch.empty(value.size, dtype=value.dtype, - device="cuda") + device="hpu" if is_hpu() else "cuda") async_handle = torch.distributed.broadcast(tensor, src=src, async_op=True, diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 534cb75c2fd2f..9e343d11b151d 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -7,7 +7,7 @@ from vllm.model_executor.layers.ops.sample import get_num_triton_sampler_splits from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import SequenceData -from vllm.utils import is_pin_memory_available +from vllm.utils import is_pin_memory_available, is_hpu _SAMPLING_EPS = 1e-5 _SEED_0_REPLACEMENT = 3403598558 @@ -269,19 +269,19 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], sample_indices_t = torch.tensor( sample_indices, device="cpu", - dtype=torch.long, + dtype=torch.int, pin_memory=pin_memory, ) prompt_tensor = torch.tensor( prompt_padded_tokens, device="cpu", - dtype=torch.long, + dtype=torch.int, pin_memory=pin_memory, ) output_tensor = torch.tensor( output_padded_tokens, device="cpu", - dtype=torch.long, + dtype=torch.int, pin_memory=pin_memory, ) # need to transpose and make contiguous to @@ -290,7 +290,7 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], sampling_seeds_t = torch.tensor( sampling_seeds, device="cpu", - dtype=torch.long, + dtype=torch.int, pin_memory=pin_memory, ).T.contiguous() @@ -339,7 +339,7 @@ def _get_sequence_seeds( else: generator = random.Random(str((seed, ) + extra_entropy)) randint_fn = generator.randint - lo, hi = torch.iinfo(torch.long).min, torch.iinfo(torch.long).max + lo, hi = torch.iinfo(torch.int).min, torch.iinfo(torch.int).max # If the user/random sets seed = 0 but request should # have sampling, we need 
to change it to something # else. We use a constant in that case. diff --git a/vllm/utils.py b/vllm/utils.py index 4b9558ffe88d8..941be08047967 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -6,6 +6,7 @@ import subprocess import uuid import warnings +import importlib from collections import OrderedDict from functools import lru_cache, partial from platform import uname @@ -125,6 +126,9 @@ def is_neuron() -> bool: transformers_neuronx = None return transformers_neuronx is not None +@lru_cache(maxsize=None) +def is_hpu() -> bool: + return importlib.util.find_spec('habana_frameworks') is not None @lru_cache(maxsize=None) def get_max_shared_memory_bytes(gpu: int = 0) -> int: @@ -350,6 +354,9 @@ def is_pin_memory_available() -> bool: elif is_neuron(): print_warning_once("Pin memory is not supported on Neuron.") return False + elif is_hpu(): + print_warning_once("Pin memory is not supported on HPU.") + return False return True @@ -377,6 +384,52 @@ def __exit__(self, exc_type, exc_val, exc_tb): gc.collect() + + +class HabanaMemoryProfiler: + + def __init__(self, device=None): + self.device = device + + def current_memory_usage() -> float: + # Return the memory usage in bytes. + free_hpu_memory, total_hpu_memory = torch.hpu.mem_get_info() + return total_hpu_memory - free_hpu_memory + + def current_free_memory() -> float: + # Return the memory usage in bytes. + free_hpu_memory, _ = torch.hpu.mem_get_info() + return free_hpu_memory + + def total_memory() -> float: + # Return the memory usage in bytes. + _, total_hpu_memory = torch.hpu.mem_get_info() + return total_hpu_memory + + def __enter__(self): + self.initial_memory = HabanaMemoryProfiler.current_memory_usage() + # This allows us to call methods of the context manager if needed + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.final_memory = HabanaMemoryProfiler.current_memory_usage() + self.consumed_memory = self.final_memory - self.initial_memory + + # Force garbage collection + gc.collect() + +# Adapted from https://stackoverflow.com/a/49361727 +def format_bytes(size): + # 2**10 = 1024 + power = 2**10 + n = 0 + power_labels = {0 : '', 1: 'Ki', 2: 'Mi', 3: 'Gi', 4: 'Ti'} + while abs(size) > power: + size /= power + n += 1 + return f'{size:.4g} {power_labels[n]+"B"}' + + def pad_to_max_length(x: List[int], max_len: int, pad: int) -> List[int]: assert len(x) <= max_len return x + [pad] * (max_len - len(x)) diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 27d1727cd16a3..eb21a31842533 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -6,7 +6,10 @@ from vllm.attention import get_attn_backend from vllm.config import CacheConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, is_pin_memory_available +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, is_pin_memory_available, is_hpu + +if is_hpu(): + import habana_frameworks.torch as htorch logger = init_logger(__name__) @@ -46,7 +49,7 @@ def __init__( self.attn_backend = get_attn_backend(model_config.dtype) # Initialize the cache. 
- self.gpu_cache = self._allocate_kv_cache(self.num_gpu_blocks, "cuda") + self.gpu_cache = self._allocate_kv_cache(self.num_gpu_blocks, "hpu" if is_hpu() else "cuda") self.cpu_cache = self._allocate_kv_cache(self.num_cpu_blocks, "cpu") def _allocate_kv_cache( @@ -60,11 +63,21 @@ def _allocate_kv_cache( pin_memory = is_pin_memory_available() if device == "cpu" else False kv_cache: List[torch.Tensor] = [] for _ in range(self.num_layers): - kv_cache.append( - torch.empty(kv_cache_shape, - dtype=self.dtype, - pin_memory=pin_memory, - device=device)) + if device == 'hpu': + key_cache = torch.zeros(kv_cache_shape, + dtype=self.dtype, + device=device) + value_cache = torch.zeros(kv_cache_shape, + dtype=self.dtype, + device=device) + kv_layer = (key_cache, value_cache) + kv_cache.append(kv_layer) + else: + kv_layer = torch.empty(kv_cache_shape, + dtype=self.dtype, + pin_memory=pin_memory, + device=device) + kv_cache.append(kv_layer) return kv_cache def swap_in(self, src_to_dst: Dict[int, int]) -> None: diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py new file mode 100644 index 0000000000000..7ced639a7fb03 --- /dev/null +++ b/vllm/worker/habana_model_runner.py @@ -0,0 +1,1168 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +############################################################################### + +import contextlib +import time +from typing import Dict, List, Optional, Set, Tuple + +# for logging hpugraph capture +import tqdm +import pandas as pd +import tabulate + +import os +import contextlib +import math +import itertools +import numpy as np +import torch +import torch.nn as nn +import habana_frameworks.torch as htorch +from habana_frameworks.torch.hpu.metrics import metric_localcontext + +from vllm.attention import AttentionMetadata, get_attn_backend +from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, + SchedulerConfig) +from vllm.logger import init_logger +from vllm.lora.layers import LoRAMapping +from vllm.lora.request import LoRARequest +from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager +from vllm.model_executor import SamplingMetadata +from vllm.model_executor.model_loader import get_model +from vllm.model_executor.parallel_utils import cupy_utils, custom_all_reduce +from vllm.model_executor.parallel_utils.communication_op import ( + broadcast_tensor_dict) +from vllm.model_executor.parallel_utils.parallel_state import ( + with_cupy_nccl_for_all_reduce) +from vllm.sampling_params import SamplingParams, SamplingType +from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata +from vllm.utils import (HabanaMemoryProfiler, async_tensor_h2d, + is_pin_memory_available, make_tensor_with_pad, + maybe_expand_dim, pad_to_max_length, format_bytes) + +logger = init_logger(__name__) + +_PAD_SLOT_ID = -1 +LORA_WARMUP_RANK = 8 +_BATCH_SIZE_ALIGNMENT = 16 +# Capture graphs for token size 1, 2, 4, 8, 16, 32, 48, ..., 512. +# NOTE: _get_graph_batch_size needs to be updated if this list is changed. +_BATCH_SIZES_TO_CAPTURE = [1, 2, 4, 8] + [ + _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33) +] + +# Capture graphs for token size 1, 32, 64, 128, 256, 512, 768 ... 
2048 +_MAX_CONTEXT_LEN_ALIGNMENT = 256 +_MAX_CONTEXT_LENS_TO_CAPTURE = [1, 32, 64, 128] + [ + _MAX_CONTEXT_LEN_ALIGNMENT * i for i in range(1, 9) +] + + +class HabanaModelRunner: + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + kv_cache_dtype: Optional[str] = "auto", + is_driver_worker: bool = False, + ): + self.model_config = model_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.lora_config = lora_config + self.is_driver_worker = is_driver_worker + + # model_config can be None in tests/samplers/test_sampler.py. + # FIXME(woosuk): This is a hack to make the tests work. Refactor this. + self.sliding_window = (model_config.get_sliding_window() + if model_config is not None else None) + self.device_config = (device_config + if device_config is not None else DeviceConfig()) + self.device = self.device_config.device + + self.model = None + self.block_size = None # Set after initial profiling. + self.lora_manager = None + self.graph_runner_class = HPUGraphRunner + self.graph_runners: Dict[Tuple[int, int], self.graph_runner_class] = {} + + self.max_context_len_to_capture = ( + self.model_config.max_context_len_to_capture + if self.model_config is not None else 0) + # When using CUDA graph, the input block tables must be padded to + # max_context_len_to_capture. However, creating the block table in + # Python can be expensive. To optimize this, we cache the block table + # in numpy and only copy the actual input content at every iteration. + # The shape of the cached block table will be + # (max batch size to capture, max context len to capture / block size). + self.graph_block_tables = None # Set after initial profiling. 
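# --- Editorial sketch (not part of the patch) --------------------------------
# What the two module-level bucket lists above expand to. Decode batches and
# context lengths are padded up to one of these values, so these are the only
# (batch_size, max_context_len) shapes for which HPUGraphs get captured.
_BATCH_SIZE_ALIGNMENT = 16
_BATCH_SIZES_TO_CAPTURE = [1, 2, 4, 8] + [_BATCH_SIZE_ALIGNMENT * i for i in range(1, 33)]
assert _BATCH_SIZES_TO_CAPTURE[:6] == [1, 2, 4, 8, 16, 32]
assert _BATCH_SIZES_TO_CAPTURE[-1] == 512 and len(_BATCH_SIZES_TO_CAPTURE) == 36

_MAX_CONTEXT_LEN_ALIGNMENT = 256
_MAX_CONTEXT_LENS_TO_CAPTURE = [1, 32, 64, 128] + [_MAX_CONTEXT_LEN_ALIGNMENT * i for i in range(1, 9)]
assert _MAX_CONTEXT_LENS_TO_CAPTURE == [1, 32, 64, 128, 256, 512, 768,
                                        1024, 1280, 1536, 1792, 2048]
# ------------------------------------------------------------------------------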
+ self.pin_memory = is_pin_memory_available() + self.kv_cache_dtype = kv_cache_dtype + + self.attn_backend = get_attn_backend( + self.model_config.dtype if model_config is not None else None) + + def load_model(self) -> None: + with HabanaMemoryProfiler() as m: + self.model = get_model(self.model_config, + self.device_config, + lora_config=self.lora_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config) + + self.model_memory_usage = m.consumed_memory + logger.info(f"Loading model weights took " + f"{format_bytes(self.model_memory_usage)} ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") + + if self.lora_config: + assert hasattr(self.model, "supported_lora_modules" + ) and self.model.supported_lora_modules, ( + "Model does not support LoRA") + assert hasattr( + self.model, + "embedding_modules"), "Model does not have embedding_modules" + assert hasattr(self.model, "embedding_padding_modules" + ), "Model does not have embedding_padding_modules" + self.lora_manager = LRUCacheWorkerLoRAManager( + self.scheduler_config.max_num_seqs, + self.scheduler_config.max_num_batched_tokens, self.vocab_size, + self.lora_config, self.device, self.model.embedding_modules, + self.model.embedding_padding_modules) + self.model = self.lora_manager.create_lora_manager(self.model) + + def set_block_size(self, block_size: int) -> None: + self.block_size = block_size + + self.graph_block_tables = np.zeros( + (max(_BATCH_SIZES_TO_CAPTURE), self.get_max_block_per_batch()), + dtype=np.int32) + + def get_max_block_per_batch(self) -> int: + block_size = self.block_size + return (self.max_context_len_to_capture + block_size - 1) // block_size + + def _prepare_prompt( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], + List[int], List[int], List[int], Set[LoRARequest]]: + assert len(seq_group_metadata_list) > 0 + input_tokens: List[List[int]] = [] + input_positions: List[List[int]] = [] + slot_mapping: List[List[int]] = [] + lora_index_mapping: List[List[int]] = [] + lora_prompt_mapping: List[List[int]] = [] + lora_requests: Set[LoRARequest] = set() + + prompt_lens: List[int] = [] + context_lens: List[int] = [] + subquery_lens: List[int] = [] + prefix_block_tables: List[List[int]] = [] + for seq_group_metadata in seq_group_metadata_list: + assert seq_group_metadata.is_prompt + seq_ids = list(seq_group_metadata.seq_data.keys()) + assert len(seq_ids) == 1 + seq_id = seq_ids[0] + + seq_data = seq_group_metadata.seq_data[seq_id] + prompt_tokens = seq_data.get_token_ids() + prompt_len = len(prompt_tokens) + prompt_lens.append(prompt_len) + computed_len = 0 + + # NOTE: This only works for oooooooxxx style attention. + computed_block_nums = seq_group_metadata.computed_block_nums + if computed_block_nums is not None and len( + computed_block_nums) > 0 and self.sliding_window is None: + # Prefix is not supported with sliding_window + computed_len = len(computed_block_nums) * self.block_size + prompt_tokens = prompt_tokens[computed_len:] + prefix_block_tables.append(computed_block_nums) + context_len = computed_len + else: + prefix_block_tables.append([]) + context_len = 0 + # actual prompt lens + context_lens.append(context_len) + if computed_len != 0: + import pdb; pdb.set_trace() # what happens if we hit that path?? 
+ subquery_lens.append(prompt_len - computed_len) + + input_tokens.append(prompt_tokens) + # NOTE(woosuk): Here we assume that the first token in the prompt + # is always the first token in the sequence. + input_positions.append( + list(range(computed_len, computed_len + len(prompt_tokens)))) + + lora_id = seq_group_metadata.lora_int_id + + if lora_id > 0: + lora_requests.add(seq_group_metadata.lora_request) + + lora_index_mapping += [lora_id] * (prompt_len - computed_len) + lora_prompt_mapping.append( + [lora_id] * + (prompt_len - computed_len + if seq_group_metadata.sampling_params.prompt_logprobs else 1)) + + if seq_group_metadata.block_tables is None: + # During memory profiling, the block tables are not initialized + # yet. In this case, we just use a dummy slot mapping. + slot_mapping.append([_PAD_SLOT_ID] * prompt_len) + continue + + # Compute the slot mapping. + slot_mapping.append([]) + block_table = seq_group_metadata.block_tables[seq_id] + # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, + # where start_idx is max(0, prompt_len - sliding_window). + # For example, if the prompt len is 10, sliding window is 8, and + # block size is 4, the first two tokens are masked and the slot + # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. + start_idx = 0 + if self.sliding_window is not None: + assert computed_len == 0, ( + "Prefix caching is currently not supported with " + "sliding window attention") + start_idx = max(0, prompt_len - self.sliding_window) + for i in range(computed_len, prompt_len): + if i < start_idx: + slot_mapping[-1].append(_PAD_SLOT_ID) + continue + + block_number = block_table[i // self.block_size] + block_offset = i % self.block_size + slot = block_number * self.block_size + block_offset + slot_mapping[-1].append(slot) + + max_subquery_len = max(subquery_lens) + max_prompt_len = max(prompt_lens) + num_prompt_tokens = len(input_tokens) + assert max_subquery_len > 0 + + lora_index_mapping = lora_index_mapping + + context_lens_tensor = torch.tensor(context_lens, + dtype=torch.int, + device=self.device) + max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) + max_prompt_len = max(prompt_lens) + input_tokens = make_tensor_with_pad(input_tokens, + max_prompt_len, + pad=0, + dtype=torch.long, + device=self.device) + + input_positions = make_tensor_with_pad(input_positions, + max_prompt_len, + pad=0, + dtype=torch.long, + device=self.device) + + slot_mapping = make_tensor_with_pad(slot_mapping, + max_prompt_len, + pad=_PAD_SLOT_ID, + dtype=torch.long, + device=self.device) + + # Prepare prefix block tables + block_tables = make_tensor_with_pad( + prefix_block_tables, + max_len=max_prompt_block_table_len, + pad=0, + dtype=torch.int, + device=self.device, + ) + + # Query length can be shorter than key (i.e., prompt) when prefill + # is chunked or prefix cached. 
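# --- Editorial sketch (not part of the patch) --------------------------------
# Reproducing the sliding-window slot-mapping example from the comment in the
# loop above (prompt_len=10, sliding_window=8, block_size=4). The physical
# block assignment block_table=[0, 1, 0] is an assumption chosen to match the
# numbers in that comment; the masked-out first block is reused.
_PAD_SLOT_ID = -1
prompt_len, sliding_window, block_size = 10, 8, 4
block_table = [0, 1, 0]
start_idx = max(0, prompt_len - sliding_window)
slot_mapping = []
for i in range(prompt_len):
    if i < start_idx:
        slot_mapping.append(_PAD_SLOT_ID)
        continue
    block_number = block_table[i // block_size]
    block_offset = i % block_size
    slot_mapping.append(block_number * block_size + block_offset)
assert slot_mapping == [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]
# ------------------------------------------------------------------------------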
+ subquery_lens_tensor = torch.tensor(subquery_lens, + dtype=torch.long, + device=self.device) + subquery_start_loc = torch.zeros(subquery_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=self.device) + + prompt_lens_tensor = torch.tensor(prompt_lens, + dtype=torch.long, + device=self.device) + seq_start_loc = torch.zeros(prompt_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=self.device) + + torch.cumsum(subquery_lens_tensor, + dim=0, + dtype=subquery_start_loc.dtype, + out=subquery_start_loc[1:]) + + torch.cumsum(prompt_lens_tensor, + dim=0, + dtype=seq_start_loc.dtype, + out=seq_start_loc[1:]) + attn_metadata = self.attn_backend.make_metadata( + is_prompt=True, + slot_mapping=slot_mapping, + prompt_lens=prompt_lens, + prompt_lens_tensor=prompt_lens_tensor, + num_prompt_tokens=num_prompt_tokens, + num_generation_tokens=0, + max_subquery_len=max_subquery_len, + max_context_len=None, + max_prompt_len=max_prompt_len, + subquery_start_loc=subquery_start_loc, + seq_start_loc=seq_start_loc, + context_lens=context_lens_tensor, + block_tables=block_tables, + use_cuda_graph=False, + kv_cache_dtype=self.kv_cache_dtype, + ) + return (input_tokens, input_positions, attn_metadata, prompt_lens, + subquery_lens, lora_index_mapping, lora_prompt_mapping, + lora_requests) + + def _prepare_decode( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], + List[int], Set[LoRARequest]]: + assert len(seq_group_metadata_list) > 0 + input_tokens: List[List[int]] = [] + input_positions: List[List[int]] = [] + slot_mapping: List[List[int]] = [] + context_lens: List[int] = [] + block_tables: List[List[int]] = [] + lora_index_mapping: List[int] = [] + lora_prompt_mapping: List[int] = [] + lora_requests: Set[LoRARequest] = set() + + for seq_group_metadata in seq_group_metadata_list: + assert not seq_group_metadata.is_prompt + + seq_ids = list(seq_group_metadata.seq_data.keys()) + lora_id = seq_group_metadata.lora_int_id + + if lora_id > 0: + lora_requests.add(seq_group_metadata.lora_request) + + for seq_id in seq_ids: + seq_data = seq_group_metadata.seq_data[seq_id] + generation_token = seq_data.get_last_token_id() + input_tokens.append([generation_token]) + + seq_len = seq_data.get_len() + position = seq_len - 1 + input_positions.append([position]) + + context_len = seq_len if self.sliding_window is None else min( + seq_len, self.sliding_window) + context_lens.append(context_len) + + block_table = seq_group_metadata.block_tables[seq_id] + block_number = block_table[position // self.block_size] + block_offset = position % self.block_size + slot = block_number * self.block_size + block_offset + slot_mapping.append([slot]) + lora_index_mapping.append(lora_id) + lora_prompt_mapping.append(lora_id) + + if self.sliding_window is not None: + sliding_window_blocks = (self.sliding_window // + self.block_size) + block_table = block_table[-sliding_window_blocks:] + block_tables.append(block_table) + + # vLLM uses cuda graph only for decoding requests. + # See `capture_model` API for more details. + # For decoding requests, batch_size == input_tokens. 
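# --- Editorial sketch (not part of the patch) --------------------------------
# The cumulative start locations built in _prepare_prompt above: for query
# lengths [4, 6], subquery_start_loc / seq_start_loc come out as [0, 4, 10],
# i.e. an exclusive prefix sum written into the tail of a zero tensor.
import torch
lens = torch.tensor([4, 6], dtype=torch.long)
start_loc = torch.zeros(lens.shape[0] + 1, dtype=torch.int32)
torch.cumsum(lens, dim=0, dtype=start_loc.dtype, out=start_loc[1:])
assert start_loc.tolist() == [0, 4, 10]
# ------------------------------------------------------------------------------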
+ batch_size = len(input_tokens) + max_context_len = max(context_lens) + use_captured_graph = ( + not self.model_config.enforce_eager + and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1] + and max_context_len <= self.max_context_len_to_capture) + if use_captured_graph: + graph_batch_size = _get_graph_batch_size(batch_size) + assert graph_batch_size >= batch_size + for _ in range(graph_batch_size - batch_size): + input_tokens.append([0]) + input_positions.append([0]) + slot_mapping.append([_PAD_SLOT_ID]) + context_lens.append(1) + block_tables.append([]) + lora_index_mapping.append(0) + batch_size = graph_batch_size + + input_tokens = torch.tensor(input_tokens, + dtype=torch.long, + device=self.device) + input_positions = torch.tensor(input_positions, + dtype=torch.long, + device=self.device) + slot_mapping = torch.tensor(slot_mapping, + dtype=torch.long, + device=self.device) + context_lens = torch.tensor(context_lens, + dtype=torch.int, + device=self.device) + + if use_captured_graph: + # When using cuda-graph all these tensors should be + # padded. + assert context_lens.shape[0] == input_tokens.shape[0] + assert context_lens.shape[0] == input_positions.shape[0] + assert context_lens.shape[0] == slot_mapping.shape[0] + + # The shape of graph_block_tables is + # [max batch size, max context len // block size]. + graph_max_context_len = _get_graph_max_context_len(max_context_len) + assert graph_max_context_len >= max_context_len + graph_block_count = math.ceil(graph_max_context_len / self.block_size) + input_block_tables = self.graph_block_tables[:batch_size, :graph_block_count] + + for i, block_table in enumerate(block_tables): + if block_table: + input_block_tables[i, :len(block_table)] = block_table + block_tables = torch.tensor(input_block_tables, device=self.device) + else: + max_block_table_len = max( + len(block_table) for block_table in block_tables) + block_tables = make_tensor_with_pad( + block_tables, + max_len=max_block_table_len, + pad=0, + dtype=torch.int, + device=self.device, + ) + + attn_metadata = self.attn_backend.make_metadata( + is_prompt=False, + slot_mapping=slot_mapping, + prompt_lens=None, + prompt_lens_tensor=None, + num_prompt_tokens=0, + num_generation_tokens=len(input_tokens), + max_subquery_len=None, + max_context_len=max_context_len, + max_prompt_len=None, + subquery_start_loc=None, + seq_start_loc=None, + context_lens=context_lens, + block_tables=block_tables, + use_cuda_graph=use_captured_graph, + kv_cache_dtype=self.kv_cache_dtype, + ) + return (input_tokens, input_positions, attn_metadata, + lora_index_mapping, lora_prompt_mapping, lora_requests) + + def _prepare_sample( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + prompt_lens: List[int], + subquery_lens: Optional[List[int]], + ) -> SamplingMetadata: + seq_groups: List[Tuple[List[int], SamplingParams]] = [] + selected_token_indices: List[int] = [] + generators: List[torch.Generator] = [] + selected_token_start_idx = 0 + categorized_sample_indices = {t: [] for t in SamplingType} + categorized_sample_indices_start_idx = 0 + categorized_sampled_token_indices_start_idx = 0 + max_subquery_len = max(subquery_lens) if subquery_lens else 1 + for i, seq_group_metadata in enumerate(seq_group_metadata_list): + seq_ids = list(seq_group_metadata.seq_data.keys()) + sampling_params = seq_group_metadata.sampling_params + seq_groups.append((seq_ids, sampling_params)) + + if seq_group_metadata.is_prompt: + assert len(seq_ids) == 1 + assert subquery_lens is not None + subquery_len = subquery_lens[i] + if 
sampling_params.prompt_logprobs is not None: + # NOTE: prompt token positions do not need sample, skip + categorized_sample_indices_start_idx += subquery_len - 1 + + categorized_sample_indices[ + sampling_params.sampling_type].append([ + categorized_sample_indices_start_idx, + categorized_sampled_token_indices_start_idx + ]) + categorized_sample_indices_start_idx += 1 + categorized_sampled_token_indices_start_idx += 1 + + if sampling_params.prompt_logprobs is not None: + selected_token_indices.extend( + range(selected_token_start_idx, + selected_token_start_idx + subquery_len - 1)) + selected_token_indices.append(selected_token_start_idx + + subquery_len - 1) + selected_token_start_idx += max_subquery_len + + if sampling_params.seed is not None: + seq_group_metadata.state.generator = torch.Generator( + device=self.device).manual_seed(sampling_params.seed) + else: + num_seqs = len(seq_ids) + selected_token_indices.extend( + range(selected_token_start_idx, + selected_token_start_idx + num_seqs)) + selected_token_start_idx += num_seqs + + categorized_sample_indices[ + sampling_params.sampling_type].extend( + zip( + range( + categorized_sample_indices_start_idx, + categorized_sample_indices_start_idx + + num_seqs), + range( + categorized_sampled_token_indices_start_idx, + categorized_sampled_token_indices_start_idx + + num_seqs))) + categorized_sample_indices_start_idx += num_seqs + categorized_sampled_token_indices_start_idx += num_seqs + + if sampling_params.seed is not None: + generators.append(seq_group_metadata.state.generator) + + selected_token_indices = async_tensor_h2d(selected_token_indices, + dtype=torch.long, + target_device=self.device, + pin_memory=self.pin_memory) + + categorized_sample_indices = { + t: maybe_expand_dim( + async_tensor_h2d(seq_ids, + dtype=torch.int, + target_device=self.device, + pin_memory=self.pin_memory), 2, 2) + for t, seq_ids in categorized_sample_indices.items() + } + + seq_data: Dict[int, SequenceData] = {} + for seq_group_metadata in seq_group_metadata_list: + seq_data.update(seq_group_metadata.seq_data) + + sampling_metadata = SamplingMetadata( + seq_groups=seq_groups, + seq_data=seq_data, + prompt_lens=prompt_lens, + selected_token_indices=selected_token_indices, + categorized_sample_indices=categorized_sample_indices, + generators=generators, + ) + return sampling_metadata + + def prepare_input_tensors( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, + Set[int], LoRAMapping]: + if self.is_driver_worker: + # NOTE: We assume that all sequences in the group are all prompts or + # all decodes. + is_prompt = seq_group_metadata_list[0].is_prompt + # Prepare input tensors. + if is_prompt: + (input_tokens, input_positions, attn_metadata, prompt_lens, + subquery_lens, lora_index_mapping, lora_prompt_mapping, + lora_requests) = self._prepare_prompt(seq_group_metadata_list) + else: + (input_tokens, input_positions, attn_metadata, + lora_index_mapping, lora_prompt_mapping, + lora_requests) = self._prepare_decode(seq_group_metadata_list) + prompt_lens = [] + subquery_lens = None + sampling_metadata = self._prepare_sample(seq_group_metadata_list, + prompt_lens, + subquery_lens) + + if self.lora_config: + lora_mapping = LoRAMapping( + lora_index_mapping, + lora_prompt_mapping, + ) + else: + lora_mapping = None + + # Broadcast the metadata. 
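# --- Editorial sketch (not part of the patch) --------------------------------
# The driver/worker exchange implemented in the lines below, with the real
# collective replaced by a local stub so the control flow can be read without
# a distributed setup: rank 0 packs metadata into a dict and broadcasts it;
# the other ranks receive it and rebuild their attention metadata from it.
_channel = {}

def broadcast_tensor_dict(d=None, src=0):  # stand-in for the vLLM collective
    if d is not None:
        _channel.update(d)
        return d
    return dict(_channel)

# driver (rank 0)
broadcast_tensor_dict({"input_tokens": [1, 2, 3], "input_positions": [0, 1, 2]}, src=0)
# any non-driver rank
received = broadcast_tensor_dict(src=0)
assert received["input_tokens"] == [1, 2, 3]
# ------------------------------------------------------------------------------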
+ metadata_dict = { + "input_tokens": input_tokens, + "input_positions": input_positions, + "selected_token_indices": + sampling_metadata.selected_token_indices, + "lora_requests": lora_requests, + "lora_mapping": lora_mapping, + } + metadata_dict.update(attn_metadata.asdict_zerocopy()) + broadcast_tensor_dict(metadata_dict, src=0) + else: + metadata_dict = broadcast_tensor_dict(src=0) + input_tokens = metadata_dict.pop("input_tokens") + input_positions = metadata_dict.pop("input_positions") + selected_token_indices = metadata_dict.pop( + "selected_token_indices") + lora_mapping = metadata_dict.pop("lora_mapping") + lora_requests = metadata_dict.pop("lora_requests") + attn_metadata = self.attn_backend.make_metadata(**metadata_dict) + sampling_metadata = SamplingMetadata( + seq_groups=None, + seq_data=None, + prompt_lens=None, + selected_token_indices=selected_token_indices, + categorized_sample_indices=None, + generators=None, + perform_sampling=False, + ) + + return (input_tokens, input_positions, attn_metadata, + sampling_metadata, lora_requests, lora_mapping) + + @torch.inference_mode() + def execute_model( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + kv_caches: List[torch.Tensor], + ) -> Optional[SamplerOutput]: + (input_tokens, input_positions, attn_metadata, sampling_metadata, + lora_requests, + lora_mapping) = self.prepare_input_tensors(seq_group_metadata_list) + + if self.lora_config: + self.set_active_loras(lora_requests, lora_mapping) + + # Execute the model. + if attn_metadata.use_cuda_graph: + graph_batch_size = input_tokens.shape[0] + graph_block_count = attn_metadata.block_tables.shape[1] + graph_runner_key = (graph_batch_size, graph_block_count) + model_executable = self.graph_runners[graph_runner_key] + logger.info(f"Executing {self.graph_runner_class.__name__} with batch {graph_batch_size}, block_count {graph_block_count} (context_len up to {graph_block_count*self.block_size}, currently {torch.max(attn_metadata.context_lens).item()})") + else: + model_executable = self.model + hidden_states = model_executable( + input_ids=input_tokens, + positions=input_positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + ) + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + # Compute the logits. + logits = self.model.compute_logits(hidden_states, sampling_metadata) + + # Only perform sampling in the driver worker. + if not sampling_metadata.perform_sampling: + return None + + # Sample the next token. + output = self.model.sample( + logits=logits, + sampling_metadata=sampling_metadata, + ) + return output + + @torch.inference_mode() + def profile_run(self) -> None: + # Enable top-k sampling to reflect the accurate memory usage. + sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) + max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens + max_num_seqs = self.scheduler_config.max_num_seqs + + # This represents the maximum number of different requests + # that will have unique loras, an therefore the max amount of memory + # consumption create dummy lora request copies from the lora request + # passed in, which contains a lora from the lora warmup path. 
+ dummy_lora_requests = [] + dummy_lora_requests_per_seq = [] + if self.lora_config: + for idx in range(self.lora_config.max_loras): + lora_id = idx + 1 + dummy_lora_request = LoRARequest( + lora_name=f"warmup_{lora_id}", + lora_int_id=lora_id, + lora_local_path="/not/a/real/path", + ) + self.lora_manager.add_dummy_lora(dummy_lora_request, + rank=LORA_WARMUP_RANK) + dummy_lora_requests.append(dummy_lora_request) + dummy_lora_requests_per_seq = [ + dummy_lora_requests[idx % len(dummy_lora_requests)] + for idx in range(max_num_seqs) + ] + + # Profile memory usage with max_num_sequences sequences and the total + # number of tokens equal to max_num_batched_tokens. + seqs: List[SequenceGroupMetadata] = [] + for group_id in range(max_num_seqs): + seq_len = (max_num_batched_tokens // max_num_seqs + + (group_id < max_num_batched_tokens % max_num_seqs)) + seq_data = SequenceData([0] * seq_len) + seq = SequenceGroupMetadata( + request_id=str(group_id), + is_prompt=True, + seq_data={group_id: seq_data}, + sampling_params=sampling_params, + block_tables=None, + lora_request=dummy_lora_requests_per_seq[group_id] + if dummy_lora_requests_per_seq else None, + ) + seqs.append(seq) + + # Run the model with the dummy inputs. + num_layers = self.model_config.get_num_layers(self.parallel_config) + kv_caches = [None] * num_layers + self.execute_model(seqs, kv_caches) + torch.hpu.synchronize() + return + + def remove_all_loras(self) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.remove_all_loras() + + def set_active_loras(self, lora_requests: List[LoRARequest], + lora_mapping: LoRAMapping) -> None: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + self.lora_manager.set_active_loras(lora_requests, lora_mapping) + + def add_lora(self, lora_request: LoRARequest) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.remove_lora(lora_id) + + def list_loras(self) -> Set[int]: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.list_loras() + + @torch.inference_mode() + def capture_model(self, kv_caches: List[torch.Tensor]) -> None: + """Cuda graph capture a model. + + Note that CUDA graph's performance gain is negligible if number + of batched tokens are larger than 200. And since CUDA graph + requires fixed sized tensors, supporting large/variable batch + size requires high GPU memory overhead. Thus, vLLM only captures + decoding requests. Mixed batch (chunked prefill + decoding) or + prefill requests are not captured. + + Since it is used for decoding-only, it assumes there's only 1 token + per sequence in the batch. + """ + # NOTE(woosuk): This is a hack to ensure that the NCCL backend is never + # deleted before the CUDA graphs. + self.cupy_nccl_backend = cupy_utils.get_nccl_backend() + + assert not self.model_config.enforce_eager + logger.info("Capturing the model for HPUGraphs. This may lead to " + "unexpected consequences if the model is not static. To " + "run the model in eager mode, set 'enforce_eager=True' or " + "use '--enforce-eager' in the CLI.") + logger.info("HPUGraphs can take additional ~10 GiB memory per HPU. " + "If you are running out of memory, consider decreasing " + "`gpu_memory_utilization` or enforcing eager mode. 
" + "You can also reduce the `max_num_seqs` as needed " + "to decrease memory usage.") + start_time = time.perf_counter() + + # Prepare dummy inputs. These will be reused for all batch sizes. + max_batch_size = max(_BATCH_SIZES_TO_CAPTURE) + input_tokens = torch.zeros(max_batch_size, 1, dtype=torch.long).to('hpu') + input_positions = torch.zeros(max_batch_size, 1, dtype=torch.long).to('hpu') + slot_mapping = torch.zeros(max_batch_size, 1, dtype=torch.long).to('hpu') # TODO(kzawora): when using torch.empty, following occurs: RuntimeError: Error when trying to cast Long to Int, Input values range [0, 139632108750000] exceeds Int range [-2147483648, 2147483647] + slot_mapping.fill_(_PAD_SLOT_ID) + context_lens = torch.ones(max_batch_size, dtype=torch.int32).to('hpu') + block_tables = torch.from_numpy(self.graph_block_tables).to('hpu') + + graph_batch_size = _get_graph_batch_size( + self.scheduler_config.max_num_seqs) + batch_size_capture_list = [ + bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size + ] + + # NOTE(woosuk): There are 3 backends for all-reduce: custom all-reduce + # kernel, CuPy NCCL, and PyTorch NCCL. When using CUDA graph, we use + # either custom all-reduce kernel or CuPy NCCL. When not using CUDA + # graph, we use either custom all-reduce kernel or PyTorch NCCL. + # We always prioritize using custom all-reduce kernel but fall back + # to PyTorch or CuPy NCCL if it is disabled or not supported. + with custom_all_reduce.capture(): + # NOTE: Capturing the largest batch size first may help reduce the + # memory usage of CUDA graph. + valid_combinations = [] + total_combinations = len(_BATCH_SIZES_TO_CAPTURE)*len(_MAX_CONTEXT_LENS_TO_CAPTURE) + import pandas as pd + df = pd.DataFrame(index=_BATCH_SIZES_TO_CAPTURE, columns=_MAX_CONTEXT_LENS_TO_CAPTURE) + for idx, (batch_size, max_context_len) in enumerate(itertools.product(reversed(_BATCH_SIZES_TO_CAPTURE), reversed(_MAX_CONTEXT_LENS_TO_CAPTURE))): + block_count = math.ceil(max_context_len / self.block_size) + # Skip capture of "out-of-bound" batch sizes and context lengths + if batch_size > self.scheduler_config.max_num_seqs: + logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}. Reason: Batch out of bound.") + df[max_context_len][batch_size] = 'batch OoB' + continue + if max_context_len > self.max_context_len_to_capture: + logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}. Reason: Nax context length out of bound.") + df[max_context_len][batch_size] = 'ctx OoB' + continue + block_count = math.ceil(max_context_len / self.block_size) + captured_block_counts = [math.ceil(cl / self.block_size) for (n, cl) in valid_combinations if n == batch_size] + if block_count in captured_block_counts: + logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}. Reason: Block size already captured.") + df[max_context_len][batch_size] = 'redundant' + continue + logger.debug(f"[{idx}/{total_combinations}] Will capture for batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}. Constraints met.") + df[max_context_len][batch_size] = 'VALID' + valid_combinations.append((batch_size, max_context_len)) + + total_valid_hpugraphs = len(valid_combinations) + logger.info(f"Starting capture {total_valid_hpugraphs} valid HPUGraphs. 
Skipping capture of {total_combinations-total_valid_hpugraphs}/{total_combinations} graphs due to batch/context constraints.") + logger.debug(f"Capture summary (row: batch_size; col: max_context_len):") + logger.debug(tabulate.tabulate(df, tablefmt='mixed_outline', headers='keys', showindex="always")) + + graph_runner_name = self.graph_runner_class.__name__ + graph_mem_usage_df = pd.DataFrame(index=list(reversed(sorted({b for b,c in valid_combinations}))), columns=list(reversed(sorted({c for b,c in valid_combinations})))) + pbar = tqdm.tqdm(valid_combinations) + start_mem = HabanaMemoryProfiler.current_memory_usage() + log_graph_compilation_all = os.environ.get('VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0' + log_graph_compilation = os.environ.get('VLLM_HPU_LOG_STEP_GRAPH_COMPILATION', '0') != '0' or log_graph_compilation_all + + for idx, (batch_size, max_context_len) in enumerate(pbar): + block_count = math.ceil(max_context_len / self.block_size) + # Create dummy attn_metadata. + attn_metadata = self.attn_backend.make_metadata( + is_prompt=False, + slot_mapping=slot_mapping[:batch_size], + prompt_lens=None, + prompt_lens_tensor=None, + num_prompt_tokens=0, + num_generation_tokens=batch_size, + max_subquery_len=None, + max_context_len=block_count*self.block_size, + max_prompt_len=None, + subquery_start_loc=None, + seq_start_loc=None, + context_lens=context_lens[:batch_size], + block_tables=block_tables[:batch_size, :block_count], + use_cuda_graph=True, + kv_cache_dtype=self.kv_cache_dtype, + ) + + if self.lora_config: + lora_mapping = LoRAMapping( + [0] * batch_size, + [0] * batch_size, + ) + self.set_active_loras(set(), lora_mapping) + graph_runner = self.graph_runner_class(self.model) + local_start_mem = HabanaMemoryProfiler.current_memory_usage() + capture_start = time.time() + desc = f'Capturing {graph_runner_name} for batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}, allocated {format_bytes(local_start_mem - start_mem)} device memory in total ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)' + pbar.set_description(desc) + logger.debug(f"[{idx}/{total_valid_hpugraphs}] {desc}...") + profiling_ctx = contextlib.nullcontext() if not (log_graph_compilation_all or log_graph_compilation) else metric_localcontext("graph_compilation") + with profiling_ctx as gc_local_metric: + graph_runner.capture( + input_tokens[:batch_size], + input_positions[:batch_size], + kv_caches, + attn_metadata, + ) + if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0) or log_graph_compilation_all: + logger.info(f"VLLM_HPU_STEP_GRAPH_COMPILATION: {gc_local_metric.stats()}, {graph_runner_name}; batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}") + self.graph_runners[(batch_size, block_count)] = graph_runner + capture_end = time.time() + local_end_mem = HabanaMemoryProfiler.current_memory_usage() + mem_usage_str = format_bytes(local_end_mem - local_start_mem) + graph_mem_usage_df[max_context_len][batch_size] = mem_usage_str + logger.debug(f"[{idx}/{total_valid_hpugraphs}] {desc}... done in {capture_end-capture_start:.2f} seconds! Took {mem_usage_str} of device memory ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") + + end_time = time.perf_counter() + elapsed_time = end_time - start_time + # This usually takes < 10 seconds. 
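# --- Editorial sketch (not part of the patch) --------------------------------
# How a decode step maps to a captured graph key (batch_size, block_count),
# as used by the self.graph_runners lookup in execute_model. The _pad helper
# mirrors _get_graph_batch_size/_get_graph_max_context_len defined at the end
# of this file; block_size=128 and the example sizes are assumptions.
import math

def _pad(value, small_buckets, alignment):
    for bucket in small_buckets:
        if value <= bucket:
            return bucket
    return (value + alignment - 1) // alignment * alignment

block_size = 128
batch_size, max_context_len = 3, 300
graph_batch_size = _pad(batch_size, (1, 2, 4, 8), 16)            # -> 4
graph_context_len = _pad(max_context_len, (32, 64, 128), 256)    # -> 512
key = (graph_batch_size, math.ceil(graph_context_len / block_size))
assert key == (4, 4)  # served by the graph captured for batch 4, 4 blocks
# ------------------------------------------------------------------------------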
+ end_mem = HabanaMemoryProfiler.current_memory_usage() + logger.info(f"Graph capturing finished in {elapsed_time:.0f} secs, allocated {format_bytes(end_mem - start_mem)} of device memory ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") + logger.info(f"Graph memory allocation summary (row: batch_size; col: max_context_len):") + logger.info(tabulate.tabulate(graph_mem_usage_df, tablefmt='mixed_outline', headers='keys', showindex="always")) + + def __del__(self) -> None: + # Delete the CUDA graphs before deleting the CuPy NCCL communicator. + # NOTE(woosuk): This is necessary because otherwise deadlocks can + # happen. + # FIXME(woosuk): This is a bit hacky. Find a more robust solution. + self.graph_runners.clear() + self.cupy_nccl_backend = None + + @property + def vocab_size(self) -> int: + return self.model_config.get_vocab_size() + + +class FakeHPUGraphRunner: + + def __init__(self, model: nn.Module): + self.model = model + + def capture( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> None: + return + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + return self.model( + input_ids, + positions, + kv_caches, + attn_metadata, + ) + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + +class FakeHPUGraphRunnerWithWarmup: + + def __init__(self, model: nn.Module): + self.model = model + + def capture( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> None: + htorch.core.mark_step() + out = self.model( + input_ids, + positions, + kv_caches, + attn_metadata, + ) + htorch.core.mark_step() + htorch.hpu.synchronize() + return + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + htorch.core.mark_step() + out = self.model( + input_ids, + positions, + kv_caches, + attn_metadata, + ) + htorch.core.mark_step() + return out + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) +class HPUGraphRunner: + + def __init__(self, model: nn.Module): + self.model = model + self.graph = None + self.input_buffers: Dict[str, torch.Tensor] = {} + self.output_buffers: Dict[str, torch.Tensor] = {} + + def capture( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> None: + assert self.graph is None + # Run the model once without capturing the graph. + # This is to make sure that the captured graph does not include the + # kernel launches for initial benchmarking (e.g., Triton autotune). + self.model( + input_ids, + positions, + kv_caches, + attn_metadata, + ) + htorch.hpu.synchronize() + + # Capture the graph. + # NOTE(woosuk): Python 3.8 does not support multi-line with statements. + # https://stackoverflow.com/questions/31039022/python-multi-line-with-statement + self.graph = htorch.hpu.HPUGraph() + with htorch.hpu.graph(self.graph): # noqa: SIM117 + hidden_states = self.model( + input_ids, + positions, + kv_caches, + attn_metadata, + ) + torch.hpu.synchronize() + + # Save the input and output buffers. 
+ self.input_buffers = { + "input_ids": input_ids, + "positions": positions, + "kv_caches": kv_caches, + "slot_mapping": attn_metadata.slot_mapping, + "context_lens": attn_metadata.context_lens, + "block_tables": attn_metadata.block_tables, + } + self.output_buffers = {"hidden_states": hidden_states} + return + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + # KV caches are fixed tensors, so we don't need to copy them. + del kv_caches + + # Copy the input tensors to the input buffers. + self.input_buffers["input_ids"].copy_(input_ids, non_blocking=True) + self.input_buffers["positions"].copy_(positions, non_blocking=True) + self.input_buffers["slot_mapping"].copy_(attn_metadata.slot_mapping, + non_blocking=True) + self.input_buffers["context_lens"].copy_(attn_metadata.context_lens, + non_blocking=True) + self.input_buffers["block_tables"].copy_(attn_metadata.block_tables, + non_blocking=True) + # Run the graph. + self.graph.replay() + + # Return the output tensor. + return self.output_buffers["hidden_states"] + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + +class ExperimentalHPUGraphRunner: + def __init__(self, model: nn.Module): + self.model = model + + def capture( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> None: + class ModelWrapper(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + self.attn_backend = get_attn_backend(torch.bfloat16) + def forward(self, input_ids, positions, kv_caches, slot_mapping, context_lens, block_tables): + wrapper_attn_metadata = self.attn_backend.make_metadata( + is_prompt=attn_metadata.is_prompt, + slot_mapping=slot_mapping, + prompt_lens=None, + prompt_lens_tensor=None, + num_prompt_tokens=0, + num_generation_tokens=attn_metadata.num_generation_tokens, + max_subquery_len=None, + max_context_len=attn_metadata.max_context_len, + max_prompt_len=None, + subquery_start_loc=None, + seq_start_loc=None, + context_lens=context_lens, + block_tables=block_tables, + use_cuda_graph=True, + kv_cache_dtype=attn_metadata.kv_cache_dtype, + ) + return self.model( + input_ids, + positions, + kv_caches, + wrapper_attn_metadata + ) + self.graph_model = htorch.hpu.wrap_in_hpu_graph(ModelWrapper(self.model)) + out = self.graph_model( + input_ids, + positions, + kv_caches, + attn_metadata.slot_mapping, + attn_metadata.context_lens, + attn_metadata.block_tables, + ) + htorch.hpu.synchronize() + return + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + out = self.graph_model( + input_ids, + positions, + kv_caches, + attn_metadata.slot_mapping, + attn_metadata.context_lens, + attn_metadata.block_tables, + ) + return out + + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + + +def _get_graph_batch_size(batch_size: int) -> int: + """Returns the padded batch size given actual batch size. + + Batch sizes are 1, 2, 4, _BATCH_SIZE_ALIGNMENT, + 2*_BATCH_SIZE_ALIGNMENT, 3*_BATCH_SIZE_ALIGNMENT... 
+ """ + if batch_size <= 2: + return batch_size + elif batch_size <= 4: + return 4 + elif batch_size <= 8: + return 8 + else: + return ((batch_size + _BATCH_SIZE_ALIGNMENT - 1) // + _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT) + + +def _get_graph_max_context_len(max_context_len: int) -> int: + """Returns the padded batch size given actual batch size. + + Batch sizes are 1, 2, 4, _BATCH_SIZE_ALIGNMENT, + 2*_BATCH_SIZE_ALIGNMENT, 3*_BATCH_SIZE_ALIGNMENT... + """ + if max_context_len <= 32: + return 32 + elif max_context_len <= 64: + return 64 + elif max_context_len <= 128: + return 128 + else: + return ((max_context_len + _MAX_CONTEXT_LEN_ALIGNMENT - 1) // + _MAX_CONTEXT_LEN_ALIGNMENT * _MAX_CONTEXT_LEN_ALIGNMENT) diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py new file mode 100644 index 0000000000000..bbfd7dad7f90a --- /dev/null +++ b/vllm/worker/habana_worker.py @@ -0,0 +1,263 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +############################################################################### + +import gc +import os +from typing import Dict, List, Optional, Set, Tuple + +import torch +import habana_frameworks.torch as htorch +import torch.distributed + +from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig) +from vllm.lora.request import LoRARequest +from vllm.model_executor import set_random_seed +from vllm.model_executor.parallel_utils.communication_op import ( + broadcast_tensor_dict) +from vllm.model_executor.parallel_utils.custom_all_reduce import init_custom_ar +from vllm.model_executor.parallel_utils.parallel_state import ( + ensure_model_parallel_initialized) +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.worker.cache_engine import CacheEngine +from vllm.worker.habana_model_runner import HabanaModelRunner + + +class HabanaWorker: + """A worker class that executes (a partition of) the model on a HPU. + + Each worker is associated with a single HPU. The worker is responsible for + maintaining the KV cache and executing the model on the HPU. In case of + distributed inference, each worker is assigned a partition of the model. + """ + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + local_rank: int, + rank: int, + distributed_init_method: str, + lora_config: Optional[LoRAConfig] = None, + kv_cache_dtype: Optional[str] = "auto", + is_driver_worker: bool = False, + ) -> None: + self.model_config = model_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.local_rank = local_rank + self.rank = rank + self.distributed_init_method = distributed_init_method + self.lora_config = lora_config + self.is_driver_worker = is_driver_worker + if self.is_driver_worker: + assert self.rank == 0, "The driver worker must have rank 0." + + self.model_runner = HabanaModelRunner(model_config, + parallel_config, + scheduler_config, + device_config, + lora_config=self.lora_config, + kv_cache_dtype=kv_cache_dtype, + is_driver_worker=is_driver_worker) + # Uninitialized cache engine. Will be initialized by + # self.init_cache_engine(). 
+ self.cache_config = None + self.cache_engine = None + self.hpu_cache = None + + def init_device(self) -> None: + if self.device_config.device.type == "hpu": + self.device = torch.device("hpu") + torch.hpu.set_device(self.device) + self.init_hpu_memory = torch.hpu.mem_get_info()[0] + else: + raise RuntimeError( + f"Not support device type: {self.device_config.device}") + # Initialize the distributed environment. + init_distributed_environment(self.parallel_config, self.rank, + self.distributed_init_method) + # Set random seed. + set_random_seed(self.model_config.seed) + + def load_model(self): + self.model_runner.load_model() + + @torch.inference_mode() + def profile_num_available_blocks( + self, + block_size: int, + hpu_memory_utilization: float, + cpu_swap_space: int, + cache_dtype: str, + ) -> Tuple[int, int]: + """Profiles the peak memory usage of the model and returns the maximum + number of HPU and CPU cache blocks that can be allocated. + + Args: + block_size: The size of the cache block. + hpu_memory_utilization: The fraction of the total HPU memory to use. + cpu_swap_space: The size of the CPU swap space in bytes. + """ + # Profile the memory usage of the model and get the maximum number of + # cache blocks that can be allocated with the remaining free memory. + + # Execute a forward pass with dummy inputs to profile the memory usage + # of the model. + self.model_runner.profile_run() + + # Calculate the number of blocks that can be allocated with the + # profiled peak memory. + torch.hpu.synchronize() + free_hpu_memory, total_hpu_memory = torch.hpu.mem_get_info() + # NOTE(woosuk): Here we assume that the other processes using the same + # HPU did not change their memory usage during the profiling. + peak_memory = self.init_hpu_memory - free_hpu_memory + assert peak_memory > 0, ( + "Error in memory profiling. This happens when the hpu memory was " + "not properly cleaned up before initializing the vLLM instance.") + + cache_block_size = self.get_cache_block_size_bytes( + block_size, cache_dtype) + num_hpu_blocks = int( + (total_hpu_memory * hpu_memory_utilization - peak_memory) // + cache_block_size) + num_cpu_blocks = int(cpu_swap_space // cache_block_size) + num_hpu_blocks = max(num_hpu_blocks, 0) + num_cpu_blocks = max(num_cpu_blocks, 0) + if self.model_runner.lora_manager: + self.model_runner.remove_all_loras() + gc.collect() + return num_hpu_blocks, num_cpu_blocks + + def init_cache_engine(self, cache_config: CacheConfig) -> None: + self.cache_config = cache_config + self.cache_engine = CacheEngine(self.cache_config, self.model_config, + self.parallel_config) + self.hpu_cache = self.cache_engine.gpu_cache + self.model_runner.set_block_size(self.cache_engine.block_size) + htorch.hpu.synchronize() # we want to materialize cache tensors before we proceed with graph capture/execution + + def warm_up_model(self) -> None: + if not self.model_config.enforce_eager: + self.model_runner.capture_model(self.hpu_cache) + # Reset the seed to ensure that the random state is not affected by + # the model initialization and profiling. + set_random_seed(self.model_config.seed) + + def cache_swap( + self, + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> None: + # Issue cache operations. + # TODO(woosuk): Profile swapping overhead and optimize if needed. 
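# --- Editorial sketch (not part of the patch) --------------------------------
# The block-count arithmetic from profile_num_available_blocks above, with
# illustrative (not measured) numbers: 96 GiB of device memory, 90%
# utilization, a 40 GiB profiling peak, and an 8 MiB KV-cache block (roughly
# a 16-token block for a 7B model in bf16).
GiB, MiB = 2**30, 2**20
total_hpu_memory = 96 * GiB
hpu_memory_utilization = 0.9
peak_memory = 40 * GiB
cache_block_size = 8 * MiB
num_hpu_blocks = max(
    int((total_hpu_memory * hpu_memory_utilization - peak_memory) // cache_block_size), 0)
assert num_hpu_blocks == 5939  # ~95K cacheable tokens at 16 tokens per block
# ------------------------------------------------------------------------------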
+ if blocks_to_swap_in: + self.cache_engine.swap_in(blocks_to_swap_in) + if blocks_to_swap_out: + self.cache_engine.swap_out(blocks_to_swap_out) + if blocks_to_copy: + self.cache_engine.copy(blocks_to_copy) + + @torch.inference_mode() + def execute_model( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None, + blocks_to_swap_in: Optional[Dict[int, int]] = None, + blocks_to_swap_out: Optional[Dict[int, int]] = None, + blocks_to_copy: Optional[Dict[int, List[int]]] = None, + ) -> Optional[SamplerOutput]: + if self.is_driver_worker: + assert seq_group_metadata_list is not None + num_seq_groups = len(seq_group_metadata_list) + assert blocks_to_swap_in is not None + assert blocks_to_swap_out is not None + assert blocks_to_copy is not None + data = { + "num_seq_groups": num_seq_groups, + "blocks_to_swap_in": blocks_to_swap_in, + "blocks_to_swap_out": blocks_to_swap_out, + "blocks_to_copy": blocks_to_copy, + } + broadcast_tensor_dict(data, src=0) + else: + data = broadcast_tensor_dict(src=0) + num_seq_groups = data["num_seq_groups"] + blocks_to_swap_in = data["blocks_to_swap_in"] + blocks_to_swap_out = data["blocks_to_swap_out"] + blocks_to_copy = data["blocks_to_copy"] + + self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) + + # If there is no input, we don't need to execute the model. + if num_seq_groups == 0: + return {} + + output = self.model_runner.execute_model(seq_group_metadata_list, + self.hpu_cache) + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + def remove_lora(self, lora_id: int) -> bool: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + def list_loras(self) -> Set[int]: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + @property + def max_model_len(self) -> int: + return self.model_config.max_model_len + + @property + def vocab_size(self) -> int: + return self.model_runner.vocab_size + + def get_cache_block_size_bytes(self, block_size: int, + cache_dtype: str) -> int: + """Get the size of the KV cache block size in bytes. + """ + return CacheEngine.get_cache_block_size(block_size, cache_dtype, + self.model_config, + self.parallel_config) + + +def init_distributed_environment( + parallel_config: ParallelConfig, + rank: int, + distributed_init_method: Optional[str] = None, +) -> None: + """Initialize the distributed environment.""" + if torch.distributed.is_initialized(): + torch_world_size = torch.distributed.get_world_size() + if torch_world_size != parallel_config.world_size: + raise RuntimeError( + "torch.distributed is already initialized but the torch world " + "size does not match parallel_config.world_size " + f"({torch_world_size} vs. {parallel_config.world_size}).") + elif not distributed_init_method: + raise ValueError( + "distributed_init_method must be set if torch.distributed " + "is not already initialized") + else: + torch.distributed.init_process_group( + backend="hccl", + world_size=parallel_config.world_size, + rank=rank, + init_method=distributed_init_method, + ) + + # A small all_reduce for warmup. + torch.distributed.all_reduce(torch.zeros(1).to('hpu')) + ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size) + + # Initialize a custom fast all-reduce implementation. 
+ if not parallel_config.disable_custom_all_reduce: + init_custom_ar() From 6963277d4364d7d98b354b44ddc5978bb9e85786 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 8 May 2024 20:15:22 +0300 Subject: [PATCH 002/341] adapt habana components to changed vllm apis --- vllm/attention/backends/habana_attn.py | 139 ++--- vllm/attention/ops/habana_paged_attn.py | 31 +- vllm/executor/habana_executor.py | 182 ++---- .../model_executor/layers/logits_processor.py | 2 +- vllm/worker/habana_model_runner.py | 565 +++++++++++++----- vllm/worker/habana_worker.py | 174 ++++-- 6 files changed, 669 insertions(+), 424 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 844dc92b315ac..909c2ad955f25 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -13,7 +13,8 @@ LowerTriangularMaskWithTensorBias) from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionMetadata) + AttentionMetadata, + AttentionMetadataPerStage) from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention, HabanaPagedAttentionMetadata) from vllm.logger import init_logger @@ -40,7 +41,7 @@ def get_kv_cache_shape( head_size: int, ) -> Tuple[int, ...]: return HabanaPagedAttention.get_kv_cache_shape(num_blocks, block_size, - num_kv_heads, head_size) + num_kv_heads, head_size) @staticmethod def swap_blocks( @@ -59,7 +60,7 @@ def copy_blocks( @dataclass -class HabanaAttentionMetadata(AttentionMetadata, HabanaPagedAttentionMetadata): +class HabanaAttentionMetadata(AttentionMetadataPerStage, HabanaPagedAttentionMetadata): """Metadata for HabanaAttentionbackend. NOTE: Any python object stored here is not updated when it is @@ -70,37 +71,24 @@ class HabanaAttentionMetadata(AttentionMetadata, HabanaPagedAttentionMetadata): # Currently, input sequences can only contain all prompts # or all decoding. True if all sequences are prompts. is_prompt: bool - # (num_tokens,). The indices of the token slots that input tokens will be - # stored into. E.g., if `slot_mapping` is [35, 2, 17] and the block size - # is 16, the three tokens are stored in the 3rd slot in block 2, 2nd slot - # in block 0, and 1st slot in block 1, respectively. - slot_mapping: torch.Tensor - # (batch_size,). The prompt length per sequence. None if it is a decoding. - prompt_lens: Optional[List[int]] - # prompt_lens stored as a tensor. - prompt_lens_tensor: Optional[torch.Tensor] - # The number of prompt tokens. Doesn't include padding. - num_prompt_tokens: int - # The number of generation tokens. Doesn't include padding. - num_generation_tokens: int - - # NOTE(sang): Definition of context_len, subquery_len, and seqlen. + # (batch_size,). The sequence length per sequence. Sequence length means + # the computed tokens + new tokens None if it is a decoding. + seq_lens: Optional[List[int]] + # seq_lens stored as a tensor. + seq_lens_tensor: Optional[torch.Tensor] + # |---------- N-1 iteration --------| # |---------------- N iteration ---------------------| # |- tokenA -|......................|-- newTokens ---| # |---------- context_len ----------| - # |-------------------- seqlen ----------------------| - # |- subquery_len -| - - # WARNING(sang): context_len has different definition depending on if it is - # prefill vs decoding. When it is prefill, it doesn't include new tokens. - # When it is for decoding, it includes a new token. 
+ # |-------------------- seq_len ----------------------| + # |-- query_len ---| - # Maximum subquery length in the batch. - max_subquery_len: Optional[int] + # Maximum query length in the batch. + max_query_len: Optional[int] # FIXME: It is for flash attn. - # Maximum prompt length in the batch. - max_prompt_len: Optional[int] + # Maximum sequence length in the batch. + max_seq_len: Optional[int] # (batch_size + 1,). The cumulative subquery lengths of the sequences in # the batch, used to index into subquery. E.g., if the subquery length # is [4, 6], it is [0, 4, 10]. @@ -110,6 +98,9 @@ class HabanaAttentionMetadata(AttentionMetadata, HabanaPagedAttentionMetadata): # the batch, used to index into sequence. E.g., if the sequence length is # [4, 6], it is [0, 4, 10]. seq_start_loc: Optional[torch.Tensor] + # (batch_size,) A tensor of context lengths (tokens that are computed + # so far). + context_lens_tensor: Optional[torch.Tensor] # Whether or not if cuda graph is enabled. # Cuda-graph is currently enabled for decoding only. @@ -128,12 +119,12 @@ def __post_init__(self): class HabanaAttentionImpl(AttentionImpl): """ If the input tensors contain prompt tokens, the layout is as follows: - |<--------------- num_prompt_tokens --------------->| - |<--prompt_0-->|<--prompt_1-->|...|<--prompt_N-1--->| + |<--------------- num_prefill_tokens ----------------->| + |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->| Otherwise, the layout is as follows: - |<------------------ num_generation_tokens (M) ----------------->| - |<--generation_0-->|..........|<--generation_M-1-->|<--padding-->| + |<----------------- num_decode_tokens ------------------>| + |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->| Generation tokens can contain padding when cuda-graph is used. Currently, prompt tokens don't contain any padding. @@ -175,7 +166,8 @@ def forward( key: torch.Tensor, value: torch.Tensor, kv_cache: Optional[torch.Tensor], - attn_metadata: HabanaAttentionMetadata, + attn_metadata: AttentionMetadata[HabanaAttentionMetadata], + kv_scale: float, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. @@ -191,7 +183,6 @@ def forward( batch_size, seq_len, hidden_size = query.shape _, seq_len_kv, _ = key.shape - query = query.view(-1, self.num_heads, self.head_size) key = key.view(-1, self.num_kv_heads, self.head_size) value = value.view(-1, self.num_kv_heads, self.head_size) @@ -203,14 +194,14 @@ def forward( # If kv_cache is not provided, the new key and value tensors are # not cached. This happens during the initial memory profiling run. HabanaPagedAttention.write_to_paged_cache(key, value, key_cache, - value_cache, - attn_metadata.slot_mapping, - attn_metadata.kv_cache_dtype, - attn_metadata.is_prompt) + value_cache, + attn_metadata.slot_mapping, + attn_metadata.kv_cache_dtype, + attn_metadata.prefill_metadata is not None) - if attn_metadata.is_prompt: + if prefill_meta := attn_metadata.prefill_metadata: # Prompt run. - if kv_cache is None or attn_metadata.block_tables.numel() == 0: + if kv_cache is None or prefill_meta.block_tables.numel() == 0: # normal attention. # block tables are empty if the prompt does not have a cached # prefix. 
@@ -232,16 +223,16 @@ def forward( self.num_queries_per_kv, value.shape[-1]) - if attn_metadata.attn_bias is None: + if prefill_meta.attn_bias is None: if self.alibi_slopes is None: attn_bias = BlockDiagonalCausalMask.from_seqlens( [seq_len] * batch_size) if self.sliding_window is not None: attn_bias = attn_bias.make_local_attention( self.sliding_window) - attn_metadata.attn_bias = attn_bias + prefill_meta.attn_bias = attn_bias else: - attn_metadata.attn_bias = _make_alibi_bias( + prefill_meta.attn_bias = _make_alibi_bias( self.alibi_slopes, self.num_kv_heads, batch_size, seq_len, query.dtype) query_shape = (batch_size, seq_len, self.num_kv_heads, self.num_queries_per_kv, self.head_size) if self.num_kv_heads != self.num_heads else (batch_size, seq_len, self.num_heads, self.head_size) @@ -250,7 +241,7 @@ def forward( query.view(query_shape), key.view(kv_shape), value.view(kv_shape), - attn_bias=attn_metadata.attn_bias, + attn_bias=prefill_meta.attn_bias, p=0.0, scale=self.scale, ) @@ -263,26 +254,27 @@ def forward( value, key_cache, value_cache, - attn_metadata.block_tables, - attn_metadata.subquery_start_loc, - attn_metadata.prompt_lens_tensor, - attn_metadata.context_lens, - attn_metadata.max_subquery_len, + prefill_meta.block_tables, + prefill_meta.subquery_start_loc, + prefill_meta.seq_lens_tensor, + prefill_meta.context_lens_tensor, + prefill_meta.max_query_len, self.alibi_slopes, ) - else: + if decode_meta := attn_metadata.decode_metadata: # Decoding run. output = HabanaPagedAttention.forward_decode( query, key_cache, value_cache, - attn_metadata.block_tables, - attn_metadata.context_lens, - attn_metadata.max_context_len, + decode_meta.block_tables, + decode_meta.seq_lens_tensor, + decode_meta.max_seq_len, attn_metadata.kv_cache_dtype, self.num_kv_heads, self.scale, self.alibi_slopes, + kv_scale ) # Reshape the output tensor. @@ -293,13 +285,13 @@ def _make_alibi_bias( alibi_slopes: torch.Tensor, num_kv_heads: int, dtype: torch.dtype, - prompt_lens: List[int], + seq_lens: List[int], ) -> LowerTriangularMaskWithTensorBias: attn_biases = [] - for prompt_len in prompt_lens: - bias = torch.arange(prompt_len, dtype=dtype) + for seq_len in seq_lens: + bias = torch.arange(seq_len, dtype=dtype) # NOTE(zhuohan): HF uses - # `bias = bias[None, :].repeat(prompt_len, 1)` + # `bias = bias[None, :].repeat(seq_len, 1)` # here. We find that both biases give the same results, but # the bias below more accurately follows the original ALiBi # paper. @@ -307,46 +299,19 @@ def _make_alibi_bias( # element. 
bias = bias[None, :] - bias[:, None] - padded_len = (prompt_len + 7) // 8 * 8 + padded_len = (seq_len + 7) // 8 * 8 num_heads = alibi_slopes.shape[0] bias = torch.empty( 1, # batch size num_heads, - prompt_len, + seq_len, padded_len, device=alibi_slopes.device, dtype=dtype, - )[:, :, :, :prompt_len].copy_(bias) + )[:, :, :, :seq_len].copy_(bias) bias.mul_(alibi_slopes[:, None, None]) if num_heads != num_kv_heads: bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads)) attn_biases.append(LowerTriangularMaskWithTensorBias(bias)) return attn_biases - - -def _naive_masked_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - num_heads: int, - num_kv_heads: int, - head_size: int, - scale: float, -) -> torch.Tensor: - query = query.view(-1, num_heads, head_size) - key = key.view(-1, num_kv_heads, head_size) - value = value.view(-1, num_kv_heads, head_size) - seq_len, _, _ = query.shape - attn_mask = torch.triu(torch.ones(seq_len, - seq_len, - dtype=query.dtype, - device=query.device), - diagonal=1) - attn_mask = attn_mask * torch.finfo(query.dtype).min - - attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float() - attn_weights = attn_weights + attn_mask.float() - attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) - out = torch.einsum("hqk,khd->qhd", attn_weights, value) - return out diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py index 03027bb01565c..8dc79f17f8c9c 100644 --- a/vllm/attention/ops/habana_paged_attn.py +++ b/vllm/attention/ops/habana_paged_attn.py @@ -16,17 +16,11 @@ @dataclass class HabanaPagedAttentionMetadata: """Metadata for PagedAttention.""" - # (num_tokens,). The indices of the token slots that input tokens will be - # stored into. E.g., if `slot_mapping` is [35, 2, 17] and the block size - # is 16, the three tokens are stored in the 3rd slot in block 2, 2nd slot - # in block 0, and 1st slot in block 1, respectively. - slot_mapping: torch.Tensor - # (batch_size,). The length of context (tokens stored in KV cache) per - # sequence. WARNING: When it is a prefill request, it doesn't include new - # tokens. When it is for decoding, it includes a new token. - context_lens: Optional[torch.Tensor] - # Maximum context length in the batch. - max_context_len: Optional[int] + # (batch_size,). The length of sequences (entire tokens seen so far) per + # sequence. + seq_lens_tensor: Optional[torch.Tensor] + # Maximum sequence length in the batch. + max_seq_len: Optional[int] # (batch_size, max_blocks_per_seq). # Block addresses per sequence. (Seq id -> list of physical block) # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks @@ -34,7 +28,6 @@ class HabanaPagedAttentionMetadata: # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph # captured. 
block_tables: Optional[torch.Tensor] - kv_cache_dtype: str class HabanaPagedAttention: @@ -88,12 +81,13 @@ def forward_decode( key_cache: torch.Tensor, value_cache: torch.Tensor, block_tables: torch.Tensor, - context_lens: torch.Tensor, - max_context_len: int, + seq_lens: torch.Tensor, + max_seq_len: int, kv_cache_dtype: str, num_kv_heads: int, scale: float, alibi_slopes: Optional[torch.Tensor], + kv_scale: float, ) -> torch.Tensor: block_size = value_cache.shape[3] return ops.paged_attention_v1( @@ -103,9 +97,9 @@ def forward_decode( num_kv_heads, scale, block_tables, - context_lens, + seq_lens, block_size, - max_context_len, + max_seq_len, alibi_slopes, kv_cache_dtype, ) @@ -119,10 +113,11 @@ def forward_prefix( value_cache: torch.Tensor, block_tables: torch.Tensor, subquery_start_loc: torch.Tensor, - prompt_lens_tensor: torch.Tensor, + seq_lens_tensor: torch.Tensor, context_lens: torch.Tensor, - max_subquery_len: int, + max_query_len: int, alibi_slopes: Optional[torch.Tensor], + sliding_window: Optional[int], ) -> torch.Tensor: raise NotImplementedError("forward_prefix is not implemented for HabanaPagedAttention") diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index dd211eadbea78..cc035f397aa6d 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -2,119 +2,90 @@ # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### -from typing import Dict, List, Optional - -from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig) +from typing import Any, Dict, List, Optional, Set, Tuple from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase -from vllm.executor.utils import check_block_size_valid from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, make_async, HabanaMemoryProfiler, format_bytes) import os import contextlib +from vllm.worker.worker_base import WorkerWrapperBase + logger = init_logger(__name__) class HabanaExecutor(ExecutorBase): - - def __init__( - self, - model_config: ModelConfig, - cache_config: CacheConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - ) -> None: - self.model_config = model_config - self.cache_config = cache_config - self.lora_config = lora_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - - # Instantiate the worker and load the model to GPU. + def _init_executor(self) -> None: + """Initialize the worker and load the model.""" self._init_worker() - # Profile the memory usage and initialize the cache. 
- self._init_cache() - - def _init_worker(self): - # Lazy import the Worker to avoid importing torch.cuda/xformers - # before CUDA_VISIBLE_DEVICES is set in the Worker - from vllm.worker.habana_worker import HabanaWorker - - assert self.parallel_config.world_size == 1, ( - "HabanaExecutor only supports single GPU.") - - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - self.driver_worker = HabanaWorker( - self.model_config, - self.parallel_config, - self.scheduler_config, - self.device_config, - local_rank=0, - rank=0, + def _get_worker_kwargs( + self, + local_rank: int = 0, + rank: int = 0, + distributed_init_method: Optional[str] = None) -> Dict[str, Any]: + """Return worker init args for a given rank.""" + if distributed_init_method is None: + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + return dict( + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, + load_config=self.load_config, + local_rank=local_rank, + rank=rank, distributed_init_method=distributed_init_method, lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=True, + vision_language_config=self.vision_language_config, + is_driver_worker=rank == 0, ) - self.driver_worker.init_device() - self.driver_worker.load_model() - def _init_cache(self) -> None: - """Profiles the memory usage and initializes the KV cache. + def _create_worker(self, + local_rank: int = 0, + rank: int = 0, + distributed_init_method: Optional[str] = None): + wrapper = WorkerWrapperBase( + worker_module_name="vllm.worker.habana_worker", + worker_class_name="HabanaWorker", + ) + wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank, + distributed_init_method)) + return wrapper.worker + def _init_worker(self): + assert self.parallel_config.world_size == 1, ( + "GPUExecutor only supports single GPU.") - The engine first profiles the existing memory usage. - Then, it allocates the remaining memory for KV blocks. + self.driver_worker = self._create_worker() + self.driver_worker.init_device() + self.driver_worker.load_model() + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Determine the number of available KV blocks by invoking the + underlying worker. + """ + return self.driver_worker.determine_num_available_blocks() - .. tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. + def initialize_cache(self, num_gpu_blocks : int, num_cpu_blocks) -> None: + """Initialize the KV cache by invoking the underlying worker. """ - # Get the maximum number of blocks that can be allocated on GPU and CPU. - num_gpu_blocks, num_cpu_blocks = ( - self.driver_worker.profile_num_available_blocks( - block_size=self.cache_config.block_size, - hpu_memory_utilization=self.cache_config. - gpu_memory_utilization, - cpu_swap_space=self.cache_config.swap_space_bytes, - cache_dtype=self.cache_config.cache_dtype, - )) - - logger.info(f"# HPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") - - check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, - self.model_config.max_model_len) - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - # Initialize the cache. + # NOTE: This is logged in the executor because there can be >1 worker + # with other executors. 
We could log in the engine level, but work + # remains to abstract away the device for non-GPU configurations. + logger.info("# HPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, + num_cpu_blocks) + with HabanaMemoryProfiler() as cache_init_m: - self.driver_worker.init_cache_engine(cache_config=self.cache_config) + self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) logger.info(f"init_cache_engine took " f"{format_bytes(cache_init_m.consumed_memory)} ({cache_init_m.consumed_memory/HabanaMemoryProfiler.total_memory():.2%} of total memory, gpu_memory_utilization: {self.cache_config.gpu_memory_utilization}, {format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") - # Warm up the model. This includes capturing the model into CUDA graph - # if enforce_eager is False. - with HabanaMemoryProfiler() as warmup_m: - self.driver_worker.warm_up_model() - logger.info(f"Model warmup took " - f"{format_bytes(warmup_m.consumed_memory)} ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") - - def execute_model(self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: - + def execute_model( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any @@ -132,12 +103,7 @@ def execute_model(self, gc_ctx = metric_localcontext("graph_compilation") if log_graph_compilation else contextlib.nullcontext() cpu_fallback_ctx = metric_localcontext("cpu_fallback") if log_cpu_fallbacks else contextlib.nullcontext() with gc_ctx as gc_local_metric, cpu_fallback_ctx as cpu_fallback_local_metric: - output = self.driver_worker.execute_model( - seq_group_metadata_list=seq_group_metadata_list, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - ) + output = self.driver_worker.execute_model(execute_model_req) if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0) or log_graph_compilation_all: logger.warning(f"VLLM_HPU_STEP_GRAPH_COMPILATION: {gc_local_metric.stats()}, {input_stats}") if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] > 0) or log_cpu_fallbacks_all: @@ -145,12 +111,7 @@ def execute_model(self, return output - output = self.driver_worker.execute_model( - seq_group_metadata_list=seq_group_metadata_list, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - ) + output = self.driver_worker.execute_model(execute_model_req) return output def add_lora(self, lora_request: LoRARequest) -> bool: @@ -172,19 +133,8 @@ class HabanaExecutorAsync(HabanaExecutor, ExecutorAsyncBase): async def execute_model_async( self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - ) -> SamplerOutput: - output = await make_async(self.driver_worker.execute_model)( - 
seq_group_metadata_list=seq_group_metadata_list, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy) + execute_model_req: ExecuteModelRequest, + ) -> List[SamplerOutput]: + output = await make_async(self.driver_worker.execute_model + )(execute_model_req=execute_model_req, ) return output - - async def check_health_async(self) -> None: - # GPUExecutor will always be healthy as long as - # it's running. - return diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 40ba2130ccd9e..605009e8f695c 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -50,7 +50,7 @@ def forward( # Get the logits for the next tokens. logits = self._get_logits(hidden_states, embedding, embedding_bias) - if logits is not None and sampling_metadata.perform_sampling: + if logits is not None: # and sampling_metadata.perform_sampling: FIXME: this is needed for 8xHPU logits *= self.scale # Apply logits processors (if any). diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 7ced639a7fb03..a25a09c2598fd 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -4,7 +4,8 @@ import contextlib import time -from typing import Dict, List, Optional, Set, Tuple +from enum import IntEnum +from typing import Dict, List, NamedTuple, Optional, Set, Tuple # for logging hpugraph capture import tqdm @@ -21,20 +22,18 @@ import habana_frameworks.torch as htorch from habana_frameworks.torch.hpu.metrics import metric_localcontext -from vllm.attention import AttentionMetadata, get_attn_backend -from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, - SchedulerConfig) +from vllm.attention import (AttentionMetadata, AttentionMetadataPerStage, + get_attn_backend) +from vllm.config import (DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig, VisionLanguageConfig) +from vllm.distributed import broadcast_tensor_dict +from vllm.distributed.device_communicators import custom_all_reduce from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata from vllm.model_executor.model_loader import get_model -from vllm.model_executor.parallel_utils import cupy_utils, custom_all_reduce -from vllm.model_executor.parallel_utils.communication_op import ( - broadcast_tensor_dict) -from vllm.model_executor.parallel_utils.parallel_state import ( - with_cupy_nccl_for_all_reduce) from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata from vllm.utils import (HabanaMemoryProfiler, async_tensor_h2d, @@ -59,6 +58,66 @@ ] +class PreparePromptMetadata(NamedTuple): + input_tokens: List[int] + input_positions: List[int] + attn_metadata: Optional[AttentionMetadataPerStage] + seq_lens: List[int] + query_lens: List[int] + lora_index_mapping: List[int] + lora_prompt_mapping: List[int] + lora_requests: Set[LoRARequest] + multi_modal_input: Optional[torch.Tensor] + slot_mapping: List[int] + + @classmethod + def empty(cls): + return PreparePromptMetadata( + input_tokens=[], + input_positions=[], + attn_metadata=None, + seq_lens=[], + query_lens=[], + lora_index_mapping=[], + lora_prompt_mapping=[], + lora_requests=set(), + 
multi_modal_input=None, + slot_mapping=[], + ) + + +class PrepareDecodeMetadata(NamedTuple): + input_tokens: List[int] + input_positions: List[int] + attn_metadata: Optional[AttentionMetadata] + lora_index_mapping: List[int] + lora_prompt_mapping: List[int] + lora_requests: Set[LoRARequest] + slot_mapping: List[int] + + @classmethod + def empty(cls): + return PrepareDecodeMetadata( + input_tokens=[], + input_positions=[], + attn_metadata=None, + lora_index_mapping=[], + lora_prompt_mapping=[], + lora_requests=set(), + slot_mapping=[], + ) + + +# How batches are constructed. +class BatchType(IntEnum): + # Every batch is prefill. + PREFILL = 0 + # Every batch is decode. + DECODE = 1 + # Batch is a mixture of prefill and decode. + MIXED = 2 + + class HabanaModelRunner: def __init__( @@ -67,14 +126,17 @@ def __init__( parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, device_config: DeviceConfig, + load_config: LoadConfig, lora_config: Optional[LoRAConfig], kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, + vision_language_config: Optional[VisionLanguageConfig] = None, ): self.model_config = model_config self.parallel_config = parallel_config self.scheduler_config = scheduler_config self.lora_config = lora_config + self.load_config = load_config self.is_driver_worker = is_driver_worker # model_config can be None in tests/samplers/test_sampler.py. @@ -85,35 +147,45 @@ def __init__( if device_config is not None else DeviceConfig()) self.device = self.device_config.device - self.model = None - self.block_size = None # Set after initial profiling. - self.lora_manager = None + # Set after load_model. + self.lora_manager: LRUCacheWorkerLoRAManager = None + self.graph_runner_class = HPUGraphRunner self.graph_runners: Dict[Tuple[int, int], self.graph_runner_class] = {} - self.max_context_len_to_capture = ( - self.model_config.max_context_len_to_capture - if self.model_config is not None else 0) + self.max_seq_len_to_capture = (self.model_config.max_seq_len_to_capture + if self.model_config is not None else 0) + + self.pin_memory = is_pin_memory_available() + self.kv_cache_dtype = kv_cache_dtype + self.vision_language_config = vision_language_config + + self.attn_backend = get_attn_backend( + self.model_config.dtype if model_config is not None else None) + + # Lazy initialization + self.model: torch.nn.Module # Set after load_model + self.block_size: int # Set after initial profiling. # When using CUDA graph, the input block tables must be padded to - # max_context_len_to_capture. However, creating the block table in + # max_seq_len_to_capture. However, creating the block table in # Python can be expensive. To optimize this, we cache the block table # in numpy and only copy the actual input content at every iteration. # The shape of the cached block table will be # (max batch size to capture, max context len to capture / block size). - self.graph_block_tables = None # Set after initial profiling. - self.pin_memory = is_pin_memory_available() - self.kv_cache_dtype = kv_cache_dtype + self.graph_block_tables: torch.Tensor # Set after initial profiling. 
- self.attn_backend = get_attn_backend( - self.model_config.dtype if model_config is not None else None) def load_model(self) -> None: with HabanaMemoryProfiler() as m: - self.model = get_model(self.model_config, - self.device_config, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config) + self.model = get_model( + model_config=self.model_config, + device_config=self.device_config, + load_config=self.load_config, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + ) self.model_memory_usage = m.consumed_memory logger.info(f"Loading model weights took " @@ -144,14 +216,12 @@ def set_block_size(self, block_size: int) -> None: def get_max_block_per_batch(self) -> int: block_size = self.block_size - return (self.max_context_len_to_capture + block_size - 1) // block_size + return (self.max_seq_len_to_capture + block_size - 1) // block_size def _prepare_prompt( self, seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], - List[int], List[int], List[int], Set[LoRARequest]]: - assert len(seq_group_metadata_list) > 0 + ) -> PreparePromptMetadata: input_tokens: List[List[int]] = [] input_positions: List[List[int]] = [] slot_mapping: List[List[int]] = [] @@ -159,78 +229,107 @@ def _prepare_prompt( lora_prompt_mapping: List[List[int]] = [] lora_requests: Set[LoRARequest] = set() - prompt_lens: List[int] = [] + seq_lens: List[int] = [] context_lens: List[int] = [] - subquery_lens: List[int] = [] + query_lens: List[int] = [] prefix_block_tables: List[List[int]] = [] + multi_modal_input_list: List[torch.Tensor] = [] + + if len(seq_group_metadata_list) == 0: + return PreparePromptMetadata.empty() + for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt seq_ids = list(seq_group_metadata.seq_data.keys()) assert len(seq_ids) == 1 seq_id = seq_ids[0] + computed_block_nums = seq_group_metadata.computed_block_nums + if (self.scheduler_config is not None + and self.scheduler_config.chunked_prefill_enabled + and not (computed_block_nums is None + or computed_block_nums == [])): + raise RuntimeError( + "chunked prefill cannot be used with prefix caching " + "now.") + + token_chunk_size = seq_group_metadata.token_chunk_size seq_data = seq_group_metadata.seq_data[seq_id] - prompt_tokens = seq_data.get_token_ids() - prompt_len = len(prompt_tokens) - prompt_lens.append(prompt_len) - computed_len = 0 + context_len = seq_data.get_num_computed_tokens() + # We should use get_len here because in case of preemption + # it contains output tokens. + seq_len = min(seq_data.get_len(), context_len + token_chunk_size) + prompt_tokens = seq_data.get_token_ids()[context_len:seq_len] + seq_lens.append(seq_len) # NOTE: This only works for oooooooxxx style attention. 
- computed_block_nums = seq_group_metadata.computed_block_nums if computed_block_nums is not None and len( computed_block_nums) > 0 and self.sliding_window is None: # Prefix is not supported with sliding_window - computed_len = len(computed_block_nums) * self.block_size - prompt_tokens = prompt_tokens[computed_len:] + context_len = len(computed_block_nums) * self.block_size + prompt_tokens = prompt_tokens[context_len:] prefix_block_tables.append(computed_block_nums) - context_len = computed_len + elif self.scheduler_config.chunked_prefill_enabled: + if seq_group_metadata.block_tables is not None: + # Prefill has chunked before. + block_table = seq_group_metadata.block_tables[seq_id] + prefix_block_tables.append(block_table) + else: + # The first prefill. + prefix_block_tables.append([]) else: prefix_block_tables.append([]) - context_len = 0 + # Right now, prefill start is always 0. However, this + # assumption can be changed once chunked prefill is introduced. + assert context_len == 0 + # actual prompt lens context_lens.append(context_len) - if computed_len != 0: + if context_len != 0: import pdb; pdb.set_trace() # what happens if we hit that path?? - subquery_lens.append(prompt_len - computed_len) + query_lens.append(seq_len - context_len) input_tokens.append(prompt_tokens) # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first token in the sequence. - input_positions.append( - list(range(computed_len, computed_len + len(prompt_tokens)))) - + input_positions.append(list(range(context_len, seq_len))) lora_id = seq_group_metadata.lora_int_id if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) - lora_index_mapping += [lora_id] * (prompt_len - computed_len) + lora_index_mapping += [lora_id] * (seq_len - context_len) lora_prompt_mapping.append( [lora_id] * - (prompt_len - computed_len + (seq_len - context_len if seq_group_metadata.sampling_params.prompt_logprobs else 1)) + if seq_group_metadata.multi_modal_data: + multi_modal_input_list.append( + seq_group_metadata.multi_modal_data.data) + if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized # yet. In this case, we just use a dummy slot mapping. - slot_mapping.append([_PAD_SLOT_ID] * prompt_len) + slot_mapping.append([_PAD_SLOT_ID] * seq_len) continue # Compute the slot mapping. slot_mapping.append([]) block_table = seq_group_metadata.block_tables[seq_id] + # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, - # where start_idx is max(0, prompt_len - sliding_window). + # where start_idx is max(0, seq_len - sliding_window). # For example, if the prompt len is 10, sliding window is 8, and # block size is 4, the first two tokens are masked and the slot # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. 
start_idx = 0 if self.sliding_window is not None: - assert computed_len == 0, ( + assert context_len == 0, ( "Prefix caching is currently not supported with " "sliding window attention") - start_idx = max(0, prompt_len - self.sliding_window) - for i in range(computed_len, prompt_len): + start_idx = max(0, seq_len - self.sliding_window) + for i in range(context_len, seq_len): if i < start_idx: slot_mapping[-1].append(_PAD_SLOT_ID) continue @@ -240,18 +339,25 @@ def _prepare_prompt( slot = block_number * self.block_size + block_offset slot_mapping[-1].append(slot) - max_subquery_len = max(subquery_lens) - max_prompt_len = max(prompt_lens) - num_prompt_tokens = len(input_tokens) - assert max_subquery_len > 0 - - lora_index_mapping = lora_index_mapping + max_query_len = max(query_lens) + max_seq_len = max(seq_lens) + assert max_query_len > 0 context_lens_tensor = torch.tensor(context_lens, dtype=torch.int, device=self.device) + + if multi_modal_input_list: + assert self.vision_language_config, ( + "Multi-modal inputs are only supported by " + "vision language models.") + multi_modal_input = torch.cat(multi_modal_input_list, + dim=0).to(self.device) + else: + multi_modal_input = None + max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) - max_prompt_len = max(prompt_lens) + max_prompt_len = max(seq_lens) input_tokens = make_tensor_with_pad(input_tokens, max_prompt_len, pad=0, @@ -271,6 +377,7 @@ def _prepare_prompt( device=self.device) # Prepare prefix block tables + max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) block_tables = make_tensor_with_pad( prefix_block_tables, max_len=max_prompt_block_table_len, @@ -281,67 +388,72 @@ def _prepare_prompt( # Query length can be shorter than key (i.e., prompt) when prefill # is chunked or prefix cached. 
- subquery_lens_tensor = torch.tensor(subquery_lens, - dtype=torch.long, - device=self.device) - subquery_start_loc = torch.zeros(subquery_lens_tensor.shape[0] + 1, + query_lens_tensor = torch.tensor(query_lens, + dtype=torch.long, + device=self.device) + subquery_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1, dtype=torch.int32, device=self.device) - prompt_lens_tensor = torch.tensor(prompt_lens, - dtype=torch.long, - device=self.device) - seq_start_loc = torch.zeros(prompt_lens_tensor.shape[0] + 1, + seq_lens_tensor = torch.tensor(seq_lens, + dtype=torch.long, + device=self.device) + seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1, dtype=torch.int32, device=self.device) - torch.cumsum(subquery_lens_tensor, + torch.cumsum(query_lens_tensor, dim=0, dtype=subquery_start_loc.dtype, out=subquery_start_loc[1:]) - torch.cumsum(prompt_lens_tensor, + torch.cumsum(seq_lens_tensor, dim=0, dtype=seq_start_loc.dtype, out=seq_start_loc[1:]) attn_metadata = self.attn_backend.make_metadata( is_prompt=True, - slot_mapping=slot_mapping, - prompt_lens=prompt_lens, - prompt_lens_tensor=prompt_lens_tensor, - num_prompt_tokens=num_prompt_tokens, - num_generation_tokens=0, - max_subquery_len=max_subquery_len, - max_context_len=None, - max_prompt_len=max_prompt_len, + seq_lens=seq_lens, + seq_lens_tensor=seq_lens_tensor, + max_query_len=max_query_len, + max_seq_len=max_seq_len, subquery_start_loc=subquery_start_loc, seq_start_loc=seq_start_loc, - context_lens=context_lens_tensor, + context_lens_tensor=context_lens_tensor, block_tables=block_tables, use_cuda_graph=False, - kv_cache_dtype=self.kv_cache_dtype, ) - return (input_tokens, input_positions, attn_metadata, prompt_lens, - subquery_lens, lora_index_mapping, lora_prompt_mapping, - lora_requests) - + return PreparePromptMetadata( + input_tokens=input_tokens, + input_positions=input_positions, + attn_metadata=attn_metadata, + seq_lens=seq_lens, + query_lens=query_lens, + lora_index_mapping=lora_index_mapping, + lora_prompt_mapping=lora_prompt_mapping, + lora_requests=lora_requests, + multi_modal_input=multi_modal_input, + slot_mapping=slot_mapping, + ) def _prepare_decode( self, seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], - List[int], Set[LoRARequest]]: - assert len(seq_group_metadata_list) > 0 + ) -> PrepareDecodeMetadata: input_tokens: List[List[int]] = [] input_positions: List[List[int]] = [] slot_mapping: List[List[int]] = [] - context_lens: List[int] = [] + seq_lens: List[int] = [] block_tables: List[List[int]] = [] lora_index_mapping: List[int] = [] lora_prompt_mapping: List[int] = [] lora_requests: Set[LoRARequest] = set() + if len(seq_group_metadata_list) == 0: + return PrepareDecodeMetadata.empty() + for seq_group_metadata in seq_group_metadata_list: assert not seq_group_metadata.is_prompt + assert seq_group_metadata.token_chunk_size == 1 seq_ids = list(seq_group_metadata.seq_data.keys()) lora_id = seq_group_metadata.lora_int_id @@ -358,9 +470,9 @@ def _prepare_decode( position = seq_len - 1 input_positions.append([position]) - context_len = seq_len if self.sliding_window is None else min( + seq_len = seq_len if self.sliding_window is None else min( seq_len, self.sliding_window) - context_lens.append(context_len) + seq_lens.append(seq_len) block_table = seq_group_metadata.block_tables[seq_id] block_number = block_table[position // self.block_size] @@ -380,11 +492,11 @@ def _prepare_decode( # See `capture_model` API for more details. 
# For decoding requests, batch_size == input_tokens. batch_size = len(input_tokens) - max_context_len = max(context_lens) + max_seq_len = max(seq_lens) use_captured_graph = ( not self.model_config.enforce_eager and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1] - and max_context_len <= self.max_context_len_to_capture) + and max_seq_len <= self.max_seq_len_to_capture) if use_captured_graph: graph_batch_size = _get_graph_batch_size(batch_size) assert graph_batch_size >= batch_size @@ -392,7 +504,7 @@ def _prepare_decode( input_tokens.append([0]) input_positions.append([0]) slot_mapping.append([_PAD_SLOT_ID]) - context_lens.append(1) + seq_lens.append(1) block_tables.append([]) lora_index_mapping.append(0) batch_size = graph_batch_size @@ -406,24 +518,23 @@ def _prepare_decode( slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=self.device) - context_lens = torch.tensor(context_lens, - dtype=torch.int, - device=self.device) + seq_lens_tensor = torch.tensor(seq_lens, + dtype=torch.int, + device=self.device) if use_captured_graph: # When using cuda-graph all these tensors should be # padded. - assert context_lens.shape[0] == input_tokens.shape[0] - assert context_lens.shape[0] == input_positions.shape[0] - assert context_lens.shape[0] == slot_mapping.shape[0] + assert seq_lens_tensor.shape[0] == len(input_tokens) + assert seq_lens_tensor.shape[0] == len(input_positions) + assert seq_lens_tensor.shape[0] == len(slot_mapping) # The shape of graph_block_tables is # [max batch size, max context len // block size]. - graph_max_context_len = _get_graph_max_context_len(max_context_len) - assert graph_max_context_len >= max_context_len - graph_block_count = math.ceil(graph_max_context_len / self.block_size) + graph_max_seq_len = _get_graph_max_context_len(max_seq_len) + assert graph_max_seq_len >= max_seq_len + graph_block_count = math.ceil(graph_max_seq_len / self.block_size) input_block_tables = self.graph_block_tables[:batch_size, :graph_block_count] - for i, block_table in enumerate(block_tables): if block_table: input_block_tables[i, :len(block_table)] = block_table @@ -438,26 +549,28 @@ def _prepare_decode( dtype=torch.int, device=self.device, ) - attn_metadata = self.attn_backend.make_metadata( is_prompt=False, - slot_mapping=slot_mapping, - prompt_lens=None, - prompt_lens_tensor=None, - num_prompt_tokens=0, - num_generation_tokens=len(input_tokens), - max_subquery_len=None, - max_context_len=max_context_len, - max_prompt_len=None, + seq_lens=None, + seq_lens_tensor=seq_lens_tensor, + max_query_len=None, + max_seq_len=max_seq_len, subquery_start_loc=None, seq_start_loc=None, - context_lens=context_lens, + context_lens_tensor=None, block_tables=block_tables, use_cuda_graph=use_captured_graph, - kv_cache_dtype=self.kv_cache_dtype, ) - return (input_tokens, input_positions, attn_metadata, - lora_index_mapping, lora_prompt_mapping, lora_requests) + return PrepareDecodeMetadata( + input_tokens=input_tokens, + input_positions=input_positions, + attn_metadata=attn_metadata, + lora_index_mapping=lora_index_mapping, + lora_prompt_mapping=lora_prompt_mapping, + lora_requests=lora_requests, + slot_mapping=slot_mapping, + ) + def _prepare_sample( self, @@ -558,6 +671,164 @@ def _prepare_sample( return sampling_metadata def prepare_input_tensors( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, + Set[LoRARequest], LoRAMapping, torch.Tensor]: + if self.is_driver_worker: + prefill_reqs = [] + decode_reqs = 
[] + for seq_group_meta in seq_group_metadata_list: + if seq_group_meta.is_prompt: + prefill_reqs.append(seq_group_meta) + else: + decode_reqs.append(seq_group_meta) + + # Prepare input tensors. + ( + input_tokens, + input_positions, + prefill_attn_metadata, + seq_lens, + query_lens, + lora_index_mapping, + lora_prompt_mapping, + lora_requests, + multi_modal_input, + slot_mapping, + ) = self._prepare_prompt(prefill_reqs) + ( + decode_input_tokens, + decode_input_positions, + decode_attn_metadata, + decode_lora_index_mapping, + decode_lora_prompt_mapping, + decode_lora_requests, + decode_slot_mapping, + ) = self._prepare_decode(decode_reqs) + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, seq_lens, query_lens, self.device, + self.pin_memory) + + if not self.scheduler_config.chunked_prefill_enabled: + assert (len(prefill_reqs) and len(decode_reqs)) == 0 + + num_prefills = len(seq_lens) + num_prefill_tokens = len(input_tokens) + num_decode_tokens = len(decode_input_tokens) + + # NOTE(kzawora): Here we diverge from GPU code - we don't support mixed batches, so we either use decode or prefill inputs, without coalescing. + assert (num_prefills == 0 and num_decode_tokens > 0) or (num_prefills > 0 and num_decode_tokens == 0), "HPU does not support mixed batches!" + if num_decode_tokens > 0: + input_tokens = decode_input_tokens + input_positions = decode_input_positions + slot_mapping = decode_slot_mapping + lora_index_mapping = decode_lora_index_mapping + lora_prompt_mapping = decode_lora_prompt_mapping + lora_requests = decode_lora_requests + + if self.lora_config: + lora_mapping = LoRAMapping( + lora_index_mapping, + lora_prompt_mapping, + ) + else: + lora_mapping = None + + # Broadcast the metadata. + # If batch contains both prefill and decode, it sends 2 broadcasts. + # If it only contains 1 type, it triggers a single broadcast. + if (prefill_attn_metadata is not None + and decode_attn_metadata is not None): + batch_type = BatchType.MIXED + raise NotImplementedError("Mixed batch is not supported on HPU") + elif prefill_attn_metadata is not None: + batch_type = BatchType.PREFILL + else: + batch_type = BatchType.DECODE + + metadata_dict = { + "input_tokens": input_tokens, + "input_positions": input_positions, + "selected_token_indices": + sampling_metadata.selected_token_indices, + "lora_requests": lora_requests, + "lora_mapping": lora_mapping, + "multi_modal_input": multi_modal_input, + "num_prefill_tokens": num_prefill_tokens, + "num_decode_tokens": num_decode_tokens, + "slot_mapping": slot_mapping, + "num_prefills": num_prefills, + "batch_type": batch_type, + } + if prefill_attn_metadata is not None: + metadata_dict.update(prefill_attn_metadata.asdict_zerocopy()) + else: + assert decode_attn_metadata is not None + metadata_dict.update(decode_attn_metadata.asdict_zerocopy()) + broadcast_tensor_dict(metadata_dict, src=0) + + # Broadcast decode attn metadata for mixed batch type. + # The additional broadcast costs 300us overhead on 4 A10 GPUs. + # We can potentially reduce the overhead by coelescing tensors. 
+ if batch_type == BatchType.MIXED: + assert decode_attn_metadata is not None + metadata_dict = decode_attn_metadata.asdict_zerocopy() + broadcast_tensor_dict(metadata_dict, src=0) + else: + metadata_dict = broadcast_tensor_dict(src=0) + input_tokens = metadata_dict.pop("input_tokens") + input_positions = metadata_dict.pop("input_positions") + slot_mapping = metadata_dict.pop("slot_mapping") + num_prefills = metadata_dict.pop("num_prefills") + selected_token_indices = metadata_dict.pop( + "selected_token_indices") + lora_mapping = metadata_dict.pop("lora_mapping") + lora_requests = metadata_dict.pop("lora_requests") + multi_modal_input = metadata_dict.pop("multi_modal_input") + num_prefill_tokens = metadata_dict.pop("num_prefill_tokens") + num_decode_tokens = metadata_dict.pop("num_decode_tokens") + batch_type = metadata_dict.pop("batch_type") + + # Create an attention metadata. + prefill_attn_metadata = None + decode_attn_metadata = None + if batch_type == BatchType.PREFILL or batch_type == BatchType.MIXED: + prefill_attn_metadata = self.attn_backend.make_metadata( + **metadata_dict) + else: + decode_attn_metadata = self.attn_backend.make_metadata( + **metadata_dict) + sampling_metadata = SamplingMetadata( + seq_groups=None, + selected_token_indices=selected_token_indices, + categorized_sample_indices=None, + num_prompts=0, + ) + + # if it is a mixed batch, decode attn_metadata is broadcasted + # separately. + if batch_type == BatchType.MIXED: + metadata_dict = broadcast_tensor_dict(src=0) + decode_attn_metadata = self.attn_backend.make_metadata( + **metadata_dict) + + attn_metadata = AttentionMetadata( + num_prefills=num_prefills, + slot_mapping=slot_mapping, + num_prefill_tokens=num_prefill_tokens, + num_decode_tokens=num_decode_tokens, + prefill_metadata=prefill_attn_metadata, + decode_metadata=decode_attn_metadata, + kv_cache_dtype=self.kv_cache_dtype, + ) + + return (input_tokens, input_positions, attn_metadata, + sampling_metadata, lora_requests, lora_mapping, + multi_modal_input) + + + def _old_prepare_input_tensors( self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, @@ -629,14 +900,16 @@ def execute_model( kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: (input_tokens, input_positions, attn_metadata, sampling_metadata, - lora_requests, - lora_mapping) = self.prepare_input_tensors(seq_group_metadata_list) + lora_requests, lora_mapping, multi_modal_input + ) = self.prepare_input_tensors(seq_group_metadata_list) if self.lora_config: self.set_active_loras(lora_requests, lora_mapping) - # Execute the model. - if attn_metadata.use_cuda_graph: + # Currently HPU graph is only supported by the decode phase. 
+ prefill_meta = attn_metadata.prefill_metadata + decode_meta = attn_metadata.decode_metadata + if prefill_meta is None and decode_meta.use_cuda_graph: graph_batch_size = input_tokens.shape[0] graph_block_count = attn_metadata.block_tables.shape[1] graph_runner_key = (graph_batch_size, graph_block_count) @@ -644,25 +917,30 @@ def execute_model( logger.info(f"Executing {self.graph_runner_class.__name__} with batch {graph_batch_size}, block_count {graph_block_count} (context_len up to {graph_block_count*self.block_size}, currently {torch.max(attn_metadata.context_lens).item()})") else: model_executable = self.model - hidden_states = model_executable( - input_ids=input_tokens, - positions=input_positions, - kv_caches=kv_caches, - attn_metadata=attn_metadata, - ) + execute_model_kwargs = { + "input_ids": input_tokens, + "positions": input_positions, + "kv_caches": kv_caches, + "attn_metadata": attn_metadata, + } + if self.vision_language_config: + execute_model_kwargs.update({"image_input": multi_modal_input}) + hidden_states = model_executable(**execute_model_kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) - # Compute the logits. + + # Compute the logits. logits = self.model.compute_logits(hidden_states, sampling_metadata) # Only perform sampling in the driver worker. - if not sampling_metadata.perform_sampling: + if not self.is_driver_worker: return None - + # Sample the next token. output = self.model.sample( logits=logits, sampling_metadata=sampling_metadata, ) + return output @torch.inference_mode() @@ -697,6 +975,17 @@ def profile_run(self) -> None: # Profile memory usage with max_num_sequences sequences and the total # number of tokens equal to max_num_batched_tokens. seqs: List[SequenceGroupMetadata] = [] + # Additional GPU memory may be needed for vision encoding, which needs + # to be accounted for when calculating the GPU blocks for + # vLLM blocker manager. + # To exercise the worst scenario for GPU memory consumption, + # the number of seqs (batch_size) is chosen to maximize the number + # of images processed. + if self.vision_language_config: + max_num_seqs = min( + max_num_seqs, + int(max_num_batched_tokens / + self.vision_language_config.image_feature_size)) for group_id in range(max_num_seqs): seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)) @@ -719,12 +1008,12 @@ def profile_run(self) -> None: torch.hpu.synchronize() return - def remove_all_loras(self) -> bool: + def remove_all_loras(self): if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.remove_all_loras() + self.lora_manager.remove_all_loras() - def set_active_loras(self, lora_requests: List[LoRARequest], + def set_active_loras(self, lora_requests: Set[LoRARequest], lora_mapping: LoRAMapping) -> None: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") @@ -761,7 +1050,6 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: """ # NOTE(woosuk): This is a hack to ensure that the NCCL backend is never # deleted before the CUDA graphs. - self.cupy_nccl_backend = cupy_utils.get_nccl_backend() assert not self.model_config.enforce_eager logger.info("Capturing the model for HPUGraphs. This may lead to " @@ -841,10 +1129,9 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: # Create dummy attn_metadata. 
attn_metadata = self.attn_backend.make_metadata( is_prompt=False, - slot_mapping=slot_mapping[:batch_size], prompt_lens=None, prompt_lens_tensor=None, - num_prompt_tokens=0, + num_prefill_tokens=0, num_generation_tokens=batch_size, max_subquery_len=None, max_context_len=block_count*self.block_size, @@ -900,7 +1187,6 @@ def __del__(self) -> None: # happen. # FIXME(woosuk): This is a bit hacky. Find a more robust solution. self.graph_runners.clear() - self.cupy_nccl_backend = None @property def vocab_size(self) -> int: @@ -1079,10 +1365,9 @@ def __init__(self, model): def forward(self, input_ids, positions, kv_caches, slot_mapping, context_lens, block_tables): wrapper_attn_metadata = self.attn_backend.make_metadata( is_prompt=attn_metadata.is_prompt, - slot_mapping=slot_mapping, - prompt_lens=None, - prompt_lens_tensor=None, - num_prompt_tokens=0, + seq_lens=None, + seq_lens_tensor=None, + num_prefill_tokens=0, num_generation_tokens=attn_metadata.num_generation_tokens, max_subquery_len=None, max_context_len=attn_metadata.max_context_len, diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index bbfd7dad7f90a..a05eee90648b2 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -4,27 +4,28 @@ import gc import os -from typing import Dict, List, Optional, Set, Tuple +from typing import Any, Dict, List, Optional, Set, Tuple import torch import habana_frameworks.torch as htorch import torch.distributed -from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig) +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, ParallelConfig, SchedulerConfig, + VisionLanguageConfig) +from vllm.distributed import (broadcast_tensor_dict, + ensure_model_parallel_initialized, + get_tensor_model_parallel_cpu_group, + init_distributed_environment) from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed -from vllm.model_executor.parallel_utils.communication_op import ( - broadcast_tensor_dict) -from vllm.model_executor.parallel_utils.custom_all_reduce import init_custom_ar -from vllm.model_executor.parallel_utils.parallel_state import ( - ensure_model_parallel_initialized) -from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.worker.cache_engine import CacheEngine from vllm.worker.habana_model_runner import HabanaModelRunner +from vllm.worker.worker_base import WorkerBase -class HabanaWorker: +class HabanaWorker(WorkerBase): """A worker class that executes (a partition of) the model on a HPU. Each worker is associated with a single HPU. 
The worker is responsible for @@ -38,37 +39,51 @@ def __init__( parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, device_config: DeviceConfig, + cache_config: CacheConfig, + load_config: LoadConfig, local_rank: int, rank: int, distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, - kv_cache_dtype: Optional[str] = "auto", + vision_language_config: Optional[VisionLanguageConfig] = None, is_driver_worker: bool = False, ) -> None: self.model_config = model_config self.parallel_config = parallel_config self.scheduler_config = scheduler_config self.device_config = device_config + self.cache_config = cache_config self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method self.lora_config = lora_config + self.load_config = load_config self.is_driver_worker = is_driver_worker if self.is_driver_worker: assert self.rank == 0, "The driver worker must have rank 0." + if self.model_config.trust_remote_code: + # note: lazy import to avoid importing torch before initializing + from vllm.utils import init_cached_hf_modules + init_cached_hf_modules() + self.vision_language_config = vision_language_config + if self.vision_language_config: + assert not self.lora_config, ( + "To be tested: vision language model with LoRA settings.") + assert False, "To be tested: vision language model on HPU" + self.model_runner = HabanaModelRunner(model_config, parallel_config, scheduler_config, device_config, + load_config=load_config, lora_config=self.lora_config, - kv_cache_dtype=kv_cache_dtype, + kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by - # self.init_cache_engine(). - self.cache_config = None - self.cache_engine = None - self.hpu_cache = None + # initialize_cache. + self.cache_engine: CacheEngine + self.hpu_cache: List[torch.Tensor] def init_device(self) -> None: if self.device_config.device.type == "hpu": @@ -79,8 +94,9 @@ def init_device(self) -> None: raise RuntimeError( f"Not support device type: {self.device_config.device}") # Initialize the distributed environment. - init_distributed_environment(self.parallel_config, self.rank, - self.distributed_init_method) + init_worker_distributed_environment(self.parallel_config, self.rank, + self.distributed_init_method, + self.local_rank) # Set random seed. set_random_seed(self.model_config.seed) @@ -88,20 +104,17 @@ def load_model(self): self.model_runner.load_model() @torch.inference_mode() - def profile_num_available_blocks( - self, - block_size: int, - hpu_memory_utilization: float, - cpu_swap_space: int, - cache_dtype: str, - ) -> Tuple[int, int]: - """Profiles the peak memory usage of the model and returns the maximum - number of HPU and CPU cache blocks that can be allocated. - - Args: - block_size: The size of the cache block. - hpu_memory_utilization: The fraction of the total HPU memory to use. - cpu_swap_space: The size of the CPU swap space in bytes. + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Profiles the peak memory usage of the model to determine how many + KV blocks may be allocated without OOMs. + + The engine will first conduct a profiling of the existing memory usage. + Then, it calculate the maximum possible number of GPU and CPU blocks + that can be allocated with the remaining free memory. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. 
""" # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. @@ -118,15 +131,15 @@ def profile_num_available_blocks( # HPU did not change their memory usage during the profiling. peak_memory = self.init_hpu_memory - free_hpu_memory assert peak_memory > 0, ( - "Error in memory profiling. This happens when the hpu memory was " + "Error in memory profiling. This happens when the HPU memory was " "not properly cleaned up before initializing the vLLM instance.") - cache_block_size = self.get_cache_block_size_bytes( - block_size, cache_dtype) + cache_block_size = self.get_cache_block_size_bytes() num_hpu_blocks = int( - (total_hpu_memory * hpu_memory_utilization - peak_memory) // - cache_block_size) - num_cpu_blocks = int(cpu_swap_space // cache_block_size) + (total_hpu_memory * self.cache_config.gpu_memory_utilization - + peak_memory) // cache_block_size) + num_cpu_blocks = int(self.cache_config.swap_space_bytes // + cache_block_size) num_hpu_blocks = max(num_hpu_blocks, 0) num_cpu_blocks = max(num_cpu_blocks, 0) if self.model_runner.lora_manager: @@ -134,15 +147,31 @@ def profile_num_available_blocks( gc.collect() return num_hpu_blocks, num_cpu_blocks - def init_cache_engine(self, cache_config: CacheConfig) -> None: - self.cache_config = cache_config + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + """Allocate GPU and CPU KV cache with the specified number of blocks. + + This also warms up the model, which may record CUDA graphs. + """ + raise_if_cache_size_invalid(num_gpu_blocks, + self.cache_config.block_size, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + self._init_cache_engine() + self._warm_up_model() + + def _init_cache_engine(self) -> None: + assert self.cache_config.num_gpu_blocks is not None self.cache_engine = CacheEngine(self.cache_config, self.model_config, self.parallel_config) self.hpu_cache = self.cache_engine.gpu_cache self.model_runner.set_block_size(self.cache_engine.block_size) htorch.hpu.synchronize() # we want to materialize cache tensors before we proceed with graph capture/execution - def warm_up_model(self) -> None: + def _warm_up_model(self) -> None: if not self.model_config.enforce_eager: self.model_runner.capture_model(self.hpu_cache) # Reset the seed to ensure that the random state is not affected by @@ -153,7 +182,7 @@ def cache_swap( self, blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], + blocks_to_copy: torch.Tensor, ) -> None: # Issue cache operations. # TODO(woosuk): Profile swapping overhead and optimize if needed. 
@@ -161,24 +190,29 @@ def cache_swap( self.cache_engine.swap_in(blocks_to_swap_in) if blocks_to_swap_out: self.cache_engine.swap_out(blocks_to_swap_out) - if blocks_to_copy: + if blocks_to_copy.numel() > 0: self.cache_engine.copy(blocks_to_copy) @torch.inference_mode() def execute_model( self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None, - blocks_to_swap_in: Optional[Dict[int, int]] = None, - blocks_to_swap_out: Optional[Dict[int, int]] = None, - blocks_to_copy: Optional[Dict[int, List[int]]] = None, - ) -> Optional[SamplerOutput]: + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + if execute_model_req is None: + seq_group_metadata_list = None + else: + seq_group_metadata_list = execute_model_req.seq_group_metadata_list + if self.is_driver_worker: assert seq_group_metadata_list is not None + assert execute_model_req is not None num_seq_groups = len(seq_group_metadata_list) - assert blocks_to_swap_in is not None - assert blocks_to_swap_out is not None - assert blocks_to_copy is not None - data = { + blocks_to_swap_in = execute_model_req.blocks_to_swap_in + blocks_to_swap_out = execute_model_req.blocks_to_swap_out + blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, + device=self.device, + dtype=torch.int64).view(-1, 2) + data: Dict[str, Any] = { "num_seq_groups": num_seq_groups, "blocks_to_swap_in": blocks_to_swap_in, "blocks_to_swap_out": blocks_to_swap_out, @@ -196,11 +230,11 @@ def execute_model( # If there is no input, we don't need to execute the model. if num_seq_groups == 0: - return {} + return [] output = self.model_runner.execute_model(seq_group_metadata_list, self.hpu_cache) - return output + return [output] def add_lora(self, lora_request: LoRARequest) -> bool: raise NotImplementedError("LoRA is not implemented for HPU backend.") @@ -219,21 +253,27 @@ def max_model_len(self) -> int: def vocab_size(self) -> int: return self.model_runner.vocab_size - def get_cache_block_size_bytes(self, block_size: int, - cache_dtype: str) -> int: + def get_cache_block_size_bytes(self) -> int: """Get the size of the KV cache block size in bytes. """ - return CacheEngine.get_cache_block_size(block_size, cache_dtype, + return CacheEngine.get_cache_block_size(self.cache_config, self.model_config, self.parallel_config) -def init_distributed_environment( +def init_worker_distributed_environment( parallel_config: ParallelConfig, rank: int, distributed_init_method: Optional[str] = None, + local_rank: int = -1, ) -> None: """Initialize the distributed environment.""" + init_distributed_environment(parallel_config.world_size, rank, + distributed_init_method, local_rank, backend='hccl') + + ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size) + if torch.distributed.is_initialized(): torch_world_size = torch.distributed.get_world_size() if torch_world_size != parallel_config.world_size: @@ -257,7 +297,17 @@ def init_distributed_environment( torch.distributed.all_reduce(torch.zeros(1).to('hpu')) ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) - - # Initialize a custom fast all-reduce implementation. - if not parallel_config.disable_custom_all_reduce: - init_custom_ar() +def raise_if_cache_size_invalid(num_gpu_blocks, block_size, + max_model_len) -> None: + if num_gpu_blocks <= 0: + raise ValueError("No available memory for the cache blocks. 
" + "Try increasing `gpu_memory_utilization` when " + "initializing the engine.") + max_seq_len = block_size * num_gpu_blocks + if max_model_len > max_seq_len: + raise ValueError( + f"The model's max seq len ({max_model_len}) " + "is larger than the maximum number of tokens that can be " + f"stored in KV cache ({max_seq_len}). Try increasing " + "`gpu_memory_utilization` or decreasing `max_model_len` when " + "initializing the engine.") From 737c767cefcfef179b65afd335bc13d5dda39917 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 8 May 2024 20:36:24 +0300 Subject: [PATCH 003/341] fix hpugraph capture/replay post rebase --- vllm/worker/habana_model_runner.py | 269 ++++++----------------------- 1 file changed, 56 insertions(+), 213 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index a25a09c2598fd..e418ccc1d5c62 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -52,9 +52,9 @@ ] # Capture graphs for token size 1, 32, 64, 128, 256, 512, 768 ... 2048 -_MAX_CONTEXT_LEN_ALIGNMENT = 256 -_MAX_CONTEXT_LENS_TO_CAPTURE = [1, 32, 64, 128] + [ - _MAX_CONTEXT_LEN_ALIGNMENT * i for i in range(1, 9) +_MAX_SEQ_LEN_ALIGNMENT = 256 +_MAX_SEQ_LENS_TO_CAPTURE = [1, 32, 64, 128] + [ + _MAX_SEQ_LEN_ALIGNMENT * i for i in range(1, 9) ] @@ -531,7 +531,7 @@ def _prepare_decode( # The shape of graph_block_tables is # [max batch size, max context len // block size]. - graph_max_seq_len = _get_graph_max_context_len(max_seq_len) + graph_max_seq_len = _get_graph_max_seq_len(max_seq_len) assert graph_max_seq_len >= max_seq_len graph_block_count = math.ceil(graph_max_seq_len / self.block_size) input_block_tables = self.graph_block_tables[:batch_size, :graph_block_count] @@ -572,104 +572,6 @@ def _prepare_decode( ) - def _prepare_sample( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - prompt_lens: List[int], - subquery_lens: Optional[List[int]], - ) -> SamplingMetadata: - seq_groups: List[Tuple[List[int], SamplingParams]] = [] - selected_token_indices: List[int] = [] - generators: List[torch.Generator] = [] - selected_token_start_idx = 0 - categorized_sample_indices = {t: [] for t in SamplingType} - categorized_sample_indices_start_idx = 0 - categorized_sampled_token_indices_start_idx = 0 - max_subquery_len = max(subquery_lens) if subquery_lens else 1 - for i, seq_group_metadata in enumerate(seq_group_metadata_list): - seq_ids = list(seq_group_metadata.seq_data.keys()) - sampling_params = seq_group_metadata.sampling_params - seq_groups.append((seq_ids, sampling_params)) - - if seq_group_metadata.is_prompt: - assert len(seq_ids) == 1 - assert subquery_lens is not None - subquery_len = subquery_lens[i] - if sampling_params.prompt_logprobs is not None: - # NOTE: prompt token positions do not need sample, skip - categorized_sample_indices_start_idx += subquery_len - 1 - - categorized_sample_indices[ - sampling_params.sampling_type].append([ - categorized_sample_indices_start_idx, - categorized_sampled_token_indices_start_idx - ]) - categorized_sample_indices_start_idx += 1 - categorized_sampled_token_indices_start_idx += 1 - - if sampling_params.prompt_logprobs is not None: - selected_token_indices.extend( - range(selected_token_start_idx, - selected_token_start_idx + subquery_len - 1)) - selected_token_indices.append(selected_token_start_idx + - subquery_len - 1) - selected_token_start_idx += max_subquery_len - - if sampling_params.seed is not None: - seq_group_metadata.state.generator = torch.Generator( - 
device=self.device).manual_seed(sampling_params.seed) - else: - num_seqs = len(seq_ids) - selected_token_indices.extend( - range(selected_token_start_idx, - selected_token_start_idx + num_seqs)) - selected_token_start_idx += num_seqs - - categorized_sample_indices[ - sampling_params.sampling_type].extend( - zip( - range( - categorized_sample_indices_start_idx, - categorized_sample_indices_start_idx + - num_seqs), - range( - categorized_sampled_token_indices_start_idx, - categorized_sampled_token_indices_start_idx + - num_seqs))) - categorized_sample_indices_start_idx += num_seqs - categorized_sampled_token_indices_start_idx += num_seqs - - if sampling_params.seed is not None: - generators.append(seq_group_metadata.state.generator) - - selected_token_indices = async_tensor_h2d(selected_token_indices, - dtype=torch.long, - target_device=self.device, - pin_memory=self.pin_memory) - - categorized_sample_indices = { - t: maybe_expand_dim( - async_tensor_h2d(seq_ids, - dtype=torch.int, - target_device=self.device, - pin_memory=self.pin_memory), 2, 2) - for t, seq_ids in categorized_sample_indices.items() - } - - seq_data: Dict[int, SequenceData] = {} - for seq_group_metadata in seq_group_metadata_list: - seq_data.update(seq_group_metadata.seq_data) - - sampling_metadata = SamplingMetadata( - seq_groups=seq_groups, - seq_data=seq_data, - prompt_lens=prompt_lens, - selected_token_indices=selected_token_indices, - categorized_sample_indices=categorized_sample_indices, - generators=generators, - ) - return sampling_metadata - def prepare_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -828,71 +730,6 @@ def prepare_input_tensors( multi_modal_input) - def _old_prepare_input_tensors( - self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, - Set[int], LoRAMapping]: - if self.is_driver_worker: - # NOTE: We assume that all sequences in the group are all prompts or - # all decodes. - is_prompt = seq_group_metadata_list[0].is_prompt - # Prepare input tensors. - if is_prompt: - (input_tokens, input_positions, attn_metadata, prompt_lens, - subquery_lens, lora_index_mapping, lora_prompt_mapping, - lora_requests) = self._prepare_prompt(seq_group_metadata_list) - else: - (input_tokens, input_positions, attn_metadata, - lora_index_mapping, lora_prompt_mapping, - lora_requests) = self._prepare_decode(seq_group_metadata_list) - prompt_lens = [] - subquery_lens = None - sampling_metadata = self._prepare_sample(seq_group_metadata_list, - prompt_lens, - subquery_lens) - - if self.lora_config: - lora_mapping = LoRAMapping( - lora_index_mapping, - lora_prompt_mapping, - ) - else: - lora_mapping = None - - # Broadcast the metadata. 
- metadata_dict = { - "input_tokens": input_tokens, - "input_positions": input_positions, - "selected_token_indices": - sampling_metadata.selected_token_indices, - "lora_requests": lora_requests, - "lora_mapping": lora_mapping, - } - metadata_dict.update(attn_metadata.asdict_zerocopy()) - broadcast_tensor_dict(metadata_dict, src=0) - else: - metadata_dict = broadcast_tensor_dict(src=0) - input_tokens = metadata_dict.pop("input_tokens") - input_positions = metadata_dict.pop("input_positions") - selected_token_indices = metadata_dict.pop( - "selected_token_indices") - lora_mapping = metadata_dict.pop("lora_mapping") - lora_requests = metadata_dict.pop("lora_requests") - attn_metadata = self.attn_backend.make_metadata(**metadata_dict) - sampling_metadata = SamplingMetadata( - seq_groups=None, - seq_data=None, - prompt_lens=None, - selected_token_indices=selected_token_indices, - categorized_sample_indices=None, - generators=None, - perform_sampling=False, - ) - - return (input_tokens, input_positions, attn_metadata, - sampling_metadata, lora_requests, lora_mapping) - @torch.inference_mode() def execute_model( self, @@ -911,10 +748,10 @@ def execute_model( decode_meta = attn_metadata.decode_metadata if prefill_meta is None and decode_meta.use_cuda_graph: graph_batch_size = input_tokens.shape[0] - graph_block_count = attn_metadata.block_tables.shape[1] + graph_block_count = decode_meta.block_tables.shape[1] graph_runner_key = (graph_batch_size, graph_block_count) model_executable = self.graph_runners[graph_runner_key] - logger.info(f"Executing {self.graph_runner_class.__name__} with batch {graph_batch_size}, block_count {graph_block_count} (context_len up to {graph_block_count*self.block_size}, currently {torch.max(attn_metadata.context_lens).item()})") + logger.info(f"Executing {self.graph_runner_class.__name__} with batch {graph_batch_size}, block_count {graph_block_count} (context_len up to {graph_block_count*self.block_size}, currently {torch.max(decode_meta.seq_lens_tensor).item()})") else: model_executable = self.model execute_model_kwargs = { @@ -1088,33 +925,33 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: # NOTE: Capturing the largest batch size first may help reduce the # memory usage of CUDA graph. valid_combinations = [] - total_combinations = len(_BATCH_SIZES_TO_CAPTURE)*len(_MAX_CONTEXT_LENS_TO_CAPTURE) + total_combinations = len(_BATCH_SIZES_TO_CAPTURE)*len(_MAX_SEQ_LENS_TO_CAPTURE) import pandas as pd - df = pd.DataFrame(index=_BATCH_SIZES_TO_CAPTURE, columns=_MAX_CONTEXT_LENS_TO_CAPTURE) - for idx, (batch_size, max_context_len) in enumerate(itertools.product(reversed(_BATCH_SIZES_TO_CAPTURE), reversed(_MAX_CONTEXT_LENS_TO_CAPTURE))): - block_count = math.ceil(max_context_len / self.block_size) + df = pd.DataFrame(index=_BATCH_SIZES_TO_CAPTURE, columns=_MAX_SEQ_LENS_TO_CAPTURE) + for idx, (batch_size, max_seq_len) in enumerate(itertools.product(reversed(_BATCH_SIZES_TO_CAPTURE), reversed(_MAX_SEQ_LENS_TO_CAPTURE))): + block_count = math.ceil(max_seq_len / self.block_size) # Skip capture of "out-of-bound" batch sizes and context lengths if batch_size > self.scheduler_config.max_num_seqs: - logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}. Reason: Batch out of bound.") - df[max_context_len][batch_size] = 'batch OoB' + logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}. 
Reason: Batch out of bound.") + df[max_seq_len][batch_size] = 'batch OoB' continue - if max_context_len > self.max_context_len_to_capture: - logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}. Reason: Nax context length out of bound.") - df[max_context_len][batch_size] = 'ctx OoB' + if max_seq_len > self.max_seq_len_to_capture: + logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}. Reason: Nax context length out of bound.") + df[max_seq_len][batch_size] = 'ctx OoB' continue - block_count = math.ceil(max_context_len / self.block_size) + block_count = math.ceil(max_seq_len / self.block_size) captured_block_counts = [math.ceil(cl / self.block_size) for (n, cl) in valid_combinations if n == batch_size] if block_count in captured_block_counts: - logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}. Reason: Block size already captured.") - df[max_context_len][batch_size] = 'redundant' + logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}. Reason: Block size already captured.") + df[max_seq_len][batch_size] = 'redundant' continue - logger.debug(f"[{idx}/{total_combinations}] Will capture for batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}. Constraints met.") - df[max_context_len][batch_size] = 'VALID' - valid_combinations.append((batch_size, max_context_len)) + logger.debug(f"[{idx}/{total_combinations}] Will capture for batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}. Constraints met.") + df[max_seq_len][batch_size] = 'VALID' + valid_combinations.append((batch_size, max_seq_len)) total_valid_hpugraphs = len(valid_combinations) logger.info(f"Starting capture {total_valid_hpugraphs} valid HPUGraphs. Skipping capture of {total_combinations-total_valid_hpugraphs}/{total_combinations} graphs due to batch/context constraints.") - logger.debug(f"Capture summary (row: batch_size; col: max_context_len):") + logger.debug(f"Capture summary (row: batch_size; col: max_seq_len):") logger.debug(tabulate.tabulate(df, tablefmt='mixed_outline', headers='keys', showindex="always")) graph_runner_name = self.graph_runner_class.__name__ @@ -1124,23 +961,28 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: log_graph_compilation_all = os.environ.get('VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0' log_graph_compilation = os.environ.get('VLLM_HPU_LOG_STEP_GRAPH_COMPILATION', '0') != '0' or log_graph_compilation_all - for idx, (batch_size, max_context_len) in enumerate(pbar): - block_count = math.ceil(max_context_len / self.block_size) + for idx, (batch_size, max_seq_len) in enumerate(pbar): + block_count = math.ceil(max_seq_len / self.block_size) # Create dummy attn_metadata. 
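For reference, the filtering above collapses every (batch_size, max_seq_len) pair that maps to an already-captured block count into a 'redundant' entry. A minimal sketch of that collapse, assuming an illustrative block_size of 128 (in the real code it comes from cache_config):

import math

block_size = 128                      # assumed value for illustration
captured_block_counts = set()
for max_seq_len in (1, 32, 64, 128, 256, 512):
    block_count = math.ceil(max_seq_len / block_size)
    status = 'redundant' if block_count in captured_block_counts else 'VALID'
    captured_block_counts.add(block_count)
    print(f'max_seq_len={max_seq_len:4d} -> block_count={block_count} ({status})')
# max_seq_len 1, 32, 64 and 128 all share block_count=1, so only one graph is kept for them.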
- attn_metadata = self.attn_backend.make_metadata( + decode_metadata = self.attn_backend.make_metadata( is_prompt=False, - prompt_lens=None, - prompt_lens_tensor=None, - num_prefill_tokens=0, - num_generation_tokens=batch_size, - max_subquery_len=None, - max_context_len=block_count*self.block_size, - max_prompt_len=None, + seq_lens=None, + seq_lens_tensor=context_lens[:batch_size], + max_query_len=None, + max_seq_len=block_count*self.block_size, subquery_start_loc=None, seq_start_loc=None, - context_lens=context_lens[:batch_size], + context_lens_tensor=None, # NOTE(kzawora): this seems sus, shoudn't we have seq_lens tensor here? block_tables=block_tables[:batch_size, :block_count], use_cuda_graph=True, + ) + attn_metadata = AttentionMetadata( + num_prefills=0, + num_prefill_tokens=0, + num_decode_tokens=batch_size, + slot_mapping=slot_mapping[:batch_size], + prefill_metadata=None, + decode_metadata=decode_metadata, kv_cache_dtype=self.kv_cache_dtype, ) @@ -1153,7 +995,7 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: graph_runner = self.graph_runner_class(self.model) local_start_mem = HabanaMemoryProfiler.current_memory_usage() capture_start = time.time() - desc = f'Capturing {graph_runner_name} for batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}, allocated {format_bytes(local_start_mem - start_mem)} device memory in total ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)' + desc = f'Capturing {graph_runner_name} for batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}, allocated {format_bytes(local_start_mem - start_mem)} device memory in total ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)' pbar.set_description(desc) logger.debug(f"[{idx}/{total_valid_hpugraphs}] {desc}...") profiling_ctx = contextlib.nullcontext() if not (log_graph_compilation_all or log_graph_compilation) else metric_localcontext("graph_compilation") @@ -1165,12 +1007,12 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: attn_metadata, ) if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0) or log_graph_compilation_all: - logger.info(f"VLLM_HPU_STEP_GRAPH_COMPILATION: {gc_local_metric.stats()}, {graph_runner_name}; batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}") + logger.info(f"VLLM_HPU_STEP_GRAPH_COMPILATION: {gc_local_metric.stats()}, {graph_runner_name}; batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}") self.graph_runners[(batch_size, block_count)] = graph_runner capture_end = time.time() local_end_mem = HabanaMemoryProfiler.current_memory_usage() mem_usage_str = format_bytes(local_end_mem - local_start_mem) - graph_mem_usage_df[max_context_len][batch_size] = mem_usage_str + graph_mem_usage_df[max_seq_len][batch_size] = mem_usage_str logger.debug(f"[{idx}/{total_valid_hpugraphs}] {desc}... done in {capture_end-capture_start:.2f} seconds! Took {mem_usage_str} of device memory ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") end_time = time.perf_counter() @@ -1178,7 +1020,7 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: # This usually takes < 10 seconds. 
end_mem = HabanaMemoryProfiler.current_memory_usage() logger.info(f"Graph capturing finished in {elapsed_time:.0f} secs, allocated {format_bytes(end_mem - start_mem)} of device memory ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") - logger.info(f"Graph memory allocation summary (row: batch_size; col: max_context_len):") + logger.info(f"Graph memory allocation summary (row: batch_size; col: max_seq_len):") logger.info(tabulate.tabulate(graph_mem_usage_df, tablefmt='mixed_outline', headers='keys', showindex="always")) def __del__(self) -> None: @@ -1312,8 +1154,8 @@ def capture( "positions": positions, "kv_caches": kv_caches, "slot_mapping": attn_metadata.slot_mapping, - "context_lens": attn_metadata.context_lens, - "block_tables": attn_metadata.block_tables, + "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor, + "block_tables": attn_metadata.decode_metadata.block_tables, } self.output_buffers = {"hidden_states": hidden_states} return @@ -1324,6 +1166,7 @@ def forward( positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + **kwargs, ) -> torch.Tensor: # KV caches are fixed tensors, so we don't need to copy them. del kv_caches @@ -1333,10 +1176,10 @@ def forward( self.input_buffers["positions"].copy_(positions, non_blocking=True) self.input_buffers["slot_mapping"].copy_(attn_metadata.slot_mapping, non_blocking=True) - self.input_buffers["context_lens"].copy_(attn_metadata.context_lens, - non_blocking=True) - self.input_buffers["block_tables"].copy_(attn_metadata.block_tables, - non_blocking=True) + self.input_buffers["seq_lens_tensor"].copy_( + attn_metadata.decode_metadata.seq_lens_tensor, non_blocking=True) + self.input_buffers["block_tables"].copy_( + attn_metadata.decode_metadata.block_tables, non_blocking=True) # Run the graph. self.graph.replay() @@ -1370,7 +1213,7 @@ def forward(self, input_ids, positions, kv_caches, slot_mapping, context_lens, b num_prefill_tokens=0, num_generation_tokens=attn_metadata.num_generation_tokens, max_subquery_len=None, - max_context_len=attn_metadata.max_context_len, + max_seq_len=attn_metadata.max_seq_len, max_prompt_len=None, subquery_start_loc=None, seq_start_loc=None, @@ -1436,18 +1279,18 @@ def _get_graph_batch_size(batch_size: int) -> int: _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT) -def _get_graph_max_context_len(max_context_len: int) -> int: +def _get_graph_max_seq_len(max_seq_len: int) -> int: """Returns the padded batch size given actual batch size. Batch sizes are 1, 2, 4, _BATCH_SIZE_ALIGNMENT, 2*_BATCH_SIZE_ALIGNMENT, 3*_BATCH_SIZE_ALIGNMENT... 
""" - if max_context_len <= 32: + if max_seq_len <= 32: return 32 - elif max_context_len <= 64: + elif max_seq_len <= 64: return 64 - elif max_context_len <= 128: + elif max_seq_len <= 128: return 128 else: - return ((max_context_len + _MAX_CONTEXT_LEN_ALIGNMENT - 1) // - _MAX_CONTEXT_LEN_ALIGNMENT * _MAX_CONTEXT_LEN_ALIGNMENT) + return ((max_seq_len + _MAX_SEQ_LEN_ALIGNMENT - 1) // + _MAX_SEQ_LEN_ALIGNMENT * _MAX_SEQ_LEN_ALIGNMENT) From b5d403780ef615779d10e460acca904cc4206aec Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 9 May 2024 14:11:55 +0300 Subject: [PATCH 004/341] re-enable 8x hpu support --- vllm/distributed/communication_op.py | 3 +- vllm/executor/ray_habana_executor.py | 324 ++++++------------ .../model_executor/layers/logits_processor.py | 4 +- vllm/worker/habana_worker.py | 7 +- 4 files changed, 116 insertions(+), 222 deletions(-) diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index 80d03129bdb9b..7b2905af7e0ab 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -3,6 +3,7 @@ import torch from torch.distributed import ProcessGroup +from vllm.utils import is_hpu from .parallel_state import (get_cpu_world_group, get_tensor_model_parallel_group, @@ -156,7 +157,7 @@ def _split_tensor_dict( # because it contains not only the device type but also the device # index (e.g. "cuda:0"). We only need the device type. # receiving side will set the device index. - device = "cpu" if value.is_cpu else "cuda" + device = "cpu" if value.is_cpu else ("hpu" if is_hpu() else "cuda") metadata_list.append( (key, TensorMetadata(device, value.dtype, value.size()))) tensor_list.append(value) diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index dac8eefb18adc..a17f509f11658 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -1,20 +1,18 @@ import asyncio -import copy import os import pickle from collections import defaultdict -from typing import TYPE_CHECKING, Any, Dict, List, Optional +from itertools import islice, repeat +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple -from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig) -from vllm.engine.ray_utils import RayWorkerVllm, ray -from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase -from vllm.executor.utils import check_block_size_valid +import vllm.envs as envs +from vllm.executor.distributed_gpu_executor import ( # yapf: disable + DistributedGPUExecutor, DistributedGPUExecutorAsync) +from vllm.executor.ray_utils import RayWorkerWrapper, ray from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, - make_async) + get_vllm_instance_id, make_async) if ray is not None: from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy @@ -24,29 +22,14 @@ logger = init_logger(__name__) -# If the env var is set, it uses the Ray's compiled DAG API -# which optimizes the control plane overhead. -# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. 
-USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)) +USE_RAY_COMPILED_DAG = envs.VLLM_USE_RAY_COMPILED_DAG -class RayHabanaExecutor(ExecutorBase): +class RayHabanaExecutor(DistributedGPUExecutor): - def __init__( - self, - model_config: ModelConfig, - cache_config: CacheConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - ) -> None: - self.model_config = model_config - self.cache_config = cache_config - self.lora_config = lora_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config + def _init_executor(self) -> None: + assert (not self.speculative_config + ), "Speculative decoding not yet supported for RayGPU backend." assert self.parallel_config.worker_use_ray placement_group = self.parallel_config.placement_group @@ -59,9 +42,6 @@ def __init__( # Create the parallel GPU workers. self._init_workers_ray(placement_group) - # Profile the memory usage and initialize the cache. - self._init_cache() - self.forward_dag = None if USE_RAY_COMPILED_DAG: self.forward_dag = self._compiled_ray_dag() @@ -77,9 +57,9 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # The driver dummy worker does not actually use any resources. # It holds the resource for the driver worker. - self.driver_dummy_worker: RayWorkerVllm = None + self.driver_dummy_worker: Optional[RayWorkerWrapper] = None # The remaining workers are the actual ray actors. - self.workers: List[RayWorkerVllm] = [] + self.workers: List[RayWorkerWrapper] = [] # Create the workers. driver_ip = get_ip() @@ -97,13 +77,22 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", resources={'HPU': num_gpus}, scheduling_strategy=scheduling_strategy, **ray_remote_kwargs, - )(RayWorkerVllm).remote(self.model_config.trust_remote_code) + )(RayWorkerWrapper).remote( + worker_module_name="vllm.worker.habana_worker", + worker_class_name="HabanaWorker", + trust_remote_code=self.model_config.trust_remote_code, + ) worker_ip = ray.get(worker.get_node_ip.remote()) if worker_ip == driver_ip and self.driver_dummy_worker is None: # If the worker is on the same node as the driver, we use it # as the resource holder for the driver process. self.driver_dummy_worker = worker + self.driver_worker = RayWorkerWrapper( + worker_module_name="vllm.worker.habana_worker", + worker_class_name="HabanaWorker", + trust_remote_code=self.model_config.trust_remote_code, + ) else: # Else, added to the list of workers. self.workers.append(worker) @@ -115,201 +104,120 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", "GPU node.") # Get the set of GPU IDs used on each node. 
- driver_node_id, driver_gpu_ids = ray.get( - self.driver_dummy_worker.get_node_and_gpu_ids.remote()) - worker_node_and_gpu_ids = ray.get( - [worker.get_node_and_gpu_ids.remote() for worker in self.workers]) + worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", + use_dummy_driver=True) node_workers = defaultdict(list) node_gpus = defaultdict(list) - node_workers[driver_node_id].append(0) - node_gpus[driver_node_id].extend(driver_gpu_ids) - for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids, - start=1): + for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): node_workers[node_id].append(i) node_gpus[node_id].extend(gpu_ids) for node_id, gpu_ids in node_gpus.items(): node_gpus[node_id] = sorted(gpu_ids) + VLLM_INSTANCE_ID = get_vllm_instance_id() + + # Set environment variables for the driver and workers. + all_args_to_update_environment_variables = [({ + "VLLM_INSTANCE_ID": + VLLM_INSTANCE_ID, + "VLLM_TRACE_FUNCTION": + str(envs.VLLM_TRACE_FUNCTION), + }, ) for (node_id, _) in worker_node_and_gpu_ids] + self._run_workers("update_environment_variables", + all_args=all_args_to_update_environment_variables) + distributed_init_method = get_distributed_init_method( driver_ip, get_open_port()) - # Lazy import the Worker to avoid importing torch.cuda/xformers - # before CUDA_VISIBLE_DEVICES is set in the Worker - from vllm.worker.habana_worker import HabanaWorker - - model_config = copy.deepcopy(self.model_config) - parallel_config = copy.deepcopy(self.parallel_config) - scheduler_config = copy.deepcopy(self.scheduler_config) - device_config = copy.deepcopy(self.device_config) - lora_config = copy.deepcopy(self.lora_config) - kv_cache_dtype = self.cache_config.cache_dtype - - # Initialize the actual workers with the Worker class. - for rank, (worker, (node_id, _)) in enumerate( - zip(self.workers, worker_node_and_gpu_ids), - start=1, - ): - local_rank = node_workers[node_id].index(rank) - worker.init_worker.remote( - lambda rank=rank, local_rank=local_rank: HabanaWorker( - model_config, - parallel_config, - scheduler_config, - device_config, - local_rank, - rank, - distributed_init_method, - lora_config=lora_config, - kv_cache_dtype=kv_cache_dtype, - )) - - # Initialize the driver worker with the Worker class. - driver_rank = 0 - driver_local_rank = node_workers[driver_node_id].index(driver_rank) - self.driver_worker = HabanaWorker( - self.model_config, - self.parallel_config, - self.scheduler_config, - self.device_config, - driver_local_rank, - driver_rank, - distributed_init_method, - lora_config=self.lora_config, - kv_cache_dtype=kv_cache_dtype, - is_driver_worker=True, - ) + # Initialize the actual workers inside worker wrapper. + init_worker_all_kwargs = [ + self._get_worker_kwargs( + local_rank=node_workers[node_id].index(rank), + rank=rank, + distributed_init_method=distributed_init_method, + ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids) + ] + self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) self._run_workers("init_device") - self._run_workers( - "load_model", - max_concurrent_workers=self.parallel_config. - max_parallel_loading_workers, - ) - - def _init_cache(self) -> None: - """Profiles the memory usage and initializes the KV cache. - - The engine will first conduct a profiling of the existing memory usage. - Then, it calculate the maximum possible number of GPU and CPU blocks - that can be allocated with the remaining free memory. 
- More details can be found in the - :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method - from class :class:`~vllm.worker.Worker`. - - Afterwards, as there may be multiple workers, - we take the minimum number of blocks across all workers - to ensure this can be applied to all of them. - - Finally, the engine will initialize the KV cache - with the calculated number of blocks. - - .. tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - """ - # Get the maximum number of blocks that can be allocated on GPU and CPU. - num_blocks = self._run_workers( - "profile_num_available_blocks", - block_size=self.cache_config.block_size, - hpu_memory_utilization=self.cache_config.gpu_memory_utilization, - cpu_swap_space=self.cache_config.swap_space_bytes, - cache_dtype=self.cache_config.cache_dtype, - ) - - # Since we use a shared centralized controller, we take the minimum - # number of blocks across all workers to make sure all the memory - # operators can be applied to all workers. - num_gpu_blocks = min(b[0] for b in num_blocks) - num_cpu_blocks = min(b[1] for b in num_blocks) - logger.info(f"# HPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") - - check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, - self.model_config.max_model_len) - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - # Initialize the cache. - self._run_workers("init_cache_engine", cache_config=self.cache_config) - # Warm up the model. This includes capturing the model into CUDA graph - # if enforce_eager is False. - self._run_workers("warm_up_model") - - def execute_model(self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + self._run_workers("load_model", + max_concurrent_workers=self.parallel_config. + max_parallel_loading_workers) + + def execute_model( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: all_outputs = self._run_workers( "execute_model", - driver_kwargs={ - "seq_group_metadata_list": seq_group_metadata_list, - "blocks_to_swap_in": blocks_to_swap_in, - "blocks_to_swap_out": blocks_to_swap_out, - "blocks_to_copy": blocks_to_copy, - }, + driver_kwargs={"execute_model_req": execute_model_req}, use_ray_compiled_dag=USE_RAY_COMPILED_DAG) # Only the driver worker returns the sampling results. - output = all_outputs[0] - return output - - def add_lora(self, lora_request: LoRARequest) -> bool: - assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." - return self._run_workers( - "add_lora", - lora_request=lora_request, - ) - - def remove_lora(self, lora_id: int) -> bool: - assert lora_id > 0, "lora_id must be greater than 0." 
- return self._run_workers( - "remove_lora", - lora_id=lora_id, - ) - - def list_loras(self) -> List[int]: - return self._run_workers("list_loras") + return all_outputs[0] def _run_workers( self, method: str, *args, - driver_args: Optional[List[Any]] = None, + driver_args: Optional[Tuple[Any, ...]] = None, driver_kwargs: Optional[Dict[str, Any]] = None, + all_args: Optional[List[Tuple[Any, ...]]] = None, + all_kwargs: Optional[List[Dict[str, Any]]] = None, + use_dummy_driver: bool = False, max_concurrent_workers: Optional[int] = None, use_ray_compiled_dag: bool = False, **kwargs, ) -> Any: - """Runs the given method on all workers.""" + """Runs the given method on all workers. Can be used in the following + ways: + + - args/kwargs: All workers share the same args/kwargs + - args/kwargs and driver_args/driver_kwargs: Driver worker has + different args + - all_args/all_kwargs: args/kwargs for each worker are specified + individually + """ if max_concurrent_workers: raise NotImplementedError( "max_concurrent_workers is not supported yet.") + if driver_args is None: + driver_args = args if all_args is None else all_args[0] + if driver_kwargs is None: + driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] + + count = len(self.workers) + all_worker_args = repeat(args, count) if all_args is None \ + else islice(all_args, 1, None) + all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ + else islice(all_kwargs, 1, None) + if use_ray_compiled_dag: # Right now, compiled DAG can only accept a single # input. TODO(sang): Fix it. + assert self.forward_dag is not None output_channels = self.forward_dag.execute(1) else: # Start the ray workers first. ray_worker_outputs = [ - worker.execute_method.remote(method, *args, **kwargs) - for worker in self.workers + worker.execute_method.remote(method, *worker_args, + **worker_kwargs) + for (worker, worker_args, worker_kwargs + ) in zip(self.workers, all_worker_args, all_worker_kwargs) ] - if driver_args is None: - driver_args = args - if driver_kwargs is None: - driver_kwargs = kwargs - # Start the driver worker after all the ray workers. - driver_worker_output = getattr(self.driver_worker, - method)(*driver_args, **driver_kwargs) - + if not use_dummy_driver: + driver_worker_output = self.driver_worker.execute_method( + method, *driver_args, **driver_kwargs) + else: + assert self.driver_dummy_worker is not None + driver_worker_output = ray.get( + self.driver_dummy_worker.execute_method.remote( + method, *driver_args, **driver_kwargs)) # Get the results of the ray workers. if self.workers: if use_ray_compiled_dag: @@ -342,8 +250,9 @@ def _compiled_ray_dag(self): # a dummy value for now. It will be fixed soon. with InputNode() as input_data: forward_dag = MultiOutputNode([ - worker.execute_model_compiled_dag_remote.bind(input_data) - for worker in self.workers + worker.execute_model_compiled_dag_remote. + bind( # type: ignore[attr-defined] + input_data) for worker in self.workers ]) return forward_dag.experimental_compile() @@ -365,13 +274,17 @@ def _check_if_any_actor_is_dead(self): f"Dead Workers: {dead_actors}. 
") -class RayHabanaExecutorAsync(RayHabanaExecutor, ExecutorAsyncBase): +class RayHabanaExecutorAsync(RayHabanaExecutor, DistributedGPUExecutorAsync): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.driver_executor = make_async(self.driver_worker.execute_method) async def _run_workers_async( self, method: str, *args, - driver_args: Optional[List[Any]] = None, + driver_args: Optional[Tuple[Any, ...]] = None, driver_kwargs: Optional[Dict[str, Any]] = None, **kwargs, ) -> Any: @@ -383,9 +296,8 @@ async def _run_workers_async( if driver_kwargs is None: driver_kwargs = kwargs - # Run the driver worker asynchronously. - driver_executor = make_async(getattr(self.driver_worker, method)) - coros.append(driver_executor(*driver_args, **driver_kwargs)) + coros.append( + self.driver_executor(method, *driver_args, **driver_kwargs)) # Run the ray workers asynchronously. for worker in self.workers: @@ -393,27 +305,3 @@ async def _run_workers_async( all_outputs = await asyncio.gather(*coros) return all_outputs - - async def execute_model_async( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - ) -> SamplerOutput: - all_outputs = await self._run_workers_async( - "execute_model", - driver_kwargs={ - "seq_group_metadata_list": seq_group_metadata_list, - "blocks_to_swap_in": blocks_to_swap_in, - "blocks_to_swap_out": blocks_to_swap_out, - "blocks_to_copy": blocks_to_copy, - }) - - # Only the driver worker returns the sampling results. - output = all_outputs[0] - return output - - async def check_health_async(self) -> None: - """Raises an error if engine is unhealthy.""" - self._check_if_any_actor_is_dead() diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 605009e8f695c..5e484ff05b2f3 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -50,7 +50,9 @@ def forward( # Get the logits for the next tokens. logits = self._get_logits(hidden_states, embedding, embedding_bias) - if logits is not None: # and sampling_metadata.perform_sampling: FIXME: this is needed for 8xHPU + # NOTE(kzawora): allgather on HPU will cause logits to be not None, + # and we need to guard against applying logits processors on non-driver worker + if logits is not None and sampling_metadata.seq_groups is not None: logits *= self.scale # Apply logits processors (if any). diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index a05eee90648b2..43ccd235c174f 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -293,10 +293,13 @@ def init_worker_distributed_environment( init_method=distributed_init_method, ) - # A small all_reduce for warmup. - torch.distributed.all_reduce(torch.zeros(1).to('hpu')) + # A small all_reduce for warmup & checking conformance. 
+ dummy_tensor_hpu = torch.ones(1).to('hpu') + torch.distributed.all_reduce(dummy_tensor_hpu) + assert dummy_tensor_hpu.item() == parallel_config.world_size ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) + def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len) -> None: if num_gpu_blocks <= 0: From 90dfa92d8e22b2cc6634dbb5df27a6e253b84be1 Mon Sep 17 00:00:00 2001 From: madamczykhabana <110973826+madamczykhabana@users.noreply.github.com> Date: Thu, 9 May 2024 13:14:46 +0200 Subject: [PATCH 005/341] Fix model_output_idx on HPU (#27) --- vllm/model_executor/sampling_metadata.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 78b3e6417366e..e2076018b5609 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -192,6 +192,12 @@ def _prepare_seq_groups( # Total number of prompts from given sequence groups. num_prompts = 0 + # FIXME: On HPU prompts are right-padded. We need to take that into account + # when updating model_output_idx + if is_hpu() and len(seq_lens) > 0: + assert seq_lens == query_lens, 'Prompt chunking is not yet supported on HPU!' + max_seq_len = max(seq_lens) + for i, seq_group_metadata in enumerate(seq_group_metadata_list): seq_ids = list(seq_group_metadata.seq_data.keys()) sampling_params = seq_group_metadata.sampling_params @@ -219,10 +225,12 @@ def _prepare_seq_groups( prompt_logprob_len = (query_len - num_prefill_sample if do_sample else query_len) sample_len = num_prefill_sample if do_sample else 0 + padding_len = 0 if not is_hpu() else max_seq_len - seq_len else: # Decode prompt_logprob_len = 0 sample_len = len(seq_ids) if do_sample else 0 + padding_len = 0 # Update indices to select from the model output. """ @@ -241,6 +249,7 @@ def _prepare_seq_groups( selected_token_indices.extend( range(model_output_idx, model_output_idx + sample_len)) model_output_idx += sample_len + model_output_idx += padding_len # We now find indices for logprob computation and sampling. 
""" From eeef644262f76b4d7af560b88dbbed6946f8c1bd Mon Sep 17 00:00:00 2001 From: madamczykhabana <110973826+madamczykhabana@users.noreply.github.com> Date: Thu, 9 May 2024 13:21:14 +0200 Subject: [PATCH 006/341] Allow block_sizes: 64 and 128 (#28) --- vllm/engine/arg_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b718ed9cf393a..a8dcaef0e5754 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -249,7 +249,7 @@ def add_cli_args( parser.add_argument('--block-size', type=int, default=EngineArgs.block_size, - choices=[8, 16, 32], + choices=[8, 16, 32, 64, 128], help='Token block size for contiguous chunks of ' 'tokens.') From 84a46987f9981f96b9032432412c81b967219b2e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 9 May 2024 15:04:33 +0300 Subject: [PATCH 007/341] add triton to requirements-hpu --- requirements-hpu.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 5176cc23cde47..21666eb116c22 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -3,5 +3,6 @@ # Dependencies for HPU code ray == 2.9.3 +triton pandas tabulate \ No newline at end of file From 972acf3ccb086568b76e49ca76fbe884f3f0fb7e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 9 May 2024 16:09:37 +0300 Subject: [PATCH 008/341] Fix out-of-bound HPUGraph capture issue --- vllm/worker/habana_model_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index e418ccc1d5c62..a8f801d62cc3d 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -496,6 +496,7 @@ def _prepare_decode( use_captured_graph = ( not self.model_config.enforce_eager and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1] + and max_seq_len <= _MAX_SEQ_LENS_TO_CAPTURE[-1] and max_seq_len <= self.max_seq_len_to_capture) if use_captured_graph: graph_batch_size = _get_graph_batch_size(batch_size) From 61b77632897928cac86c691080df4f125824a60d Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 9 May 2024 16:56:10 +0300 Subject: [PATCH 009/341] fix VLLM_HPU_LOG_STEP_GRAPH_COMPILATION --- vllm/executor/habana_executor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index cc035f397aa6d..5c2cc7e958f96 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -96,6 +96,7 @@ def execute_model( log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS', '0') != '0' or log_cpu_fallbacks_all if log_graph_compilation or log_cpu_fallbacks: from habana_frameworks.torch.hpu.metrics import metric_localcontext + seq_group_metadata_list = execute_model_req.seq_group_metadata_list is_prompt = any([seq_group_metadata.is_prompt for seq_group_metadata in seq_group_metadata_list]) max_context_len = max([max([len(v.prompt_token_ids) + len(v.output_token_ids) for v in seq_group_metadata.seq_data.values()]) for seq_group_metadata in seq_group_metadata_list]) # whoa, that's some spicy stuff right here max_num_blocks = ((max_context_len - 1) // self.cache_config.block_size) + 1 From fdf282b9e08560e230565b47de68513a89261050 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 15 May 2024 16:40:02 +0300 Subject: [PATCH 010/341] WA: Disable cumsum in HPU _prepare_prompt --- vllm/worker/habana_model_runner.py | 9 --------- 1 file changed, 9 deletions(-) diff --git 
a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index a8f801d62cc3d..e306ef0ae12cb 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -402,15 +402,6 @@ def _prepare_prompt( dtype=torch.int32, device=self.device) - torch.cumsum(query_lens_tensor, - dim=0, - dtype=subquery_start_loc.dtype, - out=subquery_start_loc[1:]) - - torch.cumsum(seq_lens_tensor, - dim=0, - dtype=seq_start_loc.dtype, - out=seq_start_loc[1:]) attn_metadata = self.attn_backend.make_metadata( is_prompt=True, seq_lens=seq_lens, From ce1670b11156a25db6b992b285aeadc1166df504 Mon Sep 17 00:00:00 2001 From: madamczykhabana <110973826+madamczykhabana@users.noreply.github.com> Date: Tue, 21 May 2024 16:06:26 +0200 Subject: [PATCH 011/341] bs/seq bucketing for prompt and decode (#33) * Bucketing/Warmup WIP * Cleanup * Revert "Fix model_output_idx on HPU (#27)" This reverts commit 90dfa92d8e22b2cc6634dbb5df27a6e253b84be1. * Rework selected_token_indices fix to also work with block_size padding * Simple prompt attention POC * Remove cumsum * MQA/GQA support for simple prompt_attention * Cleanup * Fix typo * Restore profiling runs --- vllm/attention/backends/habana_attn.py | 57 +- vllm/hpu/xops.py | 85 +-- vllm/model_executor/sampling_metadata.py | 9 - vllm/worker/habana_model_runner.py | 818 +++++------------------ vllm/worker/habana_worker.py | 19 +- 5 files changed, 225 insertions(+), 763 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 909c2ad955f25..45fe1989f9bff 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -2,14 +2,13 @@ # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### -import importlib from dataclasses import dataclass from typing import Dict, List, Optional, Tuple, Type import torch +import math import vllm.hpu.xops as xops from vllm.hpu.attn_bias import (AttentionBias, - BlockDiagonalCausalMask, LowerTriangularMaskWithTensorBias) from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, @@ -18,7 +17,6 @@ from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention, HabanaPagedAttentionMetadata) from vllm.logger import init_logger -from vllm.utils import is_hip logger = init_logger(__name__) @@ -119,11 +117,11 @@ def __post_init__(self): class HabanaAttentionImpl(AttentionImpl): """ If the input tensors contain prompt tokens, the layout is as follows: - |<--------------- num_prefill_tokens ----------------->| + |<--------------- num_prefill_tokens ----------------->| |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->| - Otherwise, the layout is as follows: - |<----------------- num_decode_tokens ------------------>| + Otherwise, the layout is as follows: + |<----------------- num_decode_tokens ------------------>| |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->| Generation tokens can contain padding when cuda-graph is used. @@ -196,48 +194,37 @@ def forward( HabanaPagedAttention.write_to_paged_cache(key, value, key_cache, value_cache, attn_metadata.slot_mapping, - attn_metadata.kv_cache_dtype, + attn_metadata.kv_cache_dtype, attn_metadata.prefill_metadata is not None) if prefill_meta := attn_metadata.prefill_metadata: # Prompt run. if kv_cache is None or prefill_meta.block_tables.numel() == 0: - # normal attention. - # block tables are empty if the prompt does not have a cached - # prefix. 
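To make the attention-bias construction added in this hunk easier to follow, here is a small CPU-only sketch of the combined causal and padding mask used for right-padded prompts; the sizes (batch_size = 2, seq_len = 4, real lengths [4, 2]) are illustrative only:

import math
import torch

batch_size, seq_len, seq_lens = 2, 4, [4, 2]       # illustrative values
lens = torch.tensor(seq_lens, dtype=torch.int32)
len_mask = (torch.arange(0, seq_len, dtype=torch.int32)
            .view(1, seq_len)
            .ge(lens.unsqueeze(-1))
            .view(batch_size, 1, 1, seq_len))      # True on padding key positions
causal_mask = torch.triu(
    torch.ones((batch_size, 1, seq_len, seq_len), dtype=torch.bool),
    diagonal=1)                                    # True above the diagonal
mask = causal_mask.logical_or(len_mask)
attn_bias = torch.zeros_like(mask, dtype=torch.float32).masked_fill_(mask, -math.inf)
print(attn_bias[1, 0])  # for the length-2 prompt, key columns 2 and 3 are -inf in every row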
- if self.num_kv_heads != self.num_heads: - # As of Nov 2023, xformers only supports MHA. For MQA/GQA, - # project the key and value tensors to the desired number of - # heads. - # TODO(woosuk): Use MQA/GQA kernels for higher performance. - query = query.view(query.shape[0], self.num_kv_heads, - self.num_queries_per_kv, - query.shape[-1]) - key = key[:, :, - None, :].expand(key.shape[0], self.num_kv_heads, - self.num_queries_per_kv, - key.shape[-1]) - value = value[:, :, - None, :].expand(value.shape[0], - self.num_kv_heads, - self.num_queries_per_kv, - value.shape[-1]) - + # TODO: move this outside of model if prefill_meta.attn_bias is None: if self.alibi_slopes is None: - attn_bias = BlockDiagonalCausalMask.from_seqlens( - [seq_len] * batch_size) + lens = torch.tensor(attn_metadata.prefill_metadata.seq_lens, device=query.device, dtype=torch.int32) + len_mask = (torch.arange(0, seq_len, device=query.device, dtype=torch.int32) + .view(1, seq_len) + .ge(lens.unsqueeze(-1)) + .view(batch_size, 1, 1, seq_len)) + causal_mask = torch.triu( + torch.ones((batch_size, 1, seq_len, seq_len), device=query.device, dtype=torch.bool), + diagonal=1 + ) + mask = causal_mask.logical_or(len_mask) + attn_bias = (torch.zeros_like(mask, dtype=query.dtype) + .masked_fill_(mask, -math.inf)) if self.sliding_window is not None: - attn_bias = attn_bias.make_local_attention( - self.sliding_window) + raise NotImplementedError("Sliding window is not supported on HPU") prefill_meta.attn_bias = attn_bias else: prefill_meta.attn_bias = _make_alibi_bias( self.alibi_slopes, self.num_kv_heads, batch_size, seq_len, query.dtype) - query_shape = (batch_size, seq_len, self.num_kv_heads, self.num_queries_per_kv, self.head_size) if self.num_kv_heads != self.num_heads else (batch_size, seq_len, self.num_heads, self.head_size) - kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, self.num_queries_per_kv, self.head_size) if self.num_kv_heads != self.num_heads else (batch_size, seq_len_kv, self.num_kv_heads, self.head_size) - out = xops.memory_efficient_attention_forward( + query_shape = (batch_size, seq_len, self.num_heads, self.head_size) + kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, self.head_size) + out = xops.prompt_attention( query.view(query_shape), key.view(kv_shape), value.view(kv_shape), diff --git a/vllm/hpu/xops.py b/vllm/hpu/xops.py index c9d237744a917..d6404a4872c0d 100644 --- a/vllm/hpu/xops.py +++ b/vllm/hpu/xops.py @@ -5,62 +5,37 @@ # LICENSE file in the root directory of this source tree. 
############################################################################### -import habana_frameworks.torch as htorch import torch -import torch.nn.functional as F -from typing import List, Optional, Tuple, Union -from .attn_bias import AttentionBias, BlockDiagonalCausalMask +from typing import Optional -try: - from habana_frameworks.torch.hpex.kernels import FusedSDPA -except ImportError: - print("Not using HPU fused scaled dot-product attention kernel.") - FusedSDPA = None +import vllm.hpu.utils -def memory_efficient_attention_forward( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attn_bias: Optional[torch.Tensor] = None, - p: float = 0.0, - scale: Optional[float] = None, -) -> torch.Tensor: - assert attn_bias is not None, "Attention mask is required for prompt processing" - dim = query.dim() - is_causal = isinstance(attn_bias, BlockDiagonalCausalMask) - if FusedSDPA and (is_causal or attn_bias is None): - bs = query.shape[0] - seq_len_q = query.shape[1] - seq_len_kv = key.shape[1] - heads = query.shape[-2] if dim != 5 else query.shape[-3] - attn_groups = 1 if dim != 5 else query.shape[-2] - head_dim = query.shape[-1] - if dim == 4: - # [bs, seq_len, 1, heads, head_dim] -> [bs, heads, seq_len, head_dim] - query = query.reshape(bs, seq_len_q, heads, head_dim).permute(0, 2, 1, 3) - key = key.reshape(bs, seq_len_kv, heads, head_dim).permute(0, 2, 1, 3) - value = value.reshape(bs, seq_len_kv, heads, head_dim).permute(0, 2, 1, 3) - elif dim == 5: - # [bs, seq_len, heads, attn_groups, head_dim] -> [bs, heads, attn_groups, seq_len, head_dim] - query = query.reshape(bs, seq_len_q, heads, attn_groups, head_dim).permute(0, 2, 3, 1, 4) - key = key.reshape(bs, seq_len_kv, heads, attn_groups, head_dim).permute(0, 2, 3, 1, 4) - value = value.reshape(bs, seq_len_kv, heads, attn_groups, head_dim).permute(0, 2, 3, 1, 4) - else: - raise ValueError(f"Unsupported attention dimension: {dim}") - - import habana_frameworks.torch.hpu as ht - with ht.sdp_kernel(enable_recompute=False): # (flash_attention_recompute and q_len == 1)): - out = FusedSDPA.apply( - query, key, value, None, p, is_causal, scale - ) - htorch.core.mark_step() - if dim == 4: - # [bs, heads, seq_len, head_dim] -> [bs, seq_len, heads, head_dim] - out = out.permute(0, 2, 1, 3).reshape(bs, seq_len_q, heads, head_dim) - elif dim == 5: - # [bs, heads, attn_groups, seq_len, head_dim] -> [bs, seq_len, heads, attn_groups, head_dim] - out = out.permute(0, 3, 1, 2, 4).reshape(bs, seq_len_q, heads, attn_groups, head_dim) - else: - raise NotImplementedError(f'Only FusedSDPA causal or non-masked attention is supported.\nFusedSDPA support: {FusedSDPA is not None}\nis_causal: {is_causal}\nmask_present: {attn_bias is not None}') - return out +@vllm.hpu.utils.with_mark_steps +def prompt_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_bias: Optional[torch.Tensor] = None, + p: float = 0.0, + scale: Optional[float] = None, +) -> torch.Tensor: + query = query.transpose(1, 2) + key = key.transpose(1, 2) + value = value.transpose(1, 2) + query_heads = query.size(1) + kv_heads = key.size(1) + if query_heads != kv_heads: + query = query.unflatten(1, (kv_heads, -1)) + key = key.unflatten(1, (kv_heads, 1)) + value = value.unflatten(1, (kv_heads, 1)) + attn_bias = attn_bias.unsqueeze(2) + attn_weights = torch.matmul(query * scale, key.transpose(-1, -2)) + if attn_bias is not None: + attn_weights.add_(attn_bias) + attn_weights = torch.softmax(attn_weights, dim=-1) + attn_weights = torch.matmul(attn_weights, 
value) + if query_heads != kv_heads: + attn_weights = attn_weights.flatten(1, 2) + attn_weights = attn_weights.transpose(1, 2) + return attn_weights diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index e2076018b5609..78b3e6417366e 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -192,12 +192,6 @@ def _prepare_seq_groups( # Total number of prompts from given sequence groups. num_prompts = 0 - # FIXME: On HPU prompts are right-padded. We need to take that into account - # when updating model_output_idx - if is_hpu() and len(seq_lens) > 0: - assert seq_lens == query_lens, 'Prompt chunking is not yet supported on HPU!' - max_seq_len = max(seq_lens) - for i, seq_group_metadata in enumerate(seq_group_metadata_list): seq_ids = list(seq_group_metadata.seq_data.keys()) sampling_params = seq_group_metadata.sampling_params @@ -225,12 +219,10 @@ def _prepare_seq_groups( prompt_logprob_len = (query_len - num_prefill_sample if do_sample else query_len) sample_len = num_prefill_sample if do_sample else 0 - padding_len = 0 if not is_hpu() else max_seq_len - seq_len else: # Decode prompt_logprob_len = 0 sample_len = len(seq_ids) if do_sample else 0 - padding_len = 0 # Update indices to select from the model output. """ @@ -249,7 +241,6 @@ def _prepare_seq_groups( selected_token_indices.extend( range(model_output_idx, model_output_idx + sample_len)) model_output_idx += sample_len - model_output_idx += padding_len # We now find indices for logprob computation and sampling. """ diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index e306ef0ae12cb..995864e3f81e7 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -2,60 +2,77 @@ # Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company ############################################################################### -import contextlib import time from enum import IntEnum -from typing import Dict, List, NamedTuple, Optional, Set, Tuple - -# for logging hpugraph capture -import tqdm -import pandas as pd -import tabulate +from typing import List, NamedTuple, Optional, Set, Tuple, Dict import os -import contextlib import math import itertools -import numpy as np +import operator import torch -import torch.nn as nn import habana_frameworks.torch as htorch -from habana_frameworks.torch.hpu.metrics import metric_localcontext from vllm.attention import (AttentionMetadata, AttentionMetadataPerStage, get_attn_backend) -from vllm.config import (DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, +from vllm.config import (DeviceConfig, LoadConfig, CacheConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) from vllm.distributed import broadcast_tensor_dict -from vllm.distributed.device_communicators import custom_all_reduce from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata from vllm.model_executor.model_loader import get_model -from vllm.sampling_params import SamplingParams, SamplingType +from vllm.sampling_params import SamplingParams from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata -from vllm.utils import (HabanaMemoryProfiler, async_tensor_h2d, - is_pin_memory_available, make_tensor_with_pad, - maybe_expand_dim, pad_to_max_length, format_bytes) +from vllm.utils import (HabanaMemoryProfiler, is_pin_memory_available, + make_tensor_with_pad, format_bytes) logger = init_logger(__name__) -_PAD_SLOT_ID = -1 +_PAD_SLOT_ID = 0 LORA_WARMUP_RANK = 8 -_BATCH_SIZE_ALIGNMENT = 16 -# Capture graphs for token size 1, 2, 4, 8, 16, 32, 48, ..., 512. -# NOTE: _get_graph_batch_size needs to be updated if this list is changed. -_BATCH_SIZES_TO_CAPTURE = [1, 2, 4, 8] + [ - _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33) -] -# Capture graphs for token size 1, 32, 64, 128, 256, 512, 768 ... 
2048 -_MAX_SEQ_LEN_ALIGNMENT = 256 -_MAX_SEQ_LENS_TO_CAPTURE = [1, 32, 64, 128] + [ - _MAX_SEQ_LEN_ALIGNMENT * i for i in range(1, 9) -] + +# Read bucketing configuration from env variables +# phase is either 'prompt' or 'decode' +# dim is either 'bs' or 'seq' +# example env variable: VLLM_DECODE_BS_STEP=128 +def read_bucket_settings(phase: str, dim: str, **defaults: Dict): + params = ['min', 'step', 'max'] + values = [os.environ.get(f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper(), defaults[p]) for p in params] + return values + + +def warmup_buckets(config: Tuple[int, int, int]): + bmin, bstep, bmax = config + base = itertools.repeat(2) + ramp_up = itertools.accumulate(base, func=operator.mul, initial=bmin) + ramp_up = itertools.takewhile(lambda x: x < bstep and x <= bmax, ramp_up) + stable = range(bstep, bmax + 1, bstep) + return list(ramp_up) + list(stable) + + +def next_pow2(value: int): + res = 1 + while value > 1: + value = (value + 1) // 2 + res *= 2 + return res + + +def round_up(value: int, k: int): + return (value + k - 1) // k * k + + +def find_bucket(value: int, config: Tuple[int, int, int]): + bmin, bstep, bmax = config + if value < bstep: + result = min(next_pow2(value), bstep) + else: + result = round_up(value, bstep) + return result class PreparePromptMetadata(NamedTuple): @@ -127,6 +144,7 @@ def __init__( scheduler_config: SchedulerConfig, device_config: DeviceConfig, load_config: LoadConfig, + cache_config: CacheConfig, lora_config: Optional[LoRAConfig], kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, @@ -139,22 +157,16 @@ def __init__( self.load_config = load_config self.is_driver_worker = is_driver_worker - # model_config can be None in tests/samplers/test_sampler.py. - # FIXME(woosuk): This is a hack to make the tests work. Refactor this. self.sliding_window = (model_config.get_sliding_window() if model_config is not None else None) self.device_config = (device_config if device_config is not None else DeviceConfig()) self.device = self.device_config.device - # Set after load_model. - self.lora_manager: LRUCacheWorkerLoRAManager = None - - self.graph_runner_class = HPUGraphRunner - self.graph_runners: Dict[Tuple[int, int], self.graph_runner_class] = {} - - self.max_seq_len_to_capture = (self.model_config.max_seq_len_to_capture - if self.model_config is not None else 0) + self.max_num_seqs = self.scheduler_config.max_num_seqs + self.max_model_len = self.scheduler_config.max_model_len + self.max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens + self.block_size = cache_config.block_size self.pin_memory = is_pin_memory_available() self.kv_cache_dtype = kv_cache_dtype @@ -164,16 +176,11 @@ def __init__( self.model_config.dtype if model_config is not None else None) # Lazy initialization - self.model: torch.nn.Module # Set after load_model - self.block_size: int # Set after initial profiling. - # When using CUDA graph, the input block tables must be padded to - # max_seq_len_to_capture. However, creating the block table in - # Python can be expensive. To optimize this, we cache the block table - # in numpy and only copy the actual input content at every iteration. - # The shape of the cached block table will be - # (max batch size to capture, max context len to capture / block size). - self.graph_block_tables: torch.Tensor # Set after initial profiling. 
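One note on read_bucket_settings() above: the names it looks up are built from f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper(), so the full variable names carry the BUCKET infix, e.g. VLLM_DECODE_BS_BUCKET_STEP=128 for the decode batch-size step. A quick way to list every generated name:

for phase in ('prompt', 'decode'):
    for dim in ('bs', 'seq'):
        for p in ('min', 'step', 'max'):
            print(f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper())
# e.g. VLLM_DECODE_BS_BUCKET_STEP, VLLM_PROMPT_SEQ_BUCKET_MAX, ...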
+ self.lora_manager: LRUCacheWorkerLoRAManager = None + self.model: torch.nn.Module = None + self.excluded_from_warmup = [] + self._setup_buckets() def load_model(self) -> None: with HabanaMemoryProfiler() as m: @@ -207,16 +214,18 @@ def load_model(self) -> None: self.model.embedding_padding_modules) self.model = self.lora_manager.create_lora_manager(self.model) - def set_block_size(self, block_size: int) -> None: - self.block_size = block_size - - self.graph_block_tables = np.zeros( - (max(_BATCH_SIZES_TO_CAPTURE), self.get_max_block_per_batch()), - dtype=np.int32) - - def get_max_block_per_batch(self) -> int: - block_size = self.block_size - return (self.max_seq_len_to_capture + block_size - 1) // block_size + def _setup_buckets(self) -> None: + self.prompt_bs_bucket_cfg = read_bucket_settings('prompt', 'bs', min=1, step=32, max=min(self.max_num_seqs, 64)) + self.decode_bs_bucket_cfg = read_bucket_settings('decode', 'bs', min=1, step=128, max=self.max_num_seqs) + self.prompt_seq_bucket_cfg = read_bucket_settings('prompt', 'seq', min=self.block_size, step=self.block_size, max=1024) + self.decode_seq_bucket_cfg = read_bucket_settings('decode', 'seq', min=self.block_size, step=self.block_size, max=2048) + logger.info(f"Prompt bucket config (min, step, max_warmup) bs:{self.prompt_bs_bucket_cfg}, seq:{self.prompt_seq_bucket_cfg}") + logger.info(f"Decode bucket config (min, step, max_warmup) bs:{self.decode_bs_bucket_cfg}, seq:{self.decode_seq_bucket_cfg}") + + # FIXME: exclude from warmup as it causes OOM on llama-70b + self.excluded_from_warmup = [ + (64, 1024, True) + ] def _prepare_prompt( self, @@ -285,8 +294,6 @@ def _prepare_prompt( # actual prompt lens context_lens.append(context_len) - if context_len != 0: - import pdb; pdb.set_trace() # what happens if we hit that path?? query_lens.append(seq_len - context_len) input_tokens.append(prompt_tokens) @@ -357,34 +364,31 @@ def _prepare_prompt( multi_modal_input = None max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) - max_prompt_len = max(seq_lens) + max_prompt_len = max(find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), self.block_size) + input_tokens = make_tensor_with_pad(input_tokens, - max_prompt_len, - pad=0, - dtype=torch.long, - device=self.device) - + max_prompt_len, + pad=0, + dtype=torch.long, + device=self.device) + input_positions = make_tensor_with_pad(input_positions, - max_prompt_len, - pad=0, - dtype=torch.long, - device=self.device) - + max_prompt_len, + pad=0, + dtype=torch.long, + device=self.device) + slot_mapping = make_tensor_with_pad(slot_mapping, - max_prompt_len, - pad=_PAD_SLOT_ID, - dtype=torch.long, - device=self.device) + max_prompt_len, + pad=_PAD_SLOT_ID, + dtype=torch.long, + device=self.device) - # Prepare prefix block tables - max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) - block_tables = make_tensor_with_pad( - prefix_block_tables, - max_len=max_prompt_block_table_len, - pad=0, - dtype=torch.int, - device=self.device, - ) + block_tables = make_tensor_with_pad(prefix_block_tables, + max_len=max_prompt_block_table_len, + pad=0, + dtype=torch.int, + device=self.device) # Query length can be shorter than key (i.e., prompt) when prefill # is chunked or prefix cached. 
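To illustrate the bucketing behaviour configured in _setup_buckets() above, here is what the helper functions return for the default prompt batch-size config (min=1, step=32, max=64, assuming max_num_seqs >= 64); this sketch only calls the functions defined earlier in this file:

cfg = (1, 32, 64)            # (min, step, max), the prompt bs defaults above
print(warmup_buckets(cfg))   # [1, 2, 4, 8, 16, 32, 64]: powers of two up to the step, then multiples of the step
print(find_bucket(3, cfg))   # 4  -- values below the step are padded to the next power of two
print(find_bucket(33, cfg))  # 64 -- values at or above the step are rounded up to a multiple of it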
@@ -394,7 +398,6 @@ def _prepare_prompt( subquery_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1, dtype=torch.int32, device=self.device) - seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.long, device=self.device) @@ -426,6 +429,7 @@ def _prepare_prompt( multi_modal_input=multi_modal_input, slot_mapping=slot_mapping, ) + def _prepare_decode( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -479,28 +483,7 @@ def _prepare_decode( block_table = block_table[-sliding_window_blocks:] block_tables.append(block_table) - # vLLM uses cuda graph only for decoding requests. - # See `capture_model` API for more details. - # For decoding requests, batch_size == input_tokens. - batch_size = len(input_tokens) max_seq_len = max(seq_lens) - use_captured_graph = ( - not self.model_config.enforce_eager - and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1] - and max_seq_len <= _MAX_SEQ_LENS_TO_CAPTURE[-1] - and max_seq_len <= self.max_seq_len_to_capture) - if use_captured_graph: - graph_batch_size = _get_graph_batch_size(batch_size) - assert graph_batch_size >= batch_size - for _ in range(graph_batch_size - batch_size): - input_tokens.append([0]) - input_positions.append([0]) - slot_mapping.append([_PAD_SLOT_ID]) - seq_lens.append(1) - block_tables.append([]) - lora_index_mapping.append(0) - batch_size = graph_batch_size - input_tokens = torch.tensor(input_tokens, dtype=torch.long, device=self.device) @@ -514,33 +497,15 @@ def _prepare_decode( dtype=torch.int, device=self.device) - if use_captured_graph: - # When using cuda-graph all these tensors should be - # padded. - assert seq_lens_tensor.shape[0] == len(input_tokens) - assert seq_lens_tensor.shape[0] == len(input_positions) - assert seq_lens_tensor.shape[0] == len(slot_mapping) - - # The shape of graph_block_tables is - # [max batch size, max context len // block size]. - graph_max_seq_len = _get_graph_max_seq_len(max_seq_len) - assert graph_max_seq_len >= max_seq_len - graph_block_count = math.ceil(graph_max_seq_len / self.block_size) - input_block_tables = self.graph_block_tables[:batch_size, :graph_block_count] - for i, block_table in enumerate(block_tables): - if block_table: - input_block_tables[i, :len(block_table)] = block_table - block_tables = torch.tensor(input_block_tables, device=self.device) - else: - max_block_table_len = max( - len(block_table) for block_table in block_tables) - block_tables = make_tensor_with_pad( - block_tables, - max_len=max_block_table_len, - pad=0, - dtype=torch.int, - device=self.device, - ) + max_block_table_len = max( + len(block_table) for block_table in block_tables) + block_tables = make_tensor_with_pad( + block_tables, + max_len=max_block_table_len, + pad=0, + dtype=torch.int, + device=self.device, + ) attn_metadata = self.attn_backend.make_metadata( is_prompt=False, seq_lens=None, @@ -551,7 +516,7 @@ def _prepare_decode( seq_start_loc=None, context_lens_tensor=None, block_tables=block_tables, - use_cuda_graph=use_captured_graph, + use_cuda_graph=False, ) return PrepareDecodeMetadata( input_tokens=input_tokens, @@ -563,7 +528,6 @@ def _prepare_decode( slot_mapping=slot_mapping, ) - def prepare_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -611,7 +575,7 @@ def prepare_input_tensors( num_prefill_tokens = len(input_tokens) num_decode_tokens = len(decode_input_tokens) - # NOTE(kzawora): Here we diverge from GPU code - we don't support mixed batches, so we either use decode or prefill inputs, without coalescing. 
+ # NOTE(kzawora): Here we diverge from GPU code - we don't support mixed batches, so we either use decode or prefill inputs, without coalescing. assert (num_prefills == 0 and num_decode_tokens > 0) or (num_prefills > 0 and num_decode_tokens == 0), "HPU does not support mixed batches!" if num_decode_tokens > 0: input_tokens = decode_input_tokens @@ -621,6 +585,14 @@ def prepare_input_tensors( lora_prompt_mapping = decode_lora_prompt_mapping lora_requests = decode_lora_requests + # FIXME: We need to adjust selected_token_indices to accomodate for padding + max_len = input_tokens.size(1) + paddings = [max_len - s for s in seq_lens] + paddings = [0] + paddings[:-1] + paddings = list(itertools.accumulate(paddings)) + paddings = torch.tensor(paddings, dtype=sampling_metadata.selected_token_indices.dtype, device=sampling_metadata.selected_token_indices.device) + sampling_metadata.selected_token_indices.add_(paddings) + if self.lora_config: lora_mapping = LoRAMapping( lora_index_mapping, @@ -629,9 +601,6 @@ def prepare_input_tensors( else: lora_mapping = None - # Broadcast the metadata. - # If batch contains both prefill and decode, it sends 2 broadcasts. - # If it only contains 1 type, it triggers a single broadcast. if (prefill_attn_metadata is not None and decode_attn_metadata is not None): batch_type = BatchType.MIXED @@ -721,13 +690,19 @@ def prepare_input_tensors( sampling_metadata, lora_requests, lora_mapping, multi_modal_input) - @torch.inference_mode() def execute_model( self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: + if self.is_driver_worker: + is_prompt = seq_group_metadata_list[0].is_prompt + real_batch_size = len(seq_group_metadata_list) + bucket_cfg = self.prompt_bs_bucket_cfg if is_prompt else self.decode_bs_bucket_cfg + batch_size_padding = find_bucket(real_batch_size, bucket_cfg) - real_batch_size + seq_group_metadata_list = seq_group_metadata_list.copy() + seq_group_metadata_list.extend(seq_group_metadata_list[0] for _ in range(batch_size_padding)) (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping, multi_modal_input ) = self.prepare_input_tensors(seq_group_metadata_list) @@ -735,17 +710,6 @@ def execute_model( if self.lora_config: self.set_active_loras(lora_requests, lora_mapping) - # Currently HPU graph is only supported by the decode phase. - prefill_meta = attn_metadata.prefill_metadata - decode_meta = attn_metadata.decode_metadata - if prefill_meta is None and decode_meta.use_cuda_graph: - graph_batch_size = input_tokens.shape[0] - graph_block_count = decode_meta.block_tables.shape[1] - graph_runner_key = (graph_batch_size, graph_block_count) - model_executable = self.graph_runners[graph_runner_key] - logger.info(f"Executing {self.graph_runner_class.__name__} with batch {graph_batch_size}, block_count {graph_block_count} (context_len up to {graph_block_count*self.block_size}, currently {torch.max(decode_meta.seq_lens_tensor).item()})") - else: - model_executable = self.model execute_model_kwargs = { "input_ids": input_tokens, "positions": input_positions, @@ -754,11 +718,14 @@ def execute_model( } if self.vision_language_config: execute_model_kwargs.update({"image_input": multi_modal_input}) - hidden_states = model_executable(**execute_model_kwargs) + + htorch.core.mark_step() + hidden_states = self.model(**execute_model_kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) # Compute the logits. 
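# --- Illustrative sketch (not part of the patch): the selected_token_indices
# --- adjustment above, on a toy prompt batch. Two prompts of lengths 5 and 3
# --- are padded to a bucket of max_len=8; the last-token indices computed for
# --- the unpadded layout ([4, 7]) must be shifted by the cumulative padding of
# --- the preceding rows to stay valid in the padded, flattened layout.
import itertools
import torch

seq_lens = [5, 3]
max_len = 8
selected_token_indices = torch.tensor([4, 7])    # last token of each prompt, unpadded

paddings = [max_len - s for s in seq_lens]       # [3, 5]
paddings = [0] + paddings[:-1]                   # padding inserted *before* each row: [0, 3]
paddings = list(itertools.accumulate(paddings))  # cumulative shift per row: [0, 3]
selected_token_indices += torch.tensor(paddings)

# After padding, row i occupies [i * max_len, (i + 1) * max_len), so the last
# real tokens now live at indices 4 and 10.
assert selected_token_indices.tolist() == [4, 10]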
logits = self.model.compute_logits(hidden_states, sampling_metadata) + htorch.core.mark_step() # Only perform sampling in the driver worker. if not self.is_driver_worker: @@ -769,520 +736,63 @@ def execute_model( logits=logits, sampling_metadata=sampling_metadata, ) - + output.outputs = output.outputs[:real_batch_size] + htorch.core.mark_step() return output - @torch.inference_mode() - def profile_run(self) -> None: - # Enable top-k sampling to reflect the accurate memory usage. - sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) - max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens - max_num_seqs = self.scheduler_config.max_num_seqs - - # This represents the maximum number of different requests - # that will have unique loras, an therefore the max amount of memory - # consumption create dummy lora request copies from the lora request - # passed in, which contains a lora from the lora warmup path. - dummy_lora_requests = [] - dummy_lora_requests_per_seq = [] - if self.lora_config: - for idx in range(self.lora_config.max_loras): - lora_id = idx + 1 - dummy_lora_request = LoRARequest( - lora_name=f"warmup_{lora_id}", - lora_int_id=lora_id, - lora_local_path="/not/a/real/path", - ) - self.lora_manager.add_dummy_lora(dummy_lora_request, - rank=LORA_WARMUP_RANK) - dummy_lora_requests.append(dummy_lora_request) - dummy_lora_requests_per_seq = [ - dummy_lora_requests[idx % len(dummy_lora_requests)] - for idx in range(max_num_seqs) - ] - - # Profile memory usage with max_num_sequences sequences and the total - # number of tokens equal to max_num_batched_tokens. - seqs: List[SequenceGroupMetadata] = [] - # Additional GPU memory may be needed for vision encoding, which needs - # to be accounted for when calculating the GPU blocks for - # vLLM blocker manager. - # To exercise the worst scenario for GPU memory consumption, - # the number of seqs (batch_size) is chosen to maximize the number - # of images processed. - if self.vision_language_config: - max_num_seqs = min( - max_num_seqs, - int(max_num_batched_tokens / - self.vision_language_config.image_feature_size)) - for group_id in range(max_num_seqs): - seq_len = (max_num_batched_tokens // max_num_seqs + - (group_id < max_num_batched_tokens % max_num_seqs)) - seq_data = SequenceData([0] * seq_len) - seq = SequenceGroupMetadata( - request_id=str(group_id), - is_prompt=True, - seq_data={group_id: seq_data}, - sampling_params=sampling_params, - block_tables=None, - lora_request=dummy_lora_requests_per_seq[group_id] - if dummy_lora_requests_per_seq else None, - ) - seqs.append(seq) + def create_dummy_seq_group_metadata(self, group_id, seq_len, is_prompt): + sampling_params = SamplingParams(temperature=0) + num_blocks = math.ceil(seq_len / self.block_size) + if is_prompt: + input_len = seq_len + output_len = 0 + block_tables = None + else: + input_len = seq_len - 1 + output_len = 1 + block_tables = {group_id: [0] * num_blocks} + prompt_token_ids = [0] * input_len + output_token_ids = [1] * output_len + seq_data = SequenceData(prompt_token_ids) + seq_data.output_token_ids = output_token_ids + return SequenceGroupMetadata( + request_id=str(group_id), + is_prompt=(output_len == 0), + seq_data={group_id: seq_data}, + sampling_params=sampling_params, + block_tables=block_tables, + ) - # Run the model with the dummy inputs. 
+ def profile_run(self) -> None: num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers - self.execute_model(seqs, kv_caches) + seq_len = self.max_model_len // self.max_num_seqs + self.warmup_scenario(self.max_num_seqs, seq_len, True, kv_caches) + + def warmup_scenario(self, batch_size, seq_len, is_prompt, kv_caches) -> None: + seqs = [self.create_dummy_seq_group_metadata(i, seq_len, is_prompt) for i in range(batch_size)] + _ = self.execute_model(seqs, kv_caches) torch.hpu.synchronize() - return - - def remove_all_loras(self): - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - self.lora_manager.remove_all_loras() - - def set_active_loras(self, lora_requests: Set[LoRARequest], - lora_mapping: LoRAMapping) -> None: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - self.lora_manager.set_active_loras(lora_requests, lora_mapping) - - def add_lora(self, lora_request: LoRARequest) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.remove_lora(lora_id) - - def list_loras(self) -> Set[int]: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.list_loras() @torch.inference_mode() - def capture_model(self, kv_caches: List[torch.Tensor]) -> None: - """Cuda graph capture a model. - - Note that CUDA graph's performance gain is negligible if number - of batched tokens are larger than 200. And since CUDA graph - requires fixed sized tensors, supporting large/variable batch - size requires high GPU memory overhead. Thus, vLLM only captures - decoding requests. Mixed batch (chunked prefill + decoding) or - prefill requests are not captured. - - Since it is used for decoding-only, it assumes there's only 1 token - per sequence in the batch. - """ - # NOTE(woosuk): This is a hack to ensure that the NCCL backend is never - # deleted before the CUDA graphs. - - assert not self.model_config.enforce_eager - logger.info("Capturing the model for HPUGraphs. This may lead to " - "unexpected consequences if the model is not static. To " - "run the model in eager mode, set 'enforce_eager=True' or " - "use '--enforce-eager' in the CLI.") - logger.info("HPUGraphs can take additional ~10 GiB memory per HPU. " - "If you are running out of memory, consider decreasing " - "`gpu_memory_utilization` or enforcing eager mode. " - "You can also reduce the `max_num_seqs` as needed " - "to decrease memory usage.") + def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: + times = 1 # TODO: this is will be updated once HPU graphs are reintroduced + scenarios = [] + scenarios.extend(itertools.product(warmup_buckets(self.decode_bs_bucket_cfg), warmup_buckets(self.decode_seq_bucket_cfg), [False])) + scenarios.extend(itertools.product(warmup_buckets(self.prompt_bs_bucket_cfg), warmup_buckets(self.prompt_seq_bucket_cfg), [True])) + scenarios = [scenario for scenario in reversed(scenarios) for _ in range(times) if scenario not in self.excluded_from_warmup] + + start_mem = HabanaMemoryProfiler.current_memory_usage() start_time = time.perf_counter() - - # Prepare dummy inputs. These will be reused for all batch sizes. 
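# --- Illustrative sketch (not part of the patch): how warmup_model() above
# --- expands bucket lists into (batch_size, seq_len, is_prompt) scenarios.
# --- The bucket lists and the excluded tuple are toy values, not the runtime
# --- defaults (the patch itself excludes (64, 1024, True)).
import itertools

times = 1
decode_bs_buckets, decode_seq_buckets = [1, 2, 4], [128, 256]
prompt_bs_buckets, prompt_seq_buckets = [1, 2], [128]
excluded_from_warmup = [(2, 128, True)]

scenarios = []
scenarios.extend(itertools.product(decode_bs_buckets, decode_seq_buckets, [False]))
scenarios.extend(itertools.product(prompt_bs_buckets, prompt_seq_buckets, [True]))
# reversed() visits the scenarios appended last (prompts) first and, within each
# phase, the larger shapes first; excluded combinations are filtered out.
scenarios = [s for s in reversed(scenarios) for _ in range(times)
             if s not in excluded_from_warmup]

assert scenarios[0] == (1, 128, True)
assert (2, 128, True) not in scenarios
assert len(scenarios) == 7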
- max_batch_size = max(_BATCH_SIZES_TO_CAPTURE) - input_tokens = torch.zeros(max_batch_size, 1, dtype=torch.long).to('hpu') - input_positions = torch.zeros(max_batch_size, 1, dtype=torch.long).to('hpu') - slot_mapping = torch.zeros(max_batch_size, 1, dtype=torch.long).to('hpu') # TODO(kzawora): when using torch.empty, following occurs: RuntimeError: Error when trying to cast Long to Int, Input values range [0, 139632108750000] exceeds Int range [-2147483648, 2147483647] - slot_mapping.fill_(_PAD_SLOT_ID) - context_lens = torch.ones(max_batch_size, dtype=torch.int32).to('hpu') - block_tables = torch.from_numpy(self.graph_block_tables).to('hpu') - - graph_batch_size = _get_graph_batch_size( - self.scheduler_config.max_num_seqs) - batch_size_capture_list = [ - bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size - ] - - # NOTE(woosuk): There are 3 backends for all-reduce: custom all-reduce - # kernel, CuPy NCCL, and PyTorch NCCL. When using CUDA graph, we use - # either custom all-reduce kernel or CuPy NCCL. When not using CUDA - # graph, we use either custom all-reduce kernel or PyTorch NCCL. - # We always prioritize using custom all-reduce kernel but fall back - # to PyTorch or CuPy NCCL if it is disabled or not supported. - with custom_all_reduce.capture(): - # NOTE: Capturing the largest batch size first may help reduce the - # memory usage of CUDA graph. - valid_combinations = [] - total_combinations = len(_BATCH_SIZES_TO_CAPTURE)*len(_MAX_SEQ_LENS_TO_CAPTURE) - import pandas as pd - df = pd.DataFrame(index=_BATCH_SIZES_TO_CAPTURE, columns=_MAX_SEQ_LENS_TO_CAPTURE) - for idx, (batch_size, max_seq_len) in enumerate(itertools.product(reversed(_BATCH_SIZES_TO_CAPTURE), reversed(_MAX_SEQ_LENS_TO_CAPTURE))): - block_count = math.ceil(max_seq_len / self.block_size) - # Skip capture of "out-of-bound" batch sizes and context lengths - if batch_size > self.scheduler_config.max_num_seqs: - logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}. Reason: Batch out of bound.") - df[max_seq_len][batch_size] = 'batch OoB' - continue - if max_seq_len > self.max_seq_len_to_capture: - logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}. Reason: Nax context length out of bound.") - df[max_seq_len][batch_size] = 'ctx OoB' - continue - block_count = math.ceil(max_seq_len / self.block_size) - captured_block_counts = [math.ceil(cl / self.block_size) for (n, cl) in valid_combinations if n == batch_size] - if block_count in captured_block_counts: - logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}. Reason: Block size already captured.") - df[max_seq_len][batch_size] = 'redundant' - continue - logger.debug(f"[{idx}/{total_combinations}] Will capture for batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}. Constraints met.") - df[max_seq_len][batch_size] = 'VALID' - valid_combinations.append((batch_size, max_seq_len)) - - total_valid_hpugraphs = len(valid_combinations) - logger.info(f"Starting capture {total_valid_hpugraphs} valid HPUGraphs. 
Skipping capture of {total_combinations-total_valid_hpugraphs}/{total_combinations} graphs due to batch/context constraints.") - logger.debug(f"Capture summary (row: batch_size; col: max_seq_len):") - logger.debug(tabulate.tabulate(df, tablefmt='mixed_outline', headers='keys', showindex="always")) - - graph_runner_name = self.graph_runner_class.__name__ - graph_mem_usage_df = pd.DataFrame(index=list(reversed(sorted({b for b,c in valid_combinations}))), columns=list(reversed(sorted({c for b,c in valid_combinations})))) - pbar = tqdm.tqdm(valid_combinations) - start_mem = HabanaMemoryProfiler.current_memory_usage() - log_graph_compilation_all = os.environ.get('VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0' - log_graph_compilation = os.environ.get('VLLM_HPU_LOG_STEP_GRAPH_COMPILATION', '0') != '0' or log_graph_compilation_all - - for idx, (batch_size, max_seq_len) in enumerate(pbar): - block_count = math.ceil(max_seq_len / self.block_size) - # Create dummy attn_metadata. - decode_metadata = self.attn_backend.make_metadata( - is_prompt=False, - seq_lens=None, - seq_lens_tensor=context_lens[:batch_size], - max_query_len=None, - max_seq_len=block_count*self.block_size, - subquery_start_loc=None, - seq_start_loc=None, - context_lens_tensor=None, # NOTE(kzawora): this seems sus, shoudn't we have seq_lens tensor here? - block_tables=block_tables[:batch_size, :block_count], - use_cuda_graph=True, - ) - attn_metadata = AttentionMetadata( - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=batch_size, - slot_mapping=slot_mapping[:batch_size], - prefill_metadata=None, - decode_metadata=decode_metadata, - kv_cache_dtype=self.kv_cache_dtype, - ) - - if self.lora_config: - lora_mapping = LoRAMapping( - [0] * batch_size, - [0] * batch_size, - ) - self.set_active_loras(set(), lora_mapping) - graph_runner = self.graph_runner_class(self.model) - local_start_mem = HabanaMemoryProfiler.current_memory_usage() - capture_start = time.time() - desc = f'Capturing {graph_runner_name} for batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}, allocated {format_bytes(local_start_mem - start_mem)} device memory in total ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)' - pbar.set_description(desc) - logger.debug(f"[{idx}/{total_valid_hpugraphs}] {desc}...") - profiling_ctx = contextlib.nullcontext() if not (log_graph_compilation_all or log_graph_compilation) else metric_localcontext("graph_compilation") - with profiling_ctx as gc_local_metric: - graph_runner.capture( - input_tokens[:batch_size], - input_positions[:batch_size], - kv_caches, - attn_metadata, - ) - if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0) or log_graph_compilation_all: - logger.info(f"VLLM_HPU_STEP_GRAPH_COMPILATION: {gc_local_metric.stats()}, {graph_runner_name}; batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}") - self.graph_runners[(batch_size, block_count)] = graph_runner - capture_end = time.time() - local_end_mem = HabanaMemoryProfiler.current_memory_usage() - mem_usage_str = format_bytes(local_end_mem - local_start_mem) - graph_mem_usage_df[max_seq_len][batch_size] = mem_usage_str - logger.debug(f"[{idx}/{total_valid_hpugraphs}] {desc}... done in {capture_end-capture_start:.2f} seconds! 
Took {mem_usage_str} of device memory ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") - + for i, (batch_size, seq_len, is_prompt) in enumerate(scenarios): + mem_usage = 100.0 * HabanaMemoryProfiler.current_memory_usage() / HabanaMemoryProfiler.total_memory() + logger.info(f"[Warmup][{i+1}/{len(scenarios)}] batch_size:{batch_size} seq_len:{seq_len} is_prompt:{is_prompt} mem_usage:{mem_usage:0.1f}%") + self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) end_time = time.perf_counter() - elapsed_time = end_time - start_time - # This usually takes < 10 seconds. end_mem = HabanaMemoryProfiler.current_memory_usage() - logger.info(f"Graph capturing finished in {elapsed_time:.0f} secs, allocated {format_bytes(end_mem - start_mem)} of device memory ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") - logger.info(f"Graph memory allocation summary (row: batch_size; col: max_seq_len):") - logger.info(tabulate.tabulate(graph_mem_usage_df, tablefmt='mixed_outline', headers='keys', showindex="always")) - - def __del__(self) -> None: - # Delete the CUDA graphs before deleting the CuPy NCCL communicator. - # NOTE(woosuk): This is necessary because otherwise deadlocks can - # happen. - # FIXME(woosuk): This is a bit hacky. Find a more robust solution. - self.graph_runners.clear() + elapsed_time = end_time - start_time + logger.info(f"Warmup finished in {elapsed_time:.0f} secs, allocated {format_bytes(end_mem - start_mem)} of device memory") @property def vocab_size(self) -> int: return self.model_config.get_vocab_size() - - -class FakeHPUGraphRunner: - - def __init__(self, model: nn.Module): - self.model = model - - def capture( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - ) -> None: - return - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - ) -> torch.Tensor: - return self.model( - input_ids, - positions, - kv_caches, - attn_metadata, - ) - - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) - -class FakeHPUGraphRunnerWithWarmup: - - def __init__(self, model: nn.Module): - self.model = model - - def capture( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - ) -> None: - htorch.core.mark_step() - out = self.model( - input_ids, - positions, - kv_caches, - attn_metadata, - ) - htorch.core.mark_step() - htorch.hpu.synchronize() - return - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - ) -> torch.Tensor: - htorch.core.mark_step() - out = self.model( - input_ids, - positions, - kv_caches, - attn_metadata, - ) - htorch.core.mark_step() - return out - - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) -class HPUGraphRunner: - - def __init__(self, model: nn.Module): - self.model = model - self.graph = None - self.input_buffers: Dict[str, torch.Tensor] = {} - self.output_buffers: Dict[str, torch.Tensor] = {} - - def capture( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - ) -> None: - assert self.graph is None - # Run the model once without capturing the graph. 
- # This is to make sure that the captured graph does not include the - # kernel launches for initial benchmarking (e.g., Triton autotune). - self.model( - input_ids, - positions, - kv_caches, - attn_metadata, - ) - htorch.hpu.synchronize() - - # Capture the graph. - # NOTE(woosuk): Python 3.8 does not support multi-line with statements. - # https://stackoverflow.com/questions/31039022/python-multi-line-with-statement - self.graph = htorch.hpu.HPUGraph() - with htorch.hpu.graph(self.graph): # noqa: SIM117 - hidden_states = self.model( - input_ids, - positions, - kv_caches, - attn_metadata, - ) - torch.hpu.synchronize() - - # Save the input and output buffers. - self.input_buffers = { - "input_ids": input_ids, - "positions": positions, - "kv_caches": kv_caches, - "slot_mapping": attn_metadata.slot_mapping, - "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor, - "block_tables": attn_metadata.decode_metadata.block_tables, - } - self.output_buffers = {"hidden_states": hidden_states} - return - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - **kwargs, - ) -> torch.Tensor: - # KV caches are fixed tensors, so we don't need to copy them. - del kv_caches - - # Copy the input tensors to the input buffers. - self.input_buffers["input_ids"].copy_(input_ids, non_blocking=True) - self.input_buffers["positions"].copy_(positions, non_blocking=True) - self.input_buffers["slot_mapping"].copy_(attn_metadata.slot_mapping, - non_blocking=True) - self.input_buffers["seq_lens_tensor"].copy_( - attn_metadata.decode_metadata.seq_lens_tensor, non_blocking=True) - self.input_buffers["block_tables"].copy_( - attn_metadata.decode_metadata.block_tables, non_blocking=True) - # Run the graph. - self.graph.replay() - - # Return the output tensor. 
- return self.output_buffers["hidden_states"] - - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) - -class ExperimentalHPUGraphRunner: - def __init__(self, model: nn.Module): - self.model = model - - def capture( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - ) -> None: - class ModelWrapper(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - self.attn_backend = get_attn_backend(torch.bfloat16) - def forward(self, input_ids, positions, kv_caches, slot_mapping, context_lens, block_tables): - wrapper_attn_metadata = self.attn_backend.make_metadata( - is_prompt=attn_metadata.is_prompt, - seq_lens=None, - seq_lens_tensor=None, - num_prefill_tokens=0, - num_generation_tokens=attn_metadata.num_generation_tokens, - max_subquery_len=None, - max_seq_len=attn_metadata.max_seq_len, - max_prompt_len=None, - subquery_start_loc=None, - seq_start_loc=None, - context_lens=context_lens, - block_tables=block_tables, - use_cuda_graph=True, - kv_cache_dtype=attn_metadata.kv_cache_dtype, - ) - return self.model( - input_ids, - positions, - kv_caches, - wrapper_attn_metadata - ) - self.graph_model = htorch.hpu.wrap_in_hpu_graph(ModelWrapper(self.model)) - out = self.graph_model( - input_ids, - positions, - kv_caches, - attn_metadata.slot_mapping, - attn_metadata.context_lens, - attn_metadata.block_tables, - ) - htorch.hpu.synchronize() - return - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - ) -> torch.Tensor: - out = self.graph_model( - input_ids, - positions, - kv_caches, - attn_metadata.slot_mapping, - attn_metadata.context_lens, - attn_metadata.block_tables, - ) - return out - - - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) - - -def _get_graph_batch_size(batch_size: int) -> int: - """Returns the padded batch size given actual batch size. - - Batch sizes are 1, 2, 4, _BATCH_SIZE_ALIGNMENT, - 2*_BATCH_SIZE_ALIGNMENT, 3*_BATCH_SIZE_ALIGNMENT... - """ - if batch_size <= 2: - return batch_size - elif batch_size <= 4: - return 4 - elif batch_size <= 8: - return 8 - else: - return ((batch_size + _BATCH_SIZE_ALIGNMENT - 1) // - _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT) - - -def _get_graph_max_seq_len(max_seq_len: int) -> int: - """Returns the padded batch size given actual batch size. - - Batch sizes are 1, 2, 4, _BATCH_SIZE_ALIGNMENT, - 2*_BATCH_SIZE_ALIGNMENT, 3*_BATCH_SIZE_ALIGNMENT... 
- """ - if max_seq_len <= 32: - return 32 - elif max_seq_len <= 64: - return 64 - elif max_seq_len <= 128: - return 128 - else: - return ((max_seq_len + _MAX_SEQ_LEN_ALIGNMENT - 1) // - _MAX_SEQ_LEN_ALIGNMENT * _MAX_SEQ_LEN_ALIGNMENT) diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 43ccd235c174f..eeba9e5c4adba 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -73,13 +73,14 @@ def __init__( assert False, "To be tested: vision language model on HPU" self.model_runner = HabanaModelRunner(model_config, - parallel_config, - scheduler_config, - device_config, - load_config=load_config, - lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=is_driver_worker) + parallel_config, + scheduler_config, + device_config, + load_config=load_config, + cache_config=cache_config, + lora_config=self.lora_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by # initialize_cache. self.cache_engine: CacheEngine @@ -168,12 +169,10 @@ def _init_cache_engine(self) -> None: self.cache_engine = CacheEngine(self.cache_config, self.model_config, self.parallel_config) self.hpu_cache = self.cache_engine.gpu_cache - self.model_runner.set_block_size(self.cache_engine.block_size) htorch.hpu.synchronize() # we want to materialize cache tensors before we proceed with graph capture/execution def _warm_up_model(self) -> None: - if not self.model_config.enforce_eager: - self.model_runner.capture_model(self.hpu_cache) + self.model_runner.warmup_model(self.hpu_cache) # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. set_random_seed(self.model_config.seed) From 14d294d885296f44e2bef3fa7b1b512654fd69a9 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 22 May 2024 10:43:47 +0200 Subject: [PATCH 012/341] Cleanup: Fix HPU auto-detection in setup.py (#34) * Fix HPU auto-detection in setup.py * Update setup.py --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index f0364cc7a5893..49e20aac0068a 100644 --- a/setup.py +++ b/setup.py @@ -203,12 +203,11 @@ def build_extensions(self) -> None: def _is_hpu() -> bool: - return True is_hpu_available = True try: subprocess.run(["hl-smi"], capture_output=True, check=True) except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): - if not os.path.exists('/dev/hl0') and not os.path.exists('/dev/hl_controlD0'): + if not os.path.exists('/dev/accel/accel0') and not os.path.exists('/dev/accel/accel_controlD0'): is_hpu_available = False return is_hpu_available From f6fb119ca85ddbd280b0c35527cf378a503b1c00 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 22 May 2024 10:43:56 +0200 Subject: [PATCH 013/341] Restore int64 sampling (#35) --- vllm/model_executor/sampling_metadata.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 78b3e6417366e..9969c45963e9a 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -8,7 +8,7 @@ from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import SequenceData, SequenceGroupMetadata from vllm.utils import (async_tensor_h2d, is_pin_memory_available, - maybe_expand_dim, is_hpu) + maybe_expand_dim) _SAMPLING_EPS = 1e-5 _SEED_0_REPLACEMENT = 3403598558 
@@ -501,19 +501,19 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], sample_indices_t = torch.tensor( sample_indices, device="cpu", - dtype=torch.int, + dtype=torch.long, pin_memory=pin_memory, ) prompt_tensor = torch.tensor( prompt_padded_tokens, device="cpu", - dtype=torch.int, + dtype=torch.long, pin_memory=pin_memory, ) output_tensor = torch.tensor( output_padded_tokens, device="cpu", - dtype=torch.int, + dtype=torch.long, pin_memory=pin_memory, ) # need to transpose and make contiguous to @@ -522,7 +522,7 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], sampling_seeds_t = torch.tensor( sampling_seeds, device="cpu", - dtype=torch.int, + dtype=torch.long, pin_memory=pin_memory, ).T.contiguous() @@ -571,7 +571,7 @@ def _get_sequence_seeds( else: generator = random.Random(str((seed, ) + extra_entropy)) randint_fn = generator.randint - lo, hi = torch.iinfo(torch.int).min, torch.iinfo(torch.int).max + lo, hi = torch.iinfo(torch.long).min, torch.iinfo(torch.long).max # If the user/random sets seed = 0 but request should # have sampling, we need to change it to something # else. We use a constant in that case. From 78b0513b3e4ac7be9082dbda4cfef5bf3cd05e97 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 22 May 2024 10:44:05 +0200 Subject: [PATCH 014/341] Llama whitespace fix (#36) --- vllm/model_executor/models/llama.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 4f766c0d6b366..f6d7fc8733fce 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -236,6 +236,7 @@ def forward( kv_cache=kv_cache, attn_metadata=attn_metadata, ) + # Fully Connected hidden_states, residual = self.post_attention_layernorm( hidden_states, residual) From 09c1eb246d7c97b3d082f83b98d173a481573c6e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 22 May 2024 10:44:13 +0200 Subject: [PATCH 015/341] Restore pyproject.toml (#37) --- pyproject.toml | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000..6a448defc16e1 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,67 @@ +[build-system] +# Should be mirrored in requirements-build.txt +requires = [ + "cmake>=3.21", + "ninja", + "packaging", + "setuptools >= 49.4.0", + "torch == 2.3.0", + "wheel", +] +build-backend = "setuptools.build_meta" + +[tool.ruff] +# Allow lines to be as long as 80. +line-length = 80 +exclude = [ + # External file, leaving license intact + "examples/fp8/quantizer/quantize.py" +] + +[tool.ruff.lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # pyupgrade + # "UP", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + # "I", + "G", +] +ignore = [ + # star imports + "F405", "F403", + # lambda expression assignment + "E731", + # Loop control variable not used within loop body + "B007", +] + +[tool.mypy] +python_version = "3.8" + +ignore_missing_imports = true +check_untyped_defs = true +follow_imports = "skip" + +files = "vllm" +# TODO(woosuk): Include the code from Megatron and HuggingFace. +exclude = [ + "vllm/model_executor/parallel_utils/|vllm/model_executor/models/", + # Ignore triton kernels in ops. 
+ 'vllm/attention/ops/.*\.py$' +] + +[tool.codespell] +ignore-words-list = "dout, te, indicies" +skip = "./tests/prompts,./benchmarks/sonnet.txt" + +[tool.isort] +use_parentheses = true +skip_gitignore = true From 7f7500b9de21a455691b90f0a951ac272fa6cbd6 Mon Sep 17 00:00:00 2001 From: Damian Szwichtenberg Date: Wed, 22 May 2024 13:40:22 +0200 Subject: [PATCH 016/341] Add high-level profiler (#29) --- vllm/worker/habana_model_runner.py | 56 ++++++++++--- vllm/worker/profiler.py | 121 +++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 11 deletions(-) create mode 100644 vllm/worker/profiler.py diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 995864e3f81e7..1451b6fe38aef 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -29,6 +29,8 @@ from vllm.utils import (HabanaMemoryProfiler, is_pin_memory_available, make_tensor_with_pad, format_bytes) +from .profiler import Profiler + logger = init_logger(__name__) _PAD_SLOT_ID = 0 @@ -156,6 +158,7 @@ def __init__( self.lora_config = lora_config self.load_config = load_config self.is_driver_worker = is_driver_worker + self.profiler = Profiler() self.sliding_window = (model_config.get_sliding_window() if model_config is not None else None) @@ -696,16 +699,22 @@ def execute_model( seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: + event_start = self.profiler.get_timestamp_us() + is_prompt = seq_group_metadata_list[0].is_prompt + base_event_name = 'prompt' if is_prompt else 'decode' + self.profiler.start('internal', base_event_name) + if self.is_driver_worker: - is_prompt = seq_group_metadata_list[0].is_prompt real_batch_size = len(seq_group_metadata_list) bucket_cfg = self.prompt_bs_bucket_cfg if is_prompt else self.decode_bs_bucket_cfg - batch_size_padding = find_bucket(real_batch_size, bucket_cfg) - real_batch_size + batch_size_padded = find_bucket(real_batch_size, bucket_cfg) + batch_size_padding = batch_size_padded - real_batch_size seq_group_metadata_list = seq_group_metadata_list.copy() seq_group_metadata_list.extend(seq_group_metadata_list[0] for _ in range(batch_size_padding)) - (input_tokens, input_positions, attn_metadata, sampling_metadata, - lora_requests, lora_mapping, multi_modal_input - ) = self.prepare_input_tensors(seq_group_metadata_list) + with self.profiler.record_event('internal', 'prepare_input_tensors'): + (input_tokens, input_positions, attn_metadata, sampling_metadata, + lora_requests, lora_mapping, multi_modal_input + ) = self.prepare_input_tensors(seq_group_metadata_list) if self.lora_config: self.set_active_loras(lora_requests, lora_mapping) @@ -720,11 +729,13 @@ def execute_model( execute_model_kwargs.update({"image_input": multi_modal_input}) htorch.core.mark_step() - hidden_states = self.model(**execute_model_kwargs) + with self.profiler.record_event('internal', f'model_{base_event_name}_eager_bs{real_batch_size}'): + hidden_states = self.model(**execute_model_kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) # Compute the logits. - logits = self.model.compute_logits(hidden_states, sampling_metadata) + with self.profiler.record_event('internal', 'compute_logits'): + logits = self.model.compute_logits(hidden_states, sampling_metadata) htorch.core.mark_step() # Only perform sampling in the driver worker. @@ -732,12 +743,30 @@ def execute_model( return None # Sample the next token. 
- output = self.model.sample( - logits=logits, - sampling_metadata=sampling_metadata, - ) + with self.profiler.record_event('internal', 'sample'): + output = self.model.sample( + logits=logits, + sampling_metadata=sampling_metadata, + ) output.outputs = output.outputs[:real_batch_size] htorch.core.mark_step() + + # Stop recording 'execute_model' event + self.profiler.end() + + if self.profiler.enabled: + event_end = self.profiler.get_timestamp_us() + duration = event_end - event_start + throughput = batch_size_padded / (duration / 1e6) + throughput_effective = real_batch_size / (duration / 1e6) + counters = { + 'batch_size': batch_size_padded, + 'batch_size_effective': real_batch_size, + 'throughput': throughput, + 'throughput_effective': throughput_effective + } + self.profiler.record_counter(event_start, counters) + return output def create_dummy_seq_group_metadata(self, group_id, seq_len, is_prompt): @@ -770,12 +799,16 @@ def profile_run(self) -> None: self.warmup_scenario(self.max_num_seqs, seq_len, True, kv_caches) def warmup_scenario(self, batch_size, seq_len, is_prompt, kv_caches) -> None: + scenario_name = f"warmup_{'prompt' if is_prompt else 'decode'}_bs{batch_size}_seq{seq_len}" + self.profiler.start('internal', scenario_name) seqs = [self.create_dummy_seq_group_metadata(i, seq_len, is_prompt) for i in range(batch_size)] _ = self.execute_model(seqs, kv_caches) torch.hpu.synchronize() + self.profiler.end() @torch.inference_mode() def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: + self.profiler.start('internal', 'warmup') times = 1 # TODO: this is will be updated once HPU graphs are reintroduced scenarios = [] scenarios.extend(itertools.product(warmup_buckets(self.decode_bs_bucket_cfg), warmup_buckets(self.decode_seq_bucket_cfg), [False])) @@ -792,6 +825,7 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: end_mem = HabanaMemoryProfiler.current_memory_usage() elapsed_time = end_time - start_time logger.info(f"Warmup finished in {elapsed_time:.0f} secs, allocated {format_bytes(end_mem - start_mem)} of device memory") + self.profiler.end() @property def vocab_size(self) -> int: diff --git a/vllm/worker/profiler.py b/vllm/worker/profiler.py new file mode 100644 index 0000000000000..2d47f4349d45a --- /dev/null +++ b/vllm/worker/profiler.py @@ -0,0 +1,121 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company +############################################################################### + +import json +import os +import queue +import threading +import time +from contextlib import contextmanager + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class FileWriter(threading.Thread): + + def __init__(self, filename, event_queue): + super().__init__() + self.filename = filename + self.event_queue = event_queue + self.daemon = True + self.timer_event = threading.Event() + + def _drain_event_queue(self): + content = '' + while True: + try: + element = self.event_queue.get_nowait() + content += element + except queue.Empty: + break + return content + + def run(self): + # don't check the queue too often + while not self.timer_event.wait(1): + # Block and wait for the next item in the queue + content = self.event_queue.get() + # Collect any other items in the queue + content += self._drain_event_queue() + + with open(self.filename, 'a') as outfile: + outfile.write(content) + + +class Profiler: + profiling_trace_events = queue.Queue() + event_tid = {'counter': 1, 'external': 2, 'internal': 3} + filename = 'server_events.json' + event_cache = [] + + def __init__(self): + self.enabled = os.getenv('VLLM_PROFILER_ENABLED', + 'false').lower() == 'true' and int( + os.getenv('RANK', '0')) == 0 + if self.enabled: + # initialize the trace file (JSON Array Format) + with open(self.filename, 'w') as outfile: + outfile.write('[') + file_writer = FileWriter(self.filename, + self.profiling_trace_events) + file_writer.start() + + def _dump_with_sep(self, entry): + entry = json.dumps(entry) + ',' + self.profiling_trace_events.put(entry) + + def get_timestamp_us(self): + return time.time() * 1000000.0 + + def record_counter(self, ts, counter): + if self.enabled: + self._dump_with_sep({ + 'pid': 1, + 'tid': self.event_tid['counter'], + 'ph': 'C', + 'name': 'utils', + 'ts': ts, + 'args': counter + }) + + def start(self, type, name, args=None): + if self.enabled: + ts = self.get_timestamp_us() + if args is not None and 'counter' in args: + self.record_counter(ts, args['counter']) + del args['counter'] + event = { + 'pid': 1, + 'tid': self.event_tid[type], + 'ph': 'X', + 'name': name, + 'ts': ts, + 'dur': None, + 'args': args + } + self.event_cache.append(event) + + def end(self): + if self.enabled: + ts = self.get_timestamp_us() + if not self.event_cache: + logger.warning( + 'Profiler: end() call does not have matching start() call. Disabling profiler.' 
+ ) + self.enabled = False + return + event = self.event_cache.pop() + event['dur'] = ts - event['ts'] + self._dump_with_sep(event) + + @contextmanager + def record_event(self, type, name, args=None): + if self.enabled: + self.start(type, name, args) + yield + self.end() + else: + yield From b6f5584f9da7b1bd61772ed6a41a64baed00079c Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 23 May 2024 13:40:16 +0200 Subject: [PATCH 017/341] Add release docs for Gaudi (#32) * add gaudi installation readme * readme writeup * Create README_GAUDI.md * Update README.md * Update README_GAUDI.md * Update README.md * Update readmes --- README.md | 3 +- README_GAUDI.md | 136 +++++++++++++++++ .../getting_started/gaudi-installation.rst | 144 ++++++++++++++++++ docs/source/index.rst | 2 + 4 files changed, 284 insertions(+), 1 deletion(-) create mode 100644 README_GAUDI.md create mode 100644 docs/source/getting_started/gaudi-installation.rst diff --git a/README.md b/README.md index 524d027137aba..9b180877a5a82 100644 --- a/README.md +++ b/README.md @@ -10,11 +10,12 @@ Easy, fast, and cheap LLM serving for everyone

-| Documentation | Blog | Paper | Discord |
+| Intel® Gaudi® README | Documentation | Blog | Paper | Discord |

*Latest News* 🔥 +- [2024/05] vLLM-fork specific: Added Intel® Gaudi® 2 support with SynapseAI 1.16.0. For more information, please refer to Intel® Gaudi® README. - [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing). - [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing). - [2024/01] Added ROCm 6.0 support to vLLM. diff --git a/README_GAUDI.md b/README_GAUDI.md new file mode 100644 index 0000000000000..44e75e690950f --- /dev/null +++ b/README_GAUDI.md @@ -0,0 +1,136 @@ +# vLLM with Intel® Gaudi® 2 AI Accelerators + +This README provides instructions on running vLLM with Intel Gaudi devices. + +Requirements and Installation +============================== + +Please follow the instructions provided in the [Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) +to set up the environment. To achieve the best performance, please follow the methods outlined in the +[Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). + +> [!NOTE] +> In this release (1.16.0), we are only targeting functionality and +> accuracy. Performance will be improved in next releases. + +Requirements +------------- + +- OS: Ubuntu 22.04 LTS +- Python: 3.10 +- Intel Gaudi 2 accelerator +- Intel Gaudi software version 1.16.0 + +To verify that the Intel Gaudi software was correctly installed, run: + +``` {.console} +$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible +$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core and habanalabs-thunk are installed +$ pip list | habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml, habana-media-loader and habana_quantization_toolkit are installed +``` + +Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) for more details. + +Run Docker Image +------------------ + +It is highly recommended to use the latest Docker image from Intel +Gaudi vault. Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) for more details. + +Use the following commands to run a Docker image: + +``` {.console} +$ docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest +$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest + ``` + +Build and Install vLLM-fork +----------------------------- + +To build and install vLLM-fork from source, run: + +``` {.console} +$ git clone https://github.com/HabanaAI/vllm-fork.git +$ cd vllm-fork +# git checkout 0.4.2-Gaudi-1.16.0 +$ pip install -e . # This may take 5-10 minutes. 
+``` + +Supported Features +================== + +- [Offline batched inference](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#offline-batched-inference) +- Online inference via [OpenAI-Compatible Server](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server) +- HPU autodetection - no need to manually select device within vLLM +- Paged KV cache with algorithms enabled for Intel Gaudi 2 + accelerators +- Custom Intel Gaudi implementations of Paged Attention, KV cache ops, + prefill attention, Root Mean Square Layer Normalization, Rotary + Positional Encoding +- Tensor parallelism support for multi-card inference +- Inference with [HPU + Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) + for accelerating low-batch latency and throughput + + +Unsupported Features +==================== + +- Beam search +- LoRA adapters +- Attention with Linear Biases (ALiBi) +- Quantization (AWQ, FP8 E5M2, FP8 E4M3) +- Prefill chunking (mixed-batch inferencing) + + +Supported Configurations +======================== + +The following configurations have been validated to be function with Gaudi devices. Configurations that are not listed may or may not work. + +- [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b) + with tensor parallelism on 8x HPU, BF16 datatype with random + or greedy sampling +- [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) + with tensor parallelism 8x HPU, BF16 datatype with random + or greedy sampling + + + +Performance Tips +================ + +- We recommend running inference on Gaudi 2 with + `block_size` of 128 for BF16 data type. Using default + values (16, 32) might lead to sub-optimal performance due to Matrix + Multiplication Engine under-utilization (see [Gaudi + Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). +- For max throughput on Llama 7B, we recommend running with batch size + of 128 or 256 and max context length of 2048 with HPU Graphs enabled. + If you encounter out-of-memory issues, see troubleshooting section. + +Troubleshooting: Tweaking HPU Graphs +==================================== + +If you experience device out-of-memory issues or want to attempt inference at higher batch sizes, try tweaking HPU Graphs by following the below: + +- Tweak `gpu_memory_utilization` knob. It + will decrease the allocation of KV cache, leaving some headroom for + capturing graphs with larger batch size. By default `gpu_memory_utilization` is set to 0.9. + It attempts to allocate \~90% of HBM left for KV cache after short + profiling run. Note that decreasing reduces the number of KV + cache blocks you have available, and therefore reduces the effective + maximum number of tokens you can handle at a given time. + +- If this methon is not efficient, you can disable `HPUGraph` completely. With + HPU Graphs disabled, you are trading latency and throughput at lower + batches for potentially higher throughput on higher batches. 
You can do + that by adding `--enforce-eager` flag to server (for + online inference), or by passing `enforce_eager=True` + argument to LLM constructor (for offline inference). diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst new file mode 100644 index 0000000000000..cd026df8bf057 --- /dev/null +++ b/docs/source/getting_started/gaudi-installation.rst @@ -0,0 +1,144 @@ +vLLM with Intel® Gaudi® 2 AI Accelerators +========================================= + +This README provides instructions on running vLLM with Intel Gaudi +devices. + +Requirements and Installation +============================= + +Please follow the instructions provided in the `Gaudi Installation +Guide `__ +to set up the environment. To achieve the best performance, please +follow the methods outlined in the `Optimizing Training Platform +Guide `__. + +.. note:: + In this release (1.16.0), we are only targeting functionality + and accuracy. Performance will be improved in next releases. + +Requirements +------------ + +- OS: Ubuntu 22.04 LTS +- Python: 3.10 +- Intel Gaudi 2 accelerator +- Intel Gaudi software version 1.16.0 + +To verify that the Intel Gaudi software was correctly installed, run: + +.. code:: console + + $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible + $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core and habanalabs-thunk are installed + $ pip list | habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml, habana-media-loader and habana_quantization_toolkit are installed + +Refer to `Intel Gaudi Software Stack +Verification `__ +for more details. + +Run Docker Image +---------------- + +It is highly recommended to use the latest Docker image from Intel Gaudi +vault. Refer to the `Intel Gaudi +documentation `__ +for more details. + +Use the following commands to run a Docker image: + +.. code:: console + + $ docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest + $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest + +Build and Install vLLM-fork +--------------------------- + +To build and install vLLM-fork from source, run: + +.. code:: console + + $ git clone https://github.com/HabanaAI/vllm-fork.git + $ cd vllm-fork + # git checkout 0.4.2-Gaudi-1.16.0 + $ pip install -e . # This may take 5-10 minutes. 
+ +Supported Features +================== + +- `Offline batched + inference `__ +- Online inference via `OpenAI-Compatible + Server `__ +- HPU autodetection - no need to manually select device within vLLM +- Paged KV cache with algorithms enabled for Intel Gaudi 2 accelerators +- Custom Intel Gaudi implementations of Paged Attention, KV cache ops, + prefill attention, Root Mean Square Layer Normalization, Rotary + Positional Encoding +- Tensor parallelism support for multi-card inference +- Inference with `HPU + Graphs `__ + for accelerating low-batch latency and throughput + +Unsupported Features +==================== + +- Beam search +- LoRA adapters +- Attention with Linear Biases (ALiBi) +- Quantization (AWQ, FP8 E5M2, FP8 E4M3) +- Prefill chunking (mixed-batch inferencing) + +Supported Configurations +======================== + +The following configurations have been validated to be function with +Gaudi devices. Configurations that are not listed may or may not work. + +- `meta-llama/Llama-2-7b `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Llama-2-7b-chat-hf `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Llama-2-70b `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or + greedy sampling +- `meta-llama/Llama-2-70b-chat-hf `__ + with tensor parallelism 8x HPU, BF16 datatype with random or greedy + sampling + +Performance Tips +================ + +- We recommend running inference on Gaudi 2 with ``block_size`` of 128 + for BF16 data type. Using default values (16, 32) might lead to + sub-optimal performance due to Matrix Multiplication Engine + under-utilization (see `Gaudi + Architecture `__). +- For max throughput on Llama 7B, we recommend running with batch size + of 128 or 256 and max context length of 2048 with HPU Graphs enabled. + If you encounter out-of-memory issues, see troubleshooting section. + +Troubleshooting: Tweaking HPU Graphs +==================================== + +If you experience device out-of-memory issues or want to attempt +inference at higher batch sizes, try tweaking HPU Graphs by following +the below: + +- Tweak ``gpu_memory_utilization`` knob. It will decrease the + allocation of KV cache, leaving some headroom for capturing graphs + with larger batch size. By default ``gpu_memory_utilization`` is set + to 0.9. It attempts to allocate ~90% of HBM left for KV cache after + short profiling run. Note that decreasing reduces the number of KV + cache blocks you have available, and therefore reduces the effective + maximum number of tokens you can handle at a given time. + +- If this methon is not efficient, you can disable ``HPUGraph`` + completely. With HPU Graphs disabled, you are trading latency and + throughput at lower batches for potentially higher throughput on + higher batches. You can do that by adding ``--enforce-eager`` flag to + server (for online inference), or by passing ``enforce_eager=True`` + argument to LLM constructor (for offline inference). 
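For offline inference, the tips above map onto a handful of `LLM` constructor arguments. A minimal sketch (the model name and values are illustrative, not taken from the patch):

```python
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-2-7b-chat-hf",  # any validated model listed above
    block_size=128,               # recommended block size for BF16 on Gaudi 2
    gpu_memory_utilization=0.9,   # lower this to leave headroom for HPU Graphs
    enforce_eager=False,          # set True to disable HPU Graphs entirely
    tensor_parallel_size=1,
)
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.8, max_tokens=32))
print(outputs[0].outputs[0].text)
```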
diff --git a/docs/source/index.rst b/docs/source/index.rst index 4022c590843e6..aeb3b60ccb1ad 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -42,6 +42,7 @@ vLLM is flexible and easy to use with: * Streaming outputs * OpenAI-compatible API server * Support NVIDIA GPUs and AMD GPUs +* (Experimental) Support for Intel® Gaudi® 2 accelerators * (Experimental) Prefix caching support * (Experimental) Multi-lora support @@ -64,6 +65,7 @@ Documentation getting_started/amd-installation getting_started/neuron-installation getting_started/cpu-installation + getting_started/gaudi-installation getting_started/quickstart getting_started/examples/examples_index From 6f5629fda71eb40bf761f204a5ca2837e341cd0b Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 23 May 2024 14:00:28 +0200 Subject: [PATCH 018/341] Update tag in readme (#39) --- README_GAUDI.md | 2 +- docs/source/getting_started/gaudi-installation.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index 44e75e690950f..24d3fe0761f54 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -52,7 +52,7 @@ To build and install vLLM-fork from source, run: ``` {.console} $ git clone https://github.com/HabanaAI/vllm-fork.git $ cd vllm-fork -# git checkout 0.4.2-Gaudi-1.16.0 +# git checkout v0.4.2-Gaudi-1.16.0 $ pip install -e . # This may take 5-10 minutes. ``` diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index cd026df8bf057..90f97155e1d75 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -61,7 +61,7 @@ To build and install vLLM-fork from source, run: $ git clone https://github.com/HabanaAI/vllm-fork.git $ cd vllm-fork - # git checkout 0.4.2-Gaudi-1.16.0 + # git checkout v0.4.2-Gaudi-1.16.0 $ pip install -e . # This may take 5-10 minutes. 
Supported Features From 3c827b311ee7d0cf3ceae49a42dbb8d830e153ce Mon Sep 17 00:00:00 2001 From: Damian Szwichtenberg Date: Thu, 23 May 2024 15:20:53 +0200 Subject: [PATCH 019/341] Fix error with high-level profiler in multi-card scenario (#38) --- vllm/worker/habana_model_runner.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 1451b6fe38aef..2bec899831c49 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -699,12 +699,13 @@ def execute_model( seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: - event_start = self.profiler.get_timestamp_us() - is_prompt = seq_group_metadata_list[0].is_prompt - base_event_name = 'prompt' if is_prompt else 'decode' - self.profiler.start('internal', base_event_name) - if self.is_driver_worker: + # profiler is enabled only for rank == 0 (profiler.py:L57) + event_start = self.profiler.get_timestamp_us() + is_prompt = seq_group_metadata_list[0].is_prompt + base_event_name = 'prompt' if is_prompt else 'decode' + self.profiler.start('internal', base_event_name) + real_batch_size = len(seq_group_metadata_list) bucket_cfg = self.prompt_bs_bucket_cfg if is_prompt else self.decode_bs_bucket_cfg batch_size_padded = find_bucket(real_batch_size, bucket_cfg) @@ -729,7 +730,11 @@ def execute_model( execute_model_kwargs.update({"image_input": multi_modal_input}) htorch.core.mark_step() - with self.profiler.record_event('internal', f'model_{base_event_name}_eager_bs{real_batch_size}'): + if self.is_driver_worker: + model_event_name = f'model_{base_event_name}_eager_bs{real_batch_size}' + else: + model_event_name = 'model_executable' + with self.profiler.record_event('internal', model_event_name): hidden_states = self.model(**execute_model_kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) @@ -751,10 +756,9 @@ def execute_model( output.outputs = output.outputs[:real_batch_size] htorch.core.mark_step() - # Stop recording 'execute_model' event - self.profiler.end() - - if self.profiler.enabled: + if self.is_driver_worker: + # Stop recording 'execute_model' event + self.profiler.end() event_end = self.profiler.get_timestamp_us() duration = event_end - event_start throughput = batch_size_padded / (duration / 1e6) From af0f1a691ef5c20f34d40359bf462924522da6d0 Mon Sep 17 00:00:00 2001 From: jkaniecki <153085639+jkaniecki@users.noreply.github.com> Date: Tue, 28 May 2024 13:00:38 +0200 Subject: [PATCH 020/341] Static fused moe op (#41) * Fix mixtral hidden states layout to fit into habana model runner * Add static moe op to mixtral * Add mark_step to static_fused_moe * Update __init__.py * Fix code indentation * Make code compatible with non HPU devices * Move static_fused_moe to vllm.hpu.ops * Update mixtral.py * Move op import from forward to top of the file * Remove circular import --- vllm/hpu/ops.py | 36 +++++++++++++++++++++ vllm/model_executor/models/mixtral.py | 45 ++++++++++++++++++--------- 2 files changed, 66 insertions(+), 15 deletions(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index d95b301697cea..25bccb43297d5 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -113,3 +113,39 @@ def apply_rope( def awq_gemm(*args): raise NotImplementedError + + +def silu_and_mul_wrapper(x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, 
dtype=x.dtype, device=x.device) + silu_and_mul(out, x) + return out + + +@hpu_utils.with_mark_steps +def static_fused_moe(hidden_states, w1, w2, score, topk): + B, D = hidden_states.shape + num_experts = w1.shape[0] + routing_weights = F.softmax(score, dim=1, dtype=torch.float32) + routing_weights, selected_experts = torch.topk(routing_weights, topk, dim=-1) + routing_weights = routing_weights.to(hidden_states.dtype) + final_hidden_states = torch.zeros( + (1, B, D), dtype=hidden_states.dtype, device=hidden_states.device + ) + padded_weights = torch.zeros( + (B, num_experts), dtype=hidden_states.dtype, device=hidden_states.device + ) + padded_weights.scatter_(-1, selected_experts, routing_weights) + padded_weights = padded_weights.reshape(-1, B, w1.shape[0]) + padded_weights = padded_weights.permute(2, 0, 1).unsqueeze(-1) + + for expert_idx in range(num_experts): + padded_weight = padded_weights[expert_idx] + current_state_static = hidden_states.reshape(-1, D) + w_output = silu_and_mul_wrapper(torch.matmul(current_state_static, w1[expert_idx].transpose(0, 1))) + w_output = torch.matmul(w_output, w2[expert_idx].transpose(0, 1)) + current_hidden_states_static = w_output * padded_weight + final_hidden_states += current_hidden_states_static + + return final_hidden_states.view(-1, D) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index efa4de7516212..4b602203cee79 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -50,7 +50,10 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.sequence import SamplerOutput -from vllm.utils import print_warning_once +from vllm.utils import print_warning_once, is_hpu + +if is_hpu(): + from vllm.hpu.ops import static_fused_moe class MixtralMoE(nn.Module): @@ -220,28 +223,40 @@ def process_weights_after_loading(self): requires_grad=False) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - num_tokens, hidden_size = hidden_states.shape + if is_hpu(): + batch_size, sequence_length, hidden_size = hidden_states.shape + else: + num_tokens, hidden_size = hidden_states.shape hidden_states = hidden_states.view(-1, self.hidden_size) # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - final_hidden_states = fused_moe(hidden_states, - self.w13_weight, - self.w2_weight, - router_logits, - self.top_k, - renormalize=True, - inplace=True, - use_fp8=self.use_fp8, - w1_scale=self.w13_scale, - w2_scale=self.w2_scale, - a1_scale=self.a13_scale, - a2_scale=self.a2_scale) + + if is_hpu(): + final_hidden_states = static_fused_moe(hidden_states, + self.w13_weight, + self.w2_weight, + router_logits, + self.top_k) + else: + final_hidden_states = fused_moe(hidden_states, + self.w13_weight, + self.w2_weight, + router_logits, + self.top_k, + renormalize=True, + inplace=True, + use_fp8=self.use_fp8, + w1_scale=self.w13_scale, + w2_scale=self.w2_scale, + a1_scale=self.a13_scale, + a2_scale=self.a2_scale) if self.tp_size > 1: final_hidden_states = tensor_model_parallel_all_reduce( final_hidden_states) - return final_hidden_states.view(num_tokens, hidden_size) + return (final_hidden_states.view(batch_size, sequence_length, hidden_size) if is_hpu() + else final_hidden_states.view(num_tokens, hidden_size)) class MixtralAttention(nn.Module): From 8359489977af675b464773b6462059632e589cce Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 3 Jun 2024 11:55:45 +0200 Subject: 
[PATCH 021/341] WA: Remove pyproject.toml, bypass HPU autodetection (#45) --- pyproject.toml | 67 -------------------------------------------------- setup.py | 1 + 2 files changed, 1 insertion(+), 67 deletions(-) delete mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 6a448defc16e1..0000000000000 --- a/pyproject.toml +++ /dev/null @@ -1,67 +0,0 @@ -[build-system] -# Should be mirrored in requirements-build.txt -requires = [ - "cmake>=3.21", - "ninja", - "packaging", - "setuptools >= 49.4.0", - "torch == 2.3.0", - "wheel", -] -build-backend = "setuptools.build_meta" - -[tool.ruff] -# Allow lines to be as long as 80. -line-length = 80 -exclude = [ - # External file, leaving license intact - "examples/fp8/quantizer/quantize.py" -] - -[tool.ruff.lint] -select = [ - # pycodestyle - "E", - # Pyflakes - "F", - # pyupgrade - # "UP", - # flake8-bugbear - "B", - # flake8-simplify - "SIM", - # isort - # "I", - "G", -] -ignore = [ - # star imports - "F405", "F403", - # lambda expression assignment - "E731", - # Loop control variable not used within loop body - "B007", -] - -[tool.mypy] -python_version = "3.8" - -ignore_missing_imports = true -check_untyped_defs = true -follow_imports = "skip" - -files = "vllm" -# TODO(woosuk): Include the code from Megatron and HuggingFace. -exclude = [ - "vllm/model_executor/parallel_utils/|vllm/model_executor/models/", - # Ignore triton kernels in ops. - 'vllm/attention/ops/.*\.py$' -] - -[tool.codespell] -ignore-words-list = "dout, te, indicies" -skip = "./tests/prompts,./benchmarks/sonnet.txt" - -[tool.isort] -use_parentheses = true -skip_gitignore = true diff --git a/setup.py b/setup.py index 49e20aac0068a..964c467fd0a3f 100644 --- a/setup.py +++ b/setup.py @@ -204,6 +204,7 @@ def build_extensions(self) -> None: def _is_hpu() -> bool: is_hpu_available = True + return is_hpu_available # FIXME(kzawora): HPU autodetection sporadically fails on certain clients. Find the cause and fix it. 
try: subprocess.run(["hl-smi"], capture_output=True, check=True) except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): From 82f6280b896db33d9ea2e081f303de5b8aae644a Mon Sep 17 00:00:00 2001 From: madamczykhabana <110973826+madamczykhabana@users.noreply.github.com> Date: Mon, 3 Jun 2024 14:49:16 +0200 Subject: [PATCH 022/341] Use setuptools older than 70.0.0 (#42) * Use setuptools older than 70.0.0 * Delete pyproject.toml --------- Co-authored-by: Konrad Zawora --- requirements-build.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/requirements-build.txt b/requirements-build.txt index 1a07a94e82e04..a944fa31fd74d 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1,7 +1,7 @@ -# Should be mirrored in pyproject.toml -cmake>=3.21 -ninja -packaging -setuptools>=49.4.0 -torch==2.3.0 -wheel +# Should be mirrored in pyproject.toml +cmake>=3.21 +ninja +packaging +setuptools>=49.4.0,<70.0.0 +torch==2.3.0 +wheel From 539e394085b722f0860d3174eb0f265f63722c9a Mon Sep 17 00:00:00 2001 From: madamczykhabana <110973826+madamczykhabana@users.noreply.github.com> Date: Mon, 3 Jun 2024 14:50:08 +0200 Subject: [PATCH 023/341] Add VLLM_SKIP_WARMUP flag (#43) --- vllm/worker/habana_model_runner.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 2bec899831c49..e7d1c41214369 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -812,6 +812,9 @@ def warmup_scenario(self, batch_size, seq_len, is_prompt, kv_caches) -> None: @torch.inference_mode() def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: + if os.environ.get('VLLM_SKIP_WARMUP', 'false').lower() == 'true': + logger.info("Skipping warmup...") + return self.profiler.start('internal', 'warmup') times = 1 # TODO: this is will be updated once HPU graphs are reintroduced scenarios = [] From b3617eef86c506c66de8faeb7b0c25a8a8850abc Mon Sep 17 00:00:00 2001 From: madamczykhabana <110973826+madamczykhabana@users.noreply.github.com> Date: Tue, 4 Jun 2024 13:02:02 +0200 Subject: [PATCH 024/341] Graphs v2 (#44) * Trimmed metadata - part 1 * [WIP] HPU graphs for decode * [WIP] Graph allocation algorithm reworked * Cleanup * Add graph memory estimations * Fix multinode synchronization * Create attn_bias inside HPU graph * Cleanup after rebase * Increase default VLLM_GRAPH_RESERVED_MEM to 0.3 * Remove obsolete class * Tweak default HPU graph parameters --- vllm/attention/backends/habana_attn.py | 26 +- vllm/attention/ops/habana_paged_attn.py | 4 - vllm/hpu/ops.py | 2 +- .../model_executor/layers/logits_processor.py | 7 +- vllm/utils.py | 7 +- vllm/worker/habana_model_runner.py | 227 +++++++++++++++--- vllm/worker/habana_worker.py | 23 +- 7 files changed, 216 insertions(+), 80 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 45fe1989f9bff..017cf9c8933e5 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -84,9 +84,6 @@ class HabanaAttentionMetadata(AttentionMetadataPerStage, HabanaPagedAttentionMet # Maximum query length in the batch. max_query_len: Optional[int] - # FIXME: It is for flash attn. - # Maximum sequence length in the batch. - max_seq_len: Optional[int] # (batch_size + 1,). The cumulative subquery lengths of the sequences in # the batch, used to index into subquery. E.g., if the subquery length # is [4, 6], it is [0, 4, 10]. 
@@ -201,27 +198,7 @@ def forward( # Prompt run. if kv_cache is None or prefill_meta.block_tables.numel() == 0: # TODO: move this outside of model - if prefill_meta.attn_bias is None: - if self.alibi_slopes is None: - lens = torch.tensor(attn_metadata.prefill_metadata.seq_lens, device=query.device, dtype=torch.int32) - len_mask = (torch.arange(0, seq_len, device=query.device, dtype=torch.int32) - .view(1, seq_len) - .ge(lens.unsqueeze(-1)) - .view(batch_size, 1, 1, seq_len)) - causal_mask = torch.triu( - torch.ones((batch_size, 1, seq_len, seq_len), device=query.device, dtype=torch.bool), - diagonal=1 - ) - mask = causal_mask.logical_or(len_mask) - attn_bias = (torch.zeros_like(mask, dtype=query.dtype) - .masked_fill_(mask, -math.inf)) - if self.sliding_window is not None: - raise NotImplementedError("Sliding window is not supported on HPU") - prefill_meta.attn_bias = attn_bias - else: - prefill_meta.attn_bias = _make_alibi_bias( - self.alibi_slopes, self.num_kv_heads, batch_size, - seq_len, query.dtype) + assert prefill_meta.attn_bias is not None, 'attn_bias must be set before calling model.forward!' query_shape = (batch_size, seq_len, self.num_heads, self.head_size) kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, self.head_size) out = xops.prompt_attention( @@ -256,7 +233,6 @@ def forward( value_cache, decode_meta.block_tables, decode_meta.seq_lens_tensor, - decode_meta.max_seq_len, attn_metadata.kv_cache_dtype, self.num_kv_heads, self.scale, diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py index 8dc79f17f8c9c..bd6a58684f567 100644 --- a/vllm/attention/ops/habana_paged_attn.py +++ b/vllm/attention/ops/habana_paged_attn.py @@ -19,8 +19,6 @@ class HabanaPagedAttentionMetadata: # (batch_size,). The length of sequences (entire tokens seen so far) per # sequence. seq_lens_tensor: Optional[torch.Tensor] - # Maximum sequence length in the batch. - max_seq_len: Optional[int] # (batch_size, max_blocks_per_seq). # Block addresses per sequence. 
(Seq id -> list of physical block) # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks @@ -82,7 +80,6 @@ def forward_decode( value_cache: torch.Tensor, block_tables: torch.Tensor, seq_lens: torch.Tensor, - max_seq_len: int, kv_cache_dtype: str, num_kv_heads: int, scale: float, @@ -99,7 +96,6 @@ def forward_decode( block_tables, seq_lens, block_size, - max_seq_len, alibi_slopes, kv_cache_dtype, ) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 25bccb43297d5..10e53312378ad 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -36,7 +36,7 @@ def fetch_from_cache(cache, blocks): @hpu_utils.with_mark_steps -def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block_tables, context_lens, block_size, max_context_len, alibi_slopes, kv_cache_dtype=None) -> None: +def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block_tables, context_lens, block_size, alibi_slopes, kv_cache_dtype=None) -> None: seq_len = block_tables.size(1) batch_size, query_heads, _ = query.shape _, kv_heads, _, _ = key_cache.shape diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 5e484ff05b2f3..3951619c6e3ec 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -85,8 +85,11 @@ def _prune_hidden_states( hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> torch.Tensor: - return hidden_states.index_select(0, - sampling_metadata.selected_token_indices) + if sampling_metadata.selected_token_indices is not None: + return hidden_states.index_select(0, + sampling_metadata.selected_token_indices) + else: + return hidden_states def _apply_logits_processors( diff --git a/vllm/utils.py b/vllm/utils.py index a8cefefecb8e5..456c5602cf9d3 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -512,16 +512,17 @@ def total_memory() -> float: return total_hpu_memory def __enter__(self): + # Force garbage collection + gc.collect() self.initial_memory = HabanaMemoryProfiler.current_memory_usage() # This allows us to call methods of the context manager if needed return self def __exit__(self, exc_type, exc_val, exc_tb): - self.final_memory = HabanaMemoryProfiler.current_memory_usage() - self.consumed_memory = self.final_memory - self.initial_memory - # Force garbage collection gc.collect() + self.final_memory = HabanaMemoryProfiler.current_memory_usage() + self.consumed_memory = self.final_memory - self.initial_memory # Adapted from https://stackoverflow.com/a/49361727 diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index e7d1c41214369..9b3511a328c5e 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -6,6 +6,8 @@ from enum import IntEnum from typing import List, NamedTuple, Optional, Set, Tuple, Dict +import collections +import gc import os import math import itertools @@ -18,6 +20,7 @@ from vllm.config import (DeviceConfig, LoadConfig, CacheConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) from vllm.distributed import broadcast_tensor_dict +from vllm.distributed.parallel_state import get_cpu_world_group from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest @@ -35,19 +38,21 @@ _PAD_SLOT_ID = 0 LORA_WARMUP_RANK = 8 +_TYPE_CACHE = {} # Read bucketing configuration from env variables # phase is either 'prompt' or 'decode' # dim is either 'bs' or 'seq' -# example env 
variable: VLLM_DECODE_BS_STEP=128 +# param is either 'min', 'step' or 'max' +# example env variable: VLLM_DECODE_BS_BUCKET_STEP=128 def read_bucket_settings(phase: str, dim: str, **defaults: Dict): params = ['min', 'step', 'max'] - values = [os.environ.get(f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper(), defaults[p]) for p in params] + values = [int(os.environ.get(f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper(), defaults[p])) for p in params] return values -def warmup_buckets(config: Tuple[int, int, int]): +def warmup_range(config: Tuple[int, int, int]): bmin, bstep, bmax = config base = itertools.repeat(2) ramp_up = itertools.accumulate(base, func=operator.mul, initial=bmin) @@ -56,6 +61,11 @@ def warmup_buckets(config: Tuple[int, int, int]): return list(ramp_up) + list(stable) +def warmup_buckets(bs_bucket_config, seq_bucket_config): + buckets = itertools.product(warmup_range(bs_bucket_config), warmup_range(seq_bucket_config)) + return list(sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))) + + def next_pow2(value: int): res = 1 while value > 1: @@ -77,6 +87,78 @@ def find_bucket(value: int, config: Tuple[int, int, int]): return result +def subtuple(obj: object, typename: str, to_copy: List[str], to_override: Dict[str, object] = {}): + if obj is None: + return None + fields = set(to_copy) | set(to_override.keys()) + values = {f: to_override.get(f, getattr(obj, f)) for f in fields} + if typename not in _TYPE_CACHE: + _TYPE_CACHE[typename] = collections.namedtuple(typename, ' '.join(fields)) + return _TYPE_CACHE[typename](**values) + + +def align_workers(value, op): + group = get_cpu_world_group() + world_size = torch.distributed.get_world_size() + if world_size <= 1: + return value + value_t = torch.tensor(value, device='cpu') + torch.distributed.all_reduce(value_t, op=op, group=group) + return value_t.item() + + +class HpuModelAdapter(): + def __init__(self, model): + self.model = model + + def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): + prefill_metadata = attn_metadata.prefill_metadata + if prefill_metadata is None: + return attn_metadata + #FIXME: Restore alibi support + #if self.alibi_slopes is None: + if True: + seq_lens_t = prefill_metadata.seq_lens_tensor + len_mask = (torch.arange(0, seq_len, device=device, dtype=torch.int32) + .view(1, seq_len) + .ge(seq_lens_t.unsqueeze(-1)) + .view(batch_size, 1, 1, seq_len)) + causal_mask = torch.triu( + torch.ones((batch_size, 1, seq_len, seq_len), device=device, dtype=torch.bool), + diagonal=1 + ) + mask = causal_mask.logical_or(len_mask) + attn_bias = (torch.zeros_like(mask, dtype=dtype) + .masked_fill_(mask, -math.inf)) + #FIXME: Restore sliding window support + #if self.sliding_window is not None: + prefill_metadata = prefill_metadata._replace(attn_bias=attn_bias) + attn_metadata = attn_metadata._replace(prefill_metadata=prefill_metadata) + return attn_metadata + else: + # FIXME: This needs updating... 
+ prefill_meta.attn_bias = _make_alibi_bias( + self.alibi_slopes, self.num_kv_heads, batch_size, + seq_len, query.dtype) + + + def forward(self, *args, **kwargs): + kwargs = kwargs.copy() + selected_token_indices = kwargs.pop('selected_token_indices') + input_ids = kwargs['input_ids'] + kwargs['attn_metadata'] = self._set_attn_bias(kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), input_ids.device, torch.bfloat16) + hidden_states = self.model(*args, **kwargs) + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + hidden_states = hidden_states.index_select(0, selected_token_indices) + return hidden_states + + def compute_logits(self, *args, **kwargs): + return self.model.compute_logits(*args, **kwargs) + + def sample(self, *args, **kwargs): + return self.model.sample(*args, **kwargs) + + class PreparePromptMetadata(NamedTuple): input_tokens: List[int] input_positions: List[int] @@ -164,8 +246,9 @@ def __init__( if model_config is not None else None) self.device_config = (device_config if device_config is not None else DeviceConfig()) - self.device = self.device_config.device + self.device = self.device_config.device + self.enforce_eager = self.model_config.enforce_eager self.max_num_seqs = self.scheduler_config.max_num_seqs self.max_model_len = self.scheduler_config.max_model_len self.max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens @@ -181,7 +264,6 @@ def __init__( # Lazy initialization self.lora_manager: LRUCacheWorkerLoRAManager = None self.model: torch.nn.Module = None - self.excluded_from_warmup = [] self._setup_buckets() @@ -196,6 +278,8 @@ def load_model(self) -> None: parallel_config=self.parallel_config, scheduler_config=self.scheduler_config, ) + # FIXME: Running with disable_tensor_cache=True causes RuntimeErrors. 
This needs to be debugged + self.model = htorch.hpu.wrap_in_hpu_graph(HpuModelAdapter(self.model)) self.model_memory_usage = m.consumed_memory logger.info(f"Loading model weights took " @@ -217,18 +301,25 @@ def load_model(self) -> None: self.model.embedding_padding_modules) self.model = self.lora_manager.create_lora_manager(self.model) + def _use_graphs(self, batch_size, seq_len, is_prompt): + if self.enforce_eager: + return False + return (batch_size, seq_len, is_prompt) in self.graphed_buckets + def _setup_buckets(self) -> None: self.prompt_bs_bucket_cfg = read_bucket_settings('prompt', 'bs', min=1, step=32, max=min(self.max_num_seqs, 64)) self.decode_bs_bucket_cfg = read_bucket_settings('decode', 'bs', min=1, step=128, max=self.max_num_seqs) self.prompt_seq_bucket_cfg = read_bucket_settings('prompt', 'seq', min=self.block_size, step=self.block_size, max=1024) self.decode_seq_bucket_cfg = read_bucket_settings('decode', 'seq', min=self.block_size, step=self.block_size, max=2048) + self.graphed_buckets = set() + logger.info(f"Prompt bucket config (min, step, max_warmup) bs:{self.prompt_bs_bucket_cfg}, seq:{self.prompt_seq_bucket_cfg}") - logger.info(f"Decode bucket config (min, step, max_warmup) bs:{self.decode_bs_bucket_cfg}, seq:{self.decode_seq_bucket_cfg}") + self.prompt_buckets = warmup_buckets(self.prompt_bs_bucket_cfg, self.prompt_seq_bucket_cfg) + logger.info(f"Generated {len(self.prompt_buckets)} prompt buckets: {list(sorted(self.prompt_buckets))}") - # FIXME: exclude from warmup as it causes OOM on llama-70b - self.excluded_from_warmup = [ - (64, 1024, True) - ] + logger.info(f"Decode bucket config (min, step, max_warmup) bs:{self.decode_bs_bucket_cfg}, seq:{self.decode_seq_bucket_cfg}") + self.decode_buckets = warmup_buckets(self.decode_bs_bucket_cfg, self.decode_seq_bucket_cfg) + logger.info(f"Generated {len(self.decode_buckets)} decode buckets: {list(sorted(self.decode_buckets))}") def _prepare_prompt( self, @@ -350,7 +441,6 @@ def _prepare_prompt( slot_mapping[-1].append(slot) max_query_len = max(query_lens) - max_seq_len = max(seq_lens) assert max_query_len > 0 context_lens_tensor = torch.tensor(context_lens, @@ -413,7 +503,6 @@ def _prepare_prompt( seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, max_query_len=max_query_len, - max_seq_len=max_seq_len, subquery_start_loc=subquery_start_loc, seq_start_loc=seq_start_loc, context_lens_tensor=context_lens_tensor, @@ -486,7 +575,6 @@ def _prepare_decode( block_table = block_table[-sliding_window_blocks:] block_tables.append(block_table) - max_seq_len = max(seq_lens) input_tokens = torch.tensor(input_tokens, dtype=torch.long, device=self.device) @@ -514,7 +602,6 @@ def _prepare_decode( seq_lens=None, seq_lens_tensor=seq_lens_tensor, max_query_len=None, - max_seq_len=max_seq_len, subquery_start_loc=None, seq_start_loc=None, context_lens_tensor=None, @@ -693,6 +780,30 @@ def prepare_input_tensors( sampling_metadata, lora_requests, lora_mapping, multi_modal_input) + def _seq_len(self, attn_metadata): + if attn_metadata.prefill_metadata: + return attn_metadata.slot_mapping.size(1) + else: + return attn_metadata.decode_metadata.block_tables.size(1) * self.block_size + + def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: + prefill_metadata = subtuple(metadata.prefill_metadata, + 'TrimmedPrefillMetadata', + ['block_tables', + 'seq_lens_tensor', + 'attn_bias']) + decode_metadata = subtuple(metadata.decode_metadata, + 'TrimmedDecodeMetadata', + ['block_tables', + 'seq_lens_tensor', + ]) + return subtuple(metadata, + 
'TrimmedMetadata', + ['slot_mapping', + 'kv_cache_dtype'], + {'prefill_metadata': prefill_metadata, + 'decode_metadata': decode_metadata}) + @torch.inference_mode() def execute_model( self, @@ -700,7 +811,6 @@ def execute_model( kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: if self.is_driver_worker: - # profiler is enabled only for rank == 0 (profiler.py:L57) event_start = self.profiler.get_timestamp_us() is_prompt = seq_group_metadata_list[0].is_prompt base_event_name = 'prompt' if is_prompt else 'decode' @@ -716,15 +826,19 @@ def execute_model( (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping, multi_modal_input ) = self.prepare_input_tensors(seq_group_metadata_list) + is_prompt = attn_metadata.prefill_metadata is not None if self.lora_config: self.set_active_loras(lora_requests, lora_mapping) + batch_size = input_tokens.size(0) + seq_len = self._seq_len(attn_metadata) + use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) execute_model_kwargs = { "input_ids": input_tokens, "positions": input_positions, "kv_caches": kv_caches, - "attn_metadata": attn_metadata, + "attn_metadata": self.trim_attn_metadata(attn_metadata), } if self.vision_language_config: execute_model_kwargs.update({"image_input": multi_modal_input}) @@ -735,11 +849,11 @@ def execute_model( else: model_event_name = 'model_executable' with self.profiler.record_event('internal', model_event_name): - hidden_states = self.model(**execute_model_kwargs) - hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + hidden_states = self.model.forward(**execute_model_kwargs, selected_token_indices=sampling_metadata.selected_token_indices, bypass_hpu_graphs=not use_graphs) # Compute the logits. with self.profiler.record_event('internal', 'compute_logits'): + sampling_metadata.selected_token_indices = None logits = self.model.compute_logits(hidden_states, sampling_metadata) htorch.core.mark_step() @@ -803,31 +917,84 @@ def profile_run(self) -> None: self.warmup_scenario(self.max_num_seqs, seq_len, True, kv_caches) def warmup_scenario(self, batch_size, seq_len, is_prompt, kv_caches) -> None: - scenario_name = f"warmup_{'prompt' if is_prompt else 'decode'}_bs{batch_size}_seq{seq_len}" + use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) + scenario_name = f"warmup_{'prompt' if is_prompt else 'decode'}_bs{batch_size}_seq{seq_len}_graphs{'T' if use_graphs else 'F'}" self.profiler.start('internal', scenario_name) + times = 3 if use_graphs else 1 seqs = [self.create_dummy_seq_group_metadata(i, seq_len, is_prompt) for i in range(batch_size)] - _ = self.execute_model(seqs, kv_caches) torch.hpu.synchronize() + for _ in range(times): + self.execute_model(seqs, kv_caches) + torch.hpu.synchronize() self.profiler.end() + gc.collect() + + def log_warmup(self, phase, i, max_i, batch_size, seq_len): + free_mem = format_bytes(HabanaMemoryProfiler.current_free_memory()) + logger.info(f"[Warmup][{phase}][{i+1}/{max_i}] batch_size:{batch_size} seq_len:{seq_len} free_mem:{free_mem}") + + def warmup_all_buckets(self, buckets, is_prompt, kv_caches): + for i, (batch_size, seq_len) in enumerate(reversed(buckets)): + mem_usage = 100.0 * HabanaMemoryProfiler.current_memory_usage() / HabanaMemoryProfiler.total_memory() + self.log_warmup('Prompt' if is_prompt else 'Decode', i, len(buckets), batch_size, seq_len) + self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) + def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, available_mem): + total_batch_seq = 
0.001 + total_mem = 0 + idx = 0 + phase = f'Graph/{"Prompt" if is_prompt else "Decode"}' + num_candidates = len(buckets) + + if strategy == 'min_tokens': + ordering = lambda b: (b[0] * b[1], b[1], b[0]) + elif strategy == 'max_bs': + ordering = lambda b: (-b[0], b[1]) + else: + raise NotImplementedError(f'Unsupported graph allocation strategy: {strategy}') + buckets = list(sorted(buckets, key=ordering)) + + for idx, (batch_size, seq_len) in enumerate(buckets): + # Graph memory usage is proportional to seq dimension in a batch + batch_seq = batch_size * seq_len if is_prompt else batch_size + mem_estimate = batch_seq / total_batch_seq * total_mem + if mem_estimate >= available_mem: + continue + self.graphed_buckets.add((batch_size, seq_len, is_prompt)) + self.log_warmup(phase, idx, num_candidates, batch_size, seq_len) + with HabanaMemoryProfiler() as mem_prof: + self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) + used_mem = align_workers(mem_prof.consumed_memory, torch.distributed.ReduceOp.MAX) + available_mem -= used_mem + total_mem += used_mem + total_batch_seq += batch_seq + graphed = list(c[:2] for c in self.graphed_buckets if c[2] == is_prompt) + logger.info(f'{phase} captured:{len(graphed)} ({100 * len(graphed) / num_candidates:.1f}%) used_mem:{format_bytes(total_mem)} buckets:{sorted(list(graphed))}') + + @torch.inference_mode() def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: if os.environ.get('VLLM_SKIP_WARMUP', 'false').lower() == 'true': logger.info("Skipping warmup...") return self.profiler.start('internal', 'warmup') - times = 1 # TODO: this is will be updated once HPU graphs are reintroduced - scenarios = [] - scenarios.extend(itertools.product(warmup_buckets(self.decode_bs_bucket_cfg), warmup_buckets(self.decode_seq_bucket_cfg), [False])) - scenarios.extend(itertools.product(warmup_buckets(self.prompt_bs_bucket_cfg), warmup_buckets(self.prompt_seq_bucket_cfg), [True])) - scenarios = [scenario for scenario in reversed(scenarios) for _ in range(times) if scenario not in self.excluded_from_warmup] - start_mem = HabanaMemoryProfiler.current_memory_usage() start_time = time.perf_counter() - for i, (batch_size, seq_len, is_prompt) in enumerate(scenarios): - mem_usage = 100.0 * HabanaMemoryProfiler.current_memory_usage() / HabanaMemoryProfiler.total_memory() - logger.info(f"[Warmup][{i+1}/{len(scenarios)}] batch_size:{batch_size} seq_len:{seq_len} is_prompt:{is_prompt} mem_usage:{mem_usage:0.1f}%") - self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) + self.warmup_all_buckets(self.prompt_buckets, True, kv_caches) + self.warmup_all_buckets(self.decode_buckets, False, kv_caches) + + if not self.enforce_eager: + mem_margin = 1.0 - float(os.environ.get('VLLM_GRAPH_MEM_MARGIN', '0.02')) + free_mem = mem_margin * HabanaMemoryProfiler.current_free_memory() + free_mem = align_workers(free_mem, torch.distributed.ReduceOp.MIN) + prompt_graph_mem_ratio = float(os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.5')) + prompt_available_memory = prompt_graph_mem_ratio * free_mem + decode_available_memory = free_mem - prompt_available_memory + prompt_strategy = 'min_tokens' + decode_strategy = os.environ.get('VLLM_GRAPH_DECODE_STRATEGY', 'max_bs') + self.warmup_graphs(prompt_strategy, self.prompt_buckets, True, kv_caches, prompt_available_memory) + self.warmup_graphs(decode_strategy, self.decode_buckets, False, kv_caches, decode_available_memory) + end_time = time.perf_counter() end_mem = HabanaMemoryProfiler.current_memory_usage() elapsed_time = end_time - 
start_time diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index eeba9e5c4adba..e253e4479a855 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -90,7 +90,6 @@ def init_device(self) -> None: if self.device_config.device.type == "hpu": self.device = torch.device("hpu") torch.hpu.set_device(self.device) - self.init_hpu_memory = torch.hpu.mem_get_info()[0] else: raise RuntimeError( f"Not support device type: {self.device_config.device}") @@ -123,22 +122,15 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Execute a forward pass with dummy inputs to profile the memory usage # of the model. self.model_runner.profile_run() - - # Calculate the number of blocks that can be allocated with the - # profiled peak memory. torch.hpu.synchronize() - free_hpu_memory, total_hpu_memory = torch.hpu.mem_get_info() - # NOTE(woosuk): Here we assume that the other processes using the same - # HPU did not change their memory usage during the profiling. - peak_memory = self.init_hpu_memory - free_hpu_memory - assert peak_memory > 0, ( - "Error in memory profiling. This happens when the HPU memory was " - "not properly cleaned up before initializing the vLLM instance.") + + # At this point we should've allocated the maximum workspace for all recipes + # we will use the extra memory for graphs/blocks + free_hpu_memory = torch.hpu.mem_get_info()[0] cache_block_size = self.get_cache_block_size_bytes() - num_hpu_blocks = int( - (total_hpu_memory * self.cache_config.gpu_memory_utilization - - peak_memory) // cache_block_size) + graph_headroom = 1 - (float(os.environ.get('VLLM_GRAPH_RESERVED_MEM', '0.4')) if not self.model_config.enforce_eager else 0) + num_hpu_blocks = int(free_hpu_memory * graph_headroom * self.cache_config.gpu_memory_utilization // cache_block_size) num_cpu_blocks = int(self.cache_config.swap_space_bytes // cache_block_size) num_hpu_blocks = max(num_hpu_blocks, 0) @@ -298,7 +290,8 @@ def init_worker_distributed_environment( assert dummy_tensor_hpu.item() == parallel_config.world_size ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) - + + def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len) -> None: if num_gpu_blocks <= 0: From 1c5d12e53d19155f5f8f1633c9a758591958477e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 5 Jun 2024 09:57:46 +0200 Subject: [PATCH 025/341] Remove usage of wrap_in_hpu_graph in PT eager (#47) --- vllm/worker/habana_model_runner.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 9b3511a328c5e..5b9dff97d75e9 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -145,6 +145,8 @@ def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): def forward(self, *args, **kwargs): kwargs = kwargs.copy() selected_token_indices = kwargs.pop('selected_token_indices') + if 'bypass_hpu_graphs' in kwargs: + kwargs.pop('bypass_hpu_graphs') # required for PT eager input_ids = kwargs['input_ids'] kwargs['attn_metadata'] = self._set_attn_bias(kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), input_ids.device, torch.bfloat16) hidden_states = self.model(*args, **kwargs) @@ -279,7 +281,7 @@ def load_model(self) -> None: scheduler_config=self.scheduler_config, ) # FIXME: Running with disable_tensor_cache=True causes RuntimeErrors. 
This needs to be debugged - self.model = htorch.hpu.wrap_in_hpu_graph(HpuModelAdapter(self.model)) + self.model = _maybe_wrap_in_hpu_graph(self.model) self.model_memory_usage = m.consumed_memory logger.info(f"Loading model weights took " @@ -1004,3 +1006,6 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: @property def vocab_size(self) -> int: return self.model_config.get_vocab_size() + +def _maybe_wrap_in_hpu_graph(model): + return htorch.hpu.wrap_in_hpu_graph(HpuModelAdapter(model)) if htorch.utils.internal.is_lazy() else HpuModelAdapter(model) From 9bb5d20ca215b90eb80d7813ffe68b0cf4e41cfb Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 5 Jun 2024 12:01:09 +0200 Subject: [PATCH 026/341] Add HPU support to benchmark_latency and benchmark_throughput (#49) --- benchmarks/benchmark_latency.py | 4 ++-- benchmarks/benchmark_throughput.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 44da3bad8d840..e8530c2761acf 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -170,8 +170,8 @@ def run_to_completion(profile_dir: Optional[str] = None): "--device", type=str, default="cuda", - choices=["cuda", "cpu"], - help='device type for vLLM execution, supporting CUDA and CPU.') + choices=["cuda", "cpu", "hpu"], + help='device type for vLLM execution, supporting CUDA, CPU and HPU.') parser.add_argument('--block-size', type=int, default=16, diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 695d06e7b243d..2e8cfd3f2ca3e 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -334,8 +334,8 @@ def main(args: argparse.Namespace): "--device", type=str, default="cuda", - choices=["cuda", "cpu"], - help='device type for vLLM execution, supporting CUDA and CPU.') + choices=["cuda", "cpu", "hpu"], + help='device type for vLLM execution, supporting CUDA, CPU and HPU.') parser.add_argument( "--enable-prefix-caching", action='store_true', From ab359aca159ecf01a9dbcd85074eb4613e218f57 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 5 Jun 2024 14:04:24 +0200 Subject: [PATCH 027/341] Use int32 seeds for random sampler on HPU (#50) --- vllm/model_executor/sampling_metadata.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 9969c45963e9a..4b722aba567a4 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -8,7 +8,7 @@ from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import SequenceData, SequenceGroupMetadata from vllm.utils import (async_tensor_h2d, is_pin_memory_available, - maybe_expand_dim) + maybe_expand_dim, is_hpu) _SAMPLING_EPS = 1e-5 _SEED_0_REPLACEMENT = 3403598558 @@ -498,22 +498,23 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], dtype=torch.int, pin_memory=pin_memory, ) + idx_dtype = torch.long if not is_hpu() else torch.int # Gaudi doesn't have full native int64 support sample_indices_t = torch.tensor( sample_indices, device="cpu", - dtype=torch.long, + dtype=idx_dtype, pin_memory=pin_memory, ) prompt_tensor = torch.tensor( prompt_padded_tokens, device="cpu", - dtype=torch.long, + dtype=idx_dtype, pin_memory=pin_memory, ) output_tensor = torch.tensor( output_padded_tokens, device="cpu", - dtype=torch.long, + dtype=idx_dtype, pin_memory=pin_memory, ) # 
need to transpose and make contiguous to @@ -522,7 +523,7 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], sampling_seeds_t = torch.tensor( sampling_seeds, device="cpu", - dtype=torch.long, + dtype=idx_dtype, pin_memory=pin_memory, ).T.contiguous() @@ -571,7 +572,8 @@ def _get_sequence_seeds( else: generator = random.Random(str((seed, ) + extra_entropy)) randint_fn = generator.randint - lo, hi = torch.iinfo(torch.long).min, torch.iinfo(torch.long).max + idx_dtype = torch.long if not is_hpu() else torch.int # Gaudi doesn't have full native int64 support + lo, hi = torch.iinfo(idx_dtype).min, torch.iinfo(idx_dtype).max # If the user/random sets seed = 0 but request should # have sampling, we need to change it to something # else. We use a constant in that case. From cf6952d3ba9bf194146d6cfa801649283f038d0c Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 5 Jun 2024 14:31:50 +0200 Subject: [PATCH 028/341] Add host memory profiling to HabanaMemoryProfiler (#51) --- vllm/executor/habana_executor.py | 3 +- vllm/utils.py | 40 +++++++++++++++++++++------ vllm/worker/habana_model_runner.py | 44 ++++++++++++++++-------------- 3 files changed, 56 insertions(+), 31 deletions(-) diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index 5c2cc7e958f96..cfad194bf9cca 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -80,8 +80,7 @@ def initialize_cache(self, num_gpu_blocks : int, num_cpu_blocks) -> None: with HabanaMemoryProfiler() as cache_init_m: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - logger.info(f"init_cache_engine took " - f"{format_bytes(cache_init_m.consumed_memory)} ({cache_init_m.consumed_memory/HabanaMemoryProfiler.total_memory():.2%} of total memory, gpu_memory_utilization: {self.cache_config.gpu_memory_utilization}, {format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") + logger.info(f"init_cache_engine took {cache_init_m.get_summary_string()}") def execute_model( self, diff --git a/vllm/utils.py b/vllm/utils.py index 456c5602cf9d3..6d6d3d4f4590d 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -496,33 +496,55 @@ class HabanaMemoryProfiler: def __init__(self, device=None): self.device = device - def current_memory_usage() -> float: - # Return the memory usage in bytes. + def current_device_memory_usage() -> float: + # Return the device memory usage in bytes. free_hpu_memory, total_hpu_memory = torch.hpu.mem_get_info() return total_hpu_memory - free_hpu_memory - def current_free_memory() -> float: - # Return the memory usage in bytes. + def current_free_device_memory() -> float: + # Return the device memory usage in bytes. free_hpu_memory, _ = torch.hpu.mem_get_info() return free_hpu_memory - def total_memory() -> float: - # Return the memory usage in bytes. + def total_device_memory() -> float: + # Return the device memory usage in bytes. _, total_hpu_memory = torch.hpu.mem_get_info() return total_hpu_memory + def current_host_memory_usage() -> float: + # Return the host memory usage in bytes. + return HabanaMemoryProfiler.total_host_memory() - HabanaMemoryProfiler.current_free_host_memory() + + def current_free_host_memory() -> float: + # Return the host memory usage in bytes. + return psutil.virtual_memory().available + + def total_host_memory() -> float: + # Return the host memory usage in bytes. 
+ return psutil.virtual_memory().total + + def get_summary_string(self): + if getattr(self, 'final_device_memory', None) is None or getattr(self, 'final_host_memory', None) is None: + raise RuntimeError("HabanaMemoryProfiler.get_summary_string() can only be called after closing context manager") + return (f"{format_bytes(self.consumed_device_memory)} of device memory ({format_bytes(self.final_device_memory)}/{format_bytes(HabanaMemoryProfiler.total_device_memory())} used) and " + f"{format_bytes(self.consumed_host_memory)} of host memory ({format_bytes(self.final_host_memory)}/{format_bytes(HabanaMemoryProfiler.total_host_memory())} used)") + def __enter__(self): # Force garbage collection gc.collect() - self.initial_memory = HabanaMemoryProfiler.current_memory_usage() + self.initial_device_memory = HabanaMemoryProfiler.current_device_memory_usage() + self.initial_host_memory = HabanaMemoryProfiler.current_host_memory_usage() # This allows us to call methods of the context manager if needed return self def __exit__(self, exc_type, exc_val, exc_tb): # Force garbage collection gc.collect() - self.final_memory = HabanaMemoryProfiler.current_memory_usage() - self.consumed_memory = self.final_memory - self.initial_memory + self.final_device_memory = HabanaMemoryProfiler.current_device_memory_usage() + self.final_host_memory = HabanaMemoryProfiler.current_host_memory_usage() + self.consumed_device_memory = self.final_device_memory - self.initial_device_memory + self.consumed_host_memory = self.final_host_memory - self.initial_host_memory + # Adapted from https://stackoverflow.com/a/49361727 diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 5b9dff97d75e9..78290fd59b10a 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -271,21 +271,25 @@ def __init__( def load_model(self) -> None: with HabanaMemoryProfiler() as m: - self.model = get_model( - model_config=self.model_config, - device_config=self.device_config, - load_config=self.load_config, - lora_config=self.lora_config, - vision_language_config=self.vision_language_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - ) - # FIXME: Running with disable_tensor_cache=True causes RuntimeErrors. This needs to be debugged - self.model = _maybe_wrap_in_hpu_graph(self.model) + with HabanaMemoryProfiler() as m_getmodel: + self.model = get_model( + model_config=self.model_config, + device_config=self.device_config, + load_config=self.load_config, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + ) + logger.info(f"Pre-loading model weights on {next(self.model.parameters()).device} took {m_getmodel.get_summary_string()}") - self.model_memory_usage = m.consumed_memory - logger.info(f"Loading model weights took " - f"{format_bytes(self.model_memory_usage)} ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") + # FIXME: Running with disable_tensor_cache=True causes RuntimeErrors. 
This needs to be debugged + with HabanaMemoryProfiler() as m_wrap: + self.model = _maybe_wrap_in_hpu_graph(self.model) + logger.info(f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}") + + self.model_memory_usage = m.consumed_device_memory + logger.info(f"Loading model weights took in total {m.get_summary_string()}") if self.lora_config: assert hasattr(self.model, "supported_lora_modules" @@ -932,12 +936,12 @@ def warmup_scenario(self, batch_size, seq_len, is_prompt, kv_caches) -> None: gc.collect() def log_warmup(self, phase, i, max_i, batch_size, seq_len): - free_mem = format_bytes(HabanaMemoryProfiler.current_free_memory()) + free_mem = format_bytes(HabanaMemoryProfiler.current_free_device_memory()) logger.info(f"[Warmup][{phase}][{i+1}/{max_i}] batch_size:{batch_size} seq_len:{seq_len} free_mem:{free_mem}") def warmup_all_buckets(self, buckets, is_prompt, kv_caches): for i, (batch_size, seq_len) in enumerate(reversed(buckets)): - mem_usage = 100.0 * HabanaMemoryProfiler.current_memory_usage() / HabanaMemoryProfiler.total_memory() + mem_usage = 100.0 * HabanaMemoryProfiler.current_device_memory_usage() / HabanaMemoryProfiler.total_device_memory() self.log_warmup('Prompt' if is_prompt else 'Decode', i, len(buckets), batch_size, seq_len) self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) @@ -966,7 +970,7 @@ def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, available_mem): self.log_warmup(phase, idx, num_candidates, batch_size, seq_len) with HabanaMemoryProfiler() as mem_prof: self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) - used_mem = align_workers(mem_prof.consumed_memory, torch.distributed.ReduceOp.MAX) + used_mem = align_workers(mem_prof.consumed_device_memory, torch.distributed.ReduceOp.MAX) available_mem -= used_mem total_mem += used_mem total_batch_seq += batch_seq @@ -980,14 +984,14 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: logger.info("Skipping warmup...") return self.profiler.start('internal', 'warmup') - start_mem = HabanaMemoryProfiler.current_memory_usage() + start_mem = HabanaMemoryProfiler.current_device_memory_usage() start_time = time.perf_counter() self.warmup_all_buckets(self.prompt_buckets, True, kv_caches) self.warmup_all_buckets(self.decode_buckets, False, kv_caches) if not self.enforce_eager: mem_margin = 1.0 - float(os.environ.get('VLLM_GRAPH_MEM_MARGIN', '0.02')) - free_mem = mem_margin * HabanaMemoryProfiler.current_free_memory() + free_mem = mem_margin * HabanaMemoryProfiler.current_free_device_memory() free_mem = align_workers(free_mem, torch.distributed.ReduceOp.MIN) prompt_graph_mem_ratio = float(os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.5')) prompt_available_memory = prompt_graph_mem_ratio * free_mem @@ -998,7 +1002,7 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: self.warmup_graphs(decode_strategy, self.decode_buckets, False, kv_caches, decode_available_memory) end_time = time.perf_counter() - end_mem = HabanaMemoryProfiler.current_memory_usage() + end_mem = HabanaMemoryProfiler.current_device_memory_usage() elapsed_time = end_time - start_time logger.info(f"Warmup finished in {elapsed_time:.0f} secs, allocated {format_bytes(end_mem - start_mem)} of device memory") self.profiler.end() From d3e64dc17c8c2881a443260d03d968cba333fee0 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 6 Jun 2024 14:45:17 +0200 Subject: [PATCH 029/341] Update requirements-hpu.txt (#52) --- requirements-hpu.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/requirements-hpu.txt b/requirements-hpu.txt index 21666eb116c22..339fe989bdb7a 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -2,7 +2,7 @@ -r requirements-common.txt # Dependencies for HPU code -ray == 2.9.3 +ray == 2.23.0 triton pandas -tabulate \ No newline at end of file +tabulate From 0b70e5075453a366f033db13ae281a624606098b Mon Sep 17 00:00:00 2001 From: Artur Fierka <160735857+afierka-intel@users.noreply.github.com> Date: Fri, 7 Jun 2024 11:19:23 +0200 Subject: [PATCH 030/341] Skip incompatible tests with HPU (#46) * Fix setup.py for HPU * Fix vllm._C import ops -> vllm.hpu import ops * more of the same thing * re-add hpex rmsnorm and rope; but rope is crashing * remove unnecessary comments * add vllm/hpu files * add hpu autodetection * Add HabanaAttention stub * revert accidental changes * revert non-habana backend attention changes * add habana attention/worker/executor, sampling fails now * Restore unnecessarily changed files * enable HabanaMemoryProfiler * Make sampler pass * restore habana fused rope * prefill is now working!!! * fix prefill padding; decode is now working!!!!! * revert accidental changes * remove unused stuff in habana_paged_attn.py * remove diagnostic stuff from llm_engine.py * use HabanaExecutorAsync in async_llm_engine.py * add habana copyright headers to habana_*.py files * fix prefill attention conformance * minor naming fixes * remove naive attention from habana_attn (it never worked anyway) * re-enable profile run * Add fake HPUGraph support * add more metrics * indentation fix * ~~recipe cache metrics don't work lalalala~~ * i'm done with metrics for now * fix corner case in which hl-smi is not available but synapse is * FIXME: temporary setup.py workaround * WIP: add tensor parallelism stubs * habana worker cleanup * tensor parallelism is now working * remove unused files * remove unused func * add hpugraphrunner * improve hpu layernorm * Port pipelined PA * Port context length bucketing * remove cudagraphrunner from hpu runner * restore HPUGraphRunner back from FakeHPUGraphRunner * handle rotary embeddings properly on gaudi3 * oopsie! captured_block_counts was incorrect! 
* captured_block_counts.append doesn't do anything * Restore habana_main KV cache memory layout * fix memory profiler * overhaul hpugraph capture * Enable attention tests * Add geneeric changes * Enable activation tests * Enable cache tests: reshape & cache * Enable layernorm tests * Decouple reshape_and_cache prompt and decode tests and change slot mapping generation in prompt tests * Decrease max seq len in attention UTs * Enable pos_encoding tests * Enable cache copy tests * Remove gpu migration from unit tests * skip incompatible on HPU tests * Fix noisy lines * Update sampling_metadata.py Outdated changes * Update test_cache.py; fix code style * fix attention test after rebase * disable rotary embedding tests for hpu * restore oryginal rotary embedding tests * disable multiple sampling test * disable all metrics tests * disable some models tests * disable some sampler tests * restore recently disabled tests --------- Co-authored-by: Konrad Zawora Co-authored-by: Tomasz Krupa Co-authored-by: Artur Fierka --- tests/async_engine/test_api_server.py | 2 + tests/async_engine/test_openapi_server_ray.py | 3 + .../test_basic_correctness.py | 2 + .../basic_correctness/test_chunked_prefill.py | 2 + tests/basic_correctness/test_preemption.py | 6 + tests/core/block/e2e/test_correctness.py | 7 + tests/core/test_chunked_prefill_scheduler.py | 7 + tests/core/test_scheduler.py | 8 + tests/distributed/test_pynccl.py | 24 ++- tests/distributed/test_pynccl_library.py | 3 + tests/engine/test_computed_prefix_blocks.py | 2 + tests/engine/test_skip_tokenizer_init.py | 2 + tests/engine/test_stop_reason.py | 2 + tests/engine/test_stop_strings.py | 3 + tests/entrypoints/test_openai_server.py | 3 + .../test_server_oot_registration.py | 4 +- tests/kernels/test_activation.py | 25 ++- tests/kernels/test_attention.py | 68 ++++-- tests/kernels/test_cache.py | 193 +++++++++++++++--- tests/kernels/test_layernorm.py | 18 +- tests/kernels/test_moe.py | 3 + tests/kernels/test_pos_encoding.py | 19 +- tests/kernels/test_prefix_prefill.py | 10 +- tests/kernels/test_rand.py | 2 + tests/kernels/test_sampler.py | 4 + tests/lora/test_baichuan.py | 3 + tests/lora/test_chatglm3.py | 4 + tests/lora/test_gemma.py | 4 + tests/lora/test_layer_variation.py | 2 + tests/lora/test_layers.py | 6 + tests/lora/test_llama.py | 4 + tests/lora/test_lora.py | 4 + tests/lora/test_lora_manager.py | 9 + tests/lora/test_punica.py | 4 + tests/lora/test_quant_model.py | 2 + tests/lora/test_worker.py | 4 + tests/metrics/test_metrics.py | 4 + tests/models/test_aqlm.py | 15 +- tests/models/test_big_models.py | 4 +- tests/models/test_fp8.py | 13 +- tests/models/test_gptq_marlin.py | 13 +- tests/models/test_llava.py | 2 + tests/models/test_marlin.py | 13 +- tests/models/test_mistral.py | 2 + tests/models/test_models.py | 3 + tests/models/test_oot_registration.py | 1 + tests/quantization/test_configs.py | 5 +- tests/quantization/test_fp8.py | 10 +- tests/samplers/test_beam_search.py | 2 + tests/samplers/test_logits_processor.py | 2 + tests/samplers/test_logprobs.py | 2 + tests/samplers/test_ranks.py | 2 + tests/samplers/test_rejection_sampler.py | 20 +- tests/samplers/test_sampler.py | 30 +-- tests/samplers/test_seeded_generate.py | 2 + tests/spec_decode/e2e/test_compatibility.py | 5 + tests/spec_decode/e2e/test_logprobs.py | 6 + .../e2e/test_multistep_correctness.py | 12 ++ .../spec_decode/e2e/test_ngram_correctness.py | 5 + tests/spec_decode/test_batch_expansion.py | 2 + tests/spec_decode/test_metrics.py | 6 + tests/spec_decode/test_multi_step_worker.py | 
6 + tests/spec_decode/test_ngram_worker.py | 5 + tests/spec_decode/test_spec_decode_worker.py | 9 +- tests/tensorizer_loader/test_tensorizer.py | 10 + tests/test_config.py | 3 + tests/test_logits_processor.py | 13 +- tests/tokenization/test_detokenize.py | 9 + tests/worker/test_model_runner.py | 6 +- tests/worker/test_swap.py | 4 +- vllm/hpu/cache_ops.py | 4 +- vllm/utils.py | 12 +- 72 files changed, 601 insertions(+), 129 deletions(-) diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 7f57d5cf9b182..8b0e79cf9a6ee 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -6,6 +6,7 @@ import pytest import requests +from vllm.utils import is_hpu def _query_server(prompt: str, max_tokens: int = 5) -> dict: @@ -44,6 +45,7 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool, uvicorn_process.terminate() +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("tokenizer_pool_size", [0, 2]) @pytest.mark.parametrize("worker_use_ray", [False, True]) @pytest.mark.parametrize("engine_use_ray", [False, True]) diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index 4b97af88012b9..2dd1d74c7eba6 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -10,6 +10,7 @@ # and debugging. import ray import requests +from vllm.utils import is_hpu MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds # any model with a chat template should work here @@ -57,6 +58,8 @@ def __del__(self): @pytest.fixture(scope="session") def server(): + if is_hpu(): + pytest.skip("Skipping test on HPU") ray.init() server_runner = ServerRunner.remote([ "--model", diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index d75279dd9cfa9..27468c6054258 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -5,6 +5,7 @@ import os import pytest +from vllm.utils import is_hpu MODELS = [ "facebook/opt-125m", @@ -13,6 +14,7 @@ VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND" +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [5]) diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index 47d582c726c66..0b66f10f29acc 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -7,6 +7,7 @@ Run `pytest tests/models/test_chunked_prefill.py`. 
""" import pytest +from vllm.utils import is_hpu MODELS = [ "facebook/opt-125m", @@ -14,6 +15,7 @@ ] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index ffb0717b3bfdb..b9d46cb8f5f52 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -10,6 +10,7 @@ from vllm import SamplingParams from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT, ENABLE_ARTIFICIAL_PREEMPT) +from vllm.utils import is_hpu MODELS = [ "facebook/opt-125m", @@ -21,6 +22,7 @@ "tests/basic_correctness/test_preemption.py`") +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [96]) @@ -67,6 +69,7 @@ def test_chunked_prefill_recompute( f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [96]) @@ -102,6 +105,7 @@ def test_preemption( f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [96]) @@ -139,6 +143,7 @@ def test_swap( f"vLLM: {vllm_output_ids}") +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [96]) @@ -183,6 +188,7 @@ def test_swap_infeasible( assert req_outputs[0].outputs[0].finish_reason == "length" +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [96]) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index c3666da7542b5..00fb9223c742e 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -3,8 +3,10 @@ import pytest from vllm import SamplingParams +from vllm.utils import is_hpu +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -77,6 +79,7 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator, assert baseline_token_ids == test_token_ids +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -138,6 +141,7 @@ def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator, assert baseline_token_ids == test_token_ids +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -230,6 +234,7 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, assert baseline_token_ids == test_token_ids +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [ @@ -300,6 +305,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator, assert baseline_token_ids == test_token_ids +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( 
"common_llm_kwargs", [{ @@ -375,6 +381,7 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption( assert baseline_token_ids == test_token_ids +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index 92498c0014666..ca43a7701ac39 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -7,6 +7,7 @@ from vllm.core.interfaces import AllocStatus from vllm.core.scheduler import Scheduler from vllm.sequence import Logprob, SequenceGroup +from vllm.utils import is_hpu from .utils import create_dummy_prompt @@ -27,6 +28,7 @@ def schedule_and_update_computed_tokens(scheduler): return metas, out +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_simple(): """Verify basic scheduling works.""" block_size = 4 @@ -69,6 +71,7 @@ def test_simple(): assert len(seq_group_meta) == num_seq_group +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_chunk(): """Verify prefills are chunked properly.""" block_size = 4 @@ -113,6 +116,7 @@ def test_chunk(): assert out.num_batched_tokens == 57 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_complex(): block_size = 4 max_seqs = 60 @@ -176,6 +180,7 @@ def test_complex(): assert running[2].is_prefill() +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_maximal_decoding(): """Verify decoding requests are prioritized.""" block_size = 4 @@ -369,6 +374,7 @@ def cannot_append_second_group(seq_group, num_lookahead_slots): assert out.blocks_to_swap_out == {} +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_running_prefill_prioritized_over_swap(): block_size = 4 max_seqs = 30 @@ -517,6 +523,7 @@ def cannot_append_second_group(seq_group, num_lookahead_slots): assert out.num_batched_tokens == max_num_batched_tokens +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_chunked_prefill_max_seqs(): block_size = 4 max_seqs = 2 diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 3f0c918a89abb..3f45d55520934 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -11,6 +11,7 @@ from vllm.core.scheduler import Scheduler, SchedulingBudget from vllm.lora.request import LoRARequest from vllm.sequence import Logprob, SequenceGroup, SequenceStatus +from vllm.utils import is_hpu from .utils import create_dummy_prompt @@ -77,6 +78,7 @@ def test_scheduler_abort_seq_group(): assert scheduler.get_num_unfinished_seq_groups() == 0 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_scheduler_schedule_simple(): block_size = 4 num_seq_group = 4 @@ -144,6 +146,7 @@ def test_scheduler_prefill_prioritized(): assert get_sequence_groups(out) == [seq_group_b] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_scheduler_schedule_preempt_abort(): block_size = 4 max_model_len = 16 @@ -192,6 +195,7 @@ def test_scheduler_schedule_preempt_abort(): assert scheduler.get_num_unfinished_seq_groups() == 1 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_scheduler_max_seqs(): block_size = 4 num_seq_group = 4 @@ -233,6 +237,7 @@ def test_scheduler_max_seqs(): assert set(get_sequence_groups(out)) == set([all_seq_groups[1]]) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_scheduler_delay_factor(): block_size = 4 scheduler_config = 
SchedulerConfig(100, 64, 16, delay_factor=0.5) @@ -270,6 +275,7 @@ def test_scheduler_delay_factor(): append_new_token(out, 1) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_swapped_out_prioritized(): scheduler = initialize_scheduler(max_num_seqs=6) # best_of=2 * 3 == 6 sequences. @@ -571,6 +577,7 @@ def cannot_append_second_group(seq_group, num_lookahead_slots): assert output.blocks_to_copy == [] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_decode_swap_beam_search(): """ Test best_of > 1 swap out blocks @@ -621,6 +628,7 @@ def cannot_append_second_group(seq_group, num_lookahead_slots): assert output.blocks_to_copy == [] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_schedule_decode_blocks_to_copy_update(): """ Verify blocks_to_copy is updated. diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index b6f461b76ed03..1e0b85a1a17f0 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -3,14 +3,17 @@ import pytest import torch -import vllm.distributed.device_communicators.pynccl_utils as pynccl_utils -from vllm.distributed.communication_op import tensor_model_parallel_all_reduce -from vllm.distributed.device_communicators.pynccl import (NCCLCommunicator, - ncclGetUniqueId) -from vllm.distributed.parallel_state import ( - ensure_model_parallel_initialized, get_tensor_model_parallel_cpu_group, - init_distributed_environment, with_pynccl_for_all_reduce) -from vllm.utils import update_environment_variables +from vllm.utils import is_hpu, update_environment_variables + +if not is_hpu(): + import vllm.distributed.device_communicators.pynccl_utils as pynccl_utils + from vllm.distributed.communication_op import tensor_model_parallel_all_reduce + from vllm.distributed.device_communicators.pynccl import (NCCLCommunicator, + ncclGetUniqueId) + from vllm.distributed.parallel_state import ( + ensure_model_parallel_initialized, get_tensor_model_parallel_cpu_group, + init_distributed_environment, with_pynccl_for_all_reduce) + def distributed_run(fn, world_size): @@ -56,6 +59,7 @@ def worker_fn(): assert result == comm.world_size +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test.") def test_pynccl(): @@ -84,6 +88,7 @@ def multiple_tp_worker_fn(): assert result == 2 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test.") def test_pynccl_multiple_tp(): @@ -113,6 +118,7 @@ def multiple_tp_with_vllm_worker_fn(): assert result == 2 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test.") def test_pynccl_multiple_tp_with_vllm(): @@ -140,12 +146,14 @@ def worker_fn_with_cudagraph(): assert a.mean().cpu().item() == comm.world_size**1 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test.") def test_pynccl_with_cudagraph(): distributed_run(worker_fn_with_cudagraph, 2) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_ncclGetUniqueId(): unique_id = ncclGetUniqueId() # `list(unique_id.internal)` is something like this: diff --git a/tests/distributed/test_pynccl_library.py b/tests/distributed/test_pynccl_library.py index ec60a5ed3114d..67533a5866b55 
100644 --- a/tests/distributed/test_pynccl_library.py +++ b/tests/distributed/test_pynccl_library.py @@ -1,6 +1,8 @@ import multiprocessing import tempfile +import pytest +from vllm.utils import is_hpu def target_fn(env, filepath): from vllm.utils import update_environment_variables @@ -9,6 +11,7 @@ def target_fn(env, filepath): nccl_integrity_check(filepath) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_library_file(): # note: don't import vllm.distributed.device_communicators.pynccl # before running this test, otherwise the library file will be loaded diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py index ed35212cc3f11..ec64cdd9749ff 100644 --- a/tests/engine/test_computed_prefix_blocks.py +++ b/tests/engine/test_computed_prefix_blocks.py @@ -3,8 +3,10 @@ from vllm.engine.arg_utils import EngineArgs from vllm.engine.llm_engine import LLMEngine from vllm.sampling_params import SamplingParams +from vllm.utils import is_hpu +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("block_size", [16]) def test_computed_prefix_blocks(model: str, block_size: int): diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py index baa463a316902..169c9186599cd 100644 --- a/tests/engine/test_skip_tokenizer_init.py +++ b/tests/engine/test_skip_tokenizer_init.py @@ -2,8 +2,10 @@ from vllm.entrypoints.llm import LLM from vllm.sampling_params import SamplingParams +from vllm.utils import is_hpu +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", ["facebook/opt-125m"]) def test_skip_tokenizer_initialization(model: str): # This test checks if the flag skip_tokenizer_init skips the initialization diff --git a/tests/engine/test_stop_reason.py b/tests/engine/test_stop_reason.py index b2f521a8ae4ce..c3109330785b7 100644 --- a/tests/engine/test_stop_reason.py +++ b/tests/engine/test_stop_reason.py @@ -10,6 +10,7 @@ import transformers from vllm import SamplingParams +from vllm.utils import is_hpu MODEL = "facebook/opt-350m" STOP_STR = "." 
@@ -24,6 +25,7 @@ def vllm_model(vllm_runner): del vllm_model +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_stop_reason(vllm_model, example_prompts): tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL) stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR) diff --git a/tests/engine/test_stop_strings.py b/tests/engine/test_stop_strings.py index 6b747beb4b543..61d3cd485b80d 100644 --- a/tests/engine/test_stop_strings.py +++ b/tests/engine/test_stop_strings.py @@ -3,6 +3,7 @@ import pytest from vllm import CompletionOutput, LLMEngine, SamplingParams +from vllm.utils import is_hpu MODEL = "meta-llama/llama-2-7b-hf" MAX_TOKENS = 200 @@ -10,6 +11,8 @@ @pytest.fixture(scope="session") def vllm_model(vllm_runner): + if is_hpu(): + pytest.skip("Skipping test on HPU") return vllm_runner(MODEL) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index e53e64a0c1ff8..e5fe2246f9934 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -19,6 +19,7 @@ from openai import BadRequestError from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.utils import is_hpu MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds # any model with a chat template should work here @@ -123,6 +124,8 @@ def zephyr_lora_files(): @pytest.fixture(scope="session") def server(zephyr_lora_files): + if is_hpu(): + pytest.skip("Skipping test on HPU") ray.init() server_runner = ServerRunner.remote([ "--model", diff --git a/tests/entrypoints/test_server_oot_registration.py b/tests/entrypoints/test_server_oot_registration.py index 22e65bf7e7da1..c65d90a80e14c 100644 --- a/tests/entrypoints/test_server_oot_registration.py +++ b/tests/entrypoints/test_server_oot_registration.py @@ -2,13 +2,14 @@ import sys import time +import pytest import torch from openai import OpenAI, OpenAIError from vllm import ModelRegistry from vllm.model_executor.models.opt import OPTForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.utils import get_open_port +from vllm.utils import get_open_port, is_hpu class MyOPTForCausalLM(OPTForCausalLM): @@ -32,6 +33,7 @@ def server_function(port): runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_oot_registration_for_api_server(): port = get_open_port() server = multiprocessing.Process(target=server_function, args=(port, )) diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index 86ecc6412c648..7c6ed07ba61f0 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -6,14 +6,18 @@ from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul, NewGELU, SiluAndMul) +from vllm.utils import is_hpu DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing D = [512, 4096, 5120, 13824] # Arbitrary values for testing SEEDS = [0] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +if is_hpu(): + DEVICES = ["hpu"] +else: + DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) + ] @pytest.mark.parametrize("activation", ["silu", "gelu", "gelu_tanh"]) @@ -21,7 +25,7 @@ @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) 
+@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_act_and_mul( activation: str, @@ -31,9 +35,15 @@ def test_act_and_mul( seed: int, device: str, ) -> None: + + if is_hpu() and activation != "silu": + pytest.skip("Only SiluAndMul supported on HPU.") + torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) + elif is_hpu(): + torch.hpu.manual_seed(seed) torch.set_default_device(device) x = torch.randn(num_tokens, 2 * d, dtype=dtype) if activation == "silu": @@ -54,7 +64,7 @@ def test_act_and_mul( @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_activation( activation: Type[torch.nn.Module], @@ -64,9 +74,14 @@ def test_activation( seed: int, device: str, ) -> None: + if is_hpu(): + pytest.skip("GELU not supported on HPU.") + torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) + elif is_hpu(): + torch.hpu.manual_seed(seed) torch.set_default_device(device) x = torch.randn(num_tokens, d, dtype=dtype) layer = activation() diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 84539205e0ae3..f17a51b0ccf78 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -4,16 +4,21 @@ import pytest import torch from allclose_default import get_default_atol, get_default_rtol -from xformers import ops as xops -from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask -from vllm import _custom_ops as ops -from vllm.utils import get_max_shared_memory_bytes, is_hip +from vllm.utils import get_max_shared_memory_bytes, is_hip, is_hpu +if is_hpu(): + from vllm.hpu import ops, cache_ops + from vllm.hpu import xops + from vllm.hpu.attn_bias import BlockDiagonalCausalMask +else: + from vllm._C import ops, cache_ops + from xformers import ops as xops + from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. # - 512 as a buffer -MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512 +MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512 if not is_hpu() else 128 # There may not be enough gpu memory due to large NUM_BLOCKS. # Reduce NUM_BLOCKS when it happens. 
NUM_BLOCKS = 4321 # Arbitrary values for testing @@ -34,9 +39,12 @@ USE_ALIBI = [False, True] KV_CACHE_DTYPE = ["auto", "fp8"] SEEDS = [0] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +if is_hpu(): + DEVICES = ["hpu"] +else: + DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) + ] def ref_masked_attention( @@ -84,7 +92,10 @@ def ref_single_query_cached_kv_attention( block_number = int(block_table[j // block_size]) block_offset = j % block_size - k = key_cache[block_number, :, :, block_offset, :] + if is_hpu(): + k = key_cache[block_number, :, :, block_offset] + else: + k = key_cache[block_number, :, :, block_offset, :] k = k.reshape(num_kv_heads, head_size) keys.append(k) @@ -119,7 +130,7 @@ def ref_single_query_cached_kv_attention( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_paged_attention( kv_cache_factory, version: str, @@ -133,14 +144,24 @@ def test_paged_attention( seed: int, device: str, ) -> None: + if is_hpu(): + if version != "v1": + pytest.skip("Paged attention v2 not supported on HPU") + if kv_cache_dtype != "auto": + pytest.skip("Only auto kv_cache_dtype supported on HPU") + if use_alibi: + pytest.skip("Alibi not supported on HPU") + random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) + elif is_hpu(): + torch.hpu.manual_seed(seed) torch.set_default_device(device) scale = float(1.0 / (head_size**0.5)) num_query_heads, num_kv_heads = num_heads - query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype) + query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype, device=device) query.uniform_(-scale, scale) assert num_query_heads % num_kv_heads == 0 @@ -163,7 +184,7 @@ def test_paged_attention( for _ in range(max_num_blocks_per_seq) ] block_tables.append(block_table) - block_tables = torch.tensor(block_tables, dtype=torch.int) + block_tables = torch.tensor(block_tables, dtype=torch.int, device=device) # Create the KV caches. key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1, @@ -177,7 +198,21 @@ def test_paged_attention( # Call the paged attention kernel. output = torch.empty_like(query) - if version == "v1": + + if is_hpu(): + output = ops.paged_attention_v1( + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + seq_lens, + block_size, + alibi_slopes, + kv_cache_dtype, + ) + elif version == "v1": ops.paged_attention_v1( output, query, @@ -307,12 +342,13 @@ def ref_multi_query_kv_attention( # TODO(woosuk): Add tests for USE_ALIBI=True. 
+@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_multi_query_kv_attention( num_seqs: int, @@ -339,7 +375,8 @@ def test_multi_query_kv_attention( qkv = torch.empty(num_tokens, num_query_heads + 2 * num_kv_heads, head_size, - dtype=dtype) + dtype=dtype, + device=device) qkv.uniform_(-scale, scale) query, key, value = qkv.split( [num_query_heads, num_kv_heads, num_kv_heads], dim=1) @@ -373,4 +410,5 @@ def test_multi_query_kv_attention( ) atol = get_default_atol(output) if is_hip() else 1e-3 rtol = get_default_rtol(output) if is_hip() else 1e-5 + assert torch.allclose(output, ref_output, atol=atol, rtol=rtol) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 94a577139596e..f8163deb05223 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -1,14 +1,18 @@ import random from typing import Tuple +import math import pytest import torch -from vllm import _custom_ops as ops -from vllm._C import cache_ops -from vllm.utils import is_hip +from vllm.utils import is_hip, is_hpu +from vllm import _custom_ops as ops +if is_hpu(): + from vllm.hpu import cache_ops +else: + from vllm._C import cache_ops + -COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing @@ -22,9 +26,14 @@ NUM_MAPPINGS = [256] # Arbitrary values for testing SEEDS = [0] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +if is_hpu(): + COPYING_DIRECTION = [('hpu', 'cpu'), ('hpu', 'hpu'), ('cpu', 'hpu')] + DEVICES = ["hpu"] +else: + COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] + DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) + ] KV_CACHE_DTYPE = ["auto", "fp8"] @@ -36,8 +45,8 @@ @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_copy_blocks( kv_cache_factory, @@ -52,10 +61,15 @@ def test_copy_blocks( kv_cache_dtype: str, device: str, ) -> None: + if is_hpu() and kv_cache_dtype != "auto": + pytest.skip("Only auto kv_cache_dtype supported on HPU") + random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) + elif is_hpu(): + torch.hpu.manual_seed(seed) torch.set_default_device(device) # Generate random block mappings where each source block is mapped to two # destination blocks. @@ -78,14 +92,25 @@ def test_copy_blocks( dtype, seed, device) # Clone the KV caches. - cloned_key_caches = [key_cache.clone() for key_cache in key_caches] - cloned_value_caches = [value_cache.clone() for value_cache in value_caches] + cloned_key_caches = [key_cache.clone().to("cpu") for key_cache in key_caches] + cloned_value_caches = [value_cache.clone().to("cpu") for value_cache in value_caches] # Call the copy blocks kernel. 
block_mapping_tensor = torch.tensor(block_mapping, dtype=torch.int64, device=device).view(-1, 2) - ops.copy_blocks(key_caches, value_caches, block_mapping_tensor) + if is_hpu(): + tmp_block_mapping_dict = {} + for src, dst in block_mapping: + print(src, dst, tmp_block_mapping_dict) + if not tmp_block_mapping_dict.get(src): + tmp_block_mapping_dict[src] = [dst] + continue + tmp_block_mapping_dict[src].append(dst) + + ops.copy_blocks(key_caches, value_caches, tmp_block_mapping_dict) + else: + ops.copy_blocks(key_caches, value_caches, block_mapping_tensor) # Run the reference implementation. for src, dst in block_mapping: @@ -109,7 +134,7 @@ def test_copy_blocks( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_reshape_and_cache( @@ -126,14 +151,20 @@ def test_reshape_and_cache( ) -> None: if not is_hip() and kv_cache_dtype == "fp8": pytest.skip() # This test is not tuned for e5m2 cuda precision + if is_hpu() and kv_cache_dtype != "auto": + pytest.skip("Only auto kv_cache_dtype supported on HPU") random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) + elif is_hpu(): + torch.hpu.manual_seed(seed) torch.set_default_device(device) + # Create a random slot mapping. - num_slots = block_size * num_blocks - slot_mapping = random.sample(range(num_slots), num_tokens) + blocks = random.sample(range(num_blocks), num_tokens) + offsets = random.choices(range(block_size), k=num_tokens) + slot_mapping = [block * block_size + offset for block, offset in zip(blocks, offsets)] slot_mapping = torch.tensor(slot_mapping, dtype=torch.long) qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype) @@ -160,25 +191,29 @@ def test_reshape_and_cache( kv_scale = 1.0 # Call the reshape_and_cache kernel. - ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, - kv_cache_dtype, kv_scale) - - if kv_cache_dtype == "fp8": - result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) - ops.convert_fp8(key_cache, result_key_cache) - result_value_cache = torch.empty_like(value_cache, dtype=torch.float16) - ops.convert_fp8(value_cache, result_value_cache) + if is_hpu(): + cache_ops.reshape_and_cache(key, value, key_cache, value_cache, + slot_mapping.view((1, -1)), "auto", False) + else: + cache_ops.reshape_and_cache(key, value, key_cache, value_cache, + slot_mapping, "auto") # Run the reference implementation. 
- reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) - block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") - block_indicies = block_indicies.cpu().tolist() + if is_hpu(): + reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0].shape) + else: + reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) + block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + block_indices = block_indices.cpu().tolist() block_offsets = slot_mapping % block_size block_offsets = block_offsets.cpu().tolist() for i in range(num_tokens): - block_idx = block_indicies[i] + block_idx = block_indices[i] block_offset = block_offsets[i] - cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] + if is_hpu(): + cloned_key_cache[block_idx, :, :, block_offset] = reshaped_key[i] + else: + cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] cloned_value_cache[block_idx, :, :, block_offset] = value[i] if kv_cache_dtype == "fp8": @@ -195,6 +230,7 @@ def test_reshape_and_cache( assert torch.allclose(value_cache, cloned_value_cache) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @@ -202,7 +238,7 @@ def test_reshape_and_cache( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_reshape_and_cache_flash( @@ -279,7 +315,7 @@ def test_reshape_and_cache_flash( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_swap_blocks( @@ -295,6 +331,8 @@ def test_swap_blocks( device: str, kv_cache_dtype: str, ) -> None: + if is_hpu() and direction[0] == "hpu" and direction[1] == "cpu": + pytest.skip("Skipping test on HPU") if kv_cache_dtype == "fp8" and "cpu" in direction: pytest.skip() if not is_hip() and kv_cache_dtype == "fp8": @@ -303,9 +341,15 @@ def test_swap_blocks( torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) + elif is_hpu(): + torch.hpu.manual_seed(seed) - src_device = device if direction[0] == "cuda" else 'cpu' - dst_device = device if direction[1] == "cuda" else 'cpu' + if is_hpu(): + src_device = device if direction[0] == "hpu" else 'cpu' + dst_device = device if direction[1] == "hpu" else 'cpu' + else: + src_device = device if direction[0] == "cuda" else 'cpu' + dst_device = device if direction[1] == "cuda" else 'cpu' src_blocks = random.sample(range(num_blocks), num_mappings) # For the same device, mapping must not overlap @@ -341,6 +385,7 @@ def test_swap_blocks( dist_value_caches[0][dst].cpu()) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(not is_hip(), reason="FP8 conversion test requires e4m3") @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @@ -348,7 +393,7 @@ def test_swap_blocks( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) 
@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_fp8_conversion( num_heads: int, @@ -376,3 +421,87 @@ def test_fp8_conversion( ops.convert_fp8(cache_fp8, converted_cache) assert torch.allclose(cache, converted_cache, atol=0.001, rtol=0.1) + + +@pytest.mark.skipif(not is_hpu(), reason="This case is HPU-specific") +@pytest.mark.parametrize("num_tokens", NUM_TOKENS) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("block_size", BLOCK_SIZES) +@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", DEVICES) +@torch.inference_mode() +def test_reshape_and_cache_prompt( + kv_cache_factory, + num_tokens: int, + num_heads: int, + head_size: int, + block_size: int, + num_blocks: int, + dtype: torch.dtype, + seed: int, + device: str, +) -> None: + random.seed(seed) + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + elif is_hpu(): + torch.hpu.manual_seed(seed) + torch.set_default_device(device) + + # Create a random slot mapping. + num_block_indices_to_generate = math.ceil(num_tokens / block_size) + block_indices_ = random.sample(range(num_blocks), num_block_indices_to_generate) + block_offsets_ = [] + slot_mapping = [] + for i in block_indices_: + for j in range(block_size): + slot_mapping.append(i * block_size + j) + slot_mapping = slot_mapping[:num_tokens] + slot_mapping = torch.tensor(slot_mapping, dtype=torch.long) + + qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype) + _, key, value = qkv.unbind(dim=1) + + # Create the KV caches. + key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1, + num_heads, head_size, dtype, + None, seed, device) + key_cache, value_cache = key_caches[0], value_caches[0] + + # Clone the KV caches. + cloned_key_cache = key_cache.clone() + cloned_value_cache = value_cache.clone() + + # Call the reshape_and_cache kernel. + cache_ops.reshape_and_cache(key, value, key_cache, value_cache, + slot_mapping.view((1, -1)), "auto", True) + + # Run the reference implementation. + if is_hpu(): + reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0].shape) + else: + reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) + block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + block_indices = block_indices.cpu().tolist() + block_offsets = slot_mapping % block_size + block_offsets = block_offsets.cpu().tolist() + for i in range(0, num_tokens): + block_idx = block_indices[i] + block_offset = block_offsets[i] + cloned_key_cache[block_idx, :, :, block_offset] = key[i, :, :] + cloned_value_cache[block_idx, :, :, block_offset] = value[i, :, :] + + # Note: only checking cache areas specified by the slot mapping because + # the implementation may initialize whole blocks even if some of the offsets of the block + # are not present in the slot mapping. 
+ for i in range(0, num_tokens): + block_idx = block_indices[i] + block_offset = block_offsets[i] + assert torch.allclose(key_cache[block_idx, :, :, block_offset], + cloned_key_cache[block_idx, :, :, block_offset]) + assert torch.allclose(value_cache[block_idx, :, :, block_offset], + cloned_value_cache[block_idx, :, :, block_offset]) diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index 210d59e4f32fa..54385c6074068 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -2,6 +2,7 @@ import torch from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.utils import is_hpu DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing @@ -9,9 +10,12 @@ 8199] # Arbitrary values for testing ADD_RESIDUAL = [False, True] SEEDS = [0] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +if is_hpu(): + DEVICES = ["hpu"] +else: + DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) + ] @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @@ -19,7 +23,7 @@ @pytest.mark.parametrize("add_residual", ADD_RESIDUAL) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_rms_norm( num_tokens: int, @@ -29,14 +33,18 @@ def test_rms_norm( seed: int, device: str, ) -> None: + if is_hpu() and dtype == torch.half and add_residual: + pytest.skip("Skipping test on HPU") torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) + elif is_hpu(): + torch.hpu.manual_seed(seed) torch.set_default_device(device) layer = RMSNorm(hidden_size).to(dtype=dtype) layer.weight.data.normal_(mean=1.0, std=0.1) scale = 1 / (2 * hidden_size) - x = torch.randn(num_tokens, hidden_size, dtype=dtype) + x = torch.randn(1, num_tokens, hidden_size, dtype=dtype, device=device) x *= scale residual = torch.randn_like(x) * scale if add_residual else None diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 2356b9ec18b0d..8d52fbaa6cc25 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -10,6 +10,7 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.models.mixtral import MixtralMoE +from vllm.utils import is_hpu def torch_moe(a, w1, w2, score, topk): @@ -29,6 +30,7 @@ def torch_moe(a, w1, w2, score, topk): topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("m", [512, 222, 33, 1]) @pytest.mark.parametrize("n", [2048, 256, 1024]) @pytest.mark.parametrize("k", [128, 511, 1024]) @@ -53,6 +55,7 @@ def test_fused_moe( assert torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) @torch.inference_mode() diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index bf1856972cf33..49407acdf1a0e 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -6,6 +6,7 @@ from allclose_default import get_default_atol, get_default_rtol from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.utils import is_hpu IS_NEOX_STYLE = [True, False] 
DTYPES = [torch.half, torch.bfloat16, torch.float] @@ -15,11 +16,15 @@ BATCH_SIZES = [1, 5] # Arbitrary values for testing SEQ_LENS = [11, 8192] # Arbitrary values for testing SEEDS = [0] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +if is_hpu(): + DEVICES = ["hpu"] +else: + DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) + ] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @pytest.mark.parametrize("batch_size", BATCH_SIZES) @pytest.mark.parametrize("seq_len", SEQ_LENS) @@ -28,7 +33,7 @@ @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_rotary_embedding( is_neox_style: bool, @@ -76,6 +81,7 @@ def test_rotary_embedding( rtol=get_default_rtol(out_key)) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @pytest.mark.parametrize("batch_size", BATCH_SIZES) @pytest.mark.parametrize("seq_len", SEQ_LENS) @@ -84,7 +90,7 @@ def test_rotary_embedding( @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_batched_rotary_embedding( is_neox_style: bool, @@ -138,6 +144,7 @@ def test_batched_rotary_embedding( rtol=get_default_rtol(out_key)) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @pytest.mark.parametrize("batch_size", BATCH_SIZES) @pytest.mark.parametrize("seq_len", SEQ_LENS) @@ -146,7 +153,7 @@ def test_batched_rotary_embedding( @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_batched_rotary_embedding_multi_lora( is_neox_style: bool, diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 5a5987e2242fa..9350ab25c0f86 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -3,10 +3,15 @@ import pytest import torch -from xformers import ops as xops -from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask from vllm.attention.ops.prefix_prefill import context_attention_fwd +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import xops + from vllm.hpu.attn_bias import BlockDiagonalCausalFromBottomRightMask +else: + from xformers import ops as xops + from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask NUM_HEADS = [64] NUM_QUERIES_PER_KV = [1, 8, 64] @@ -18,6 +23,7 @@ SLIDING_WINDOW = [0, 16, 64, 128, 256, 512, 2048] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV) @pytest.mark.parametrize("head_size", HEAD_SIZES) diff --git a/tests/kernels/test_rand.py b/tests/kernels/test_rand.py index a4242d22eb489..a96a238834a49 100644 --- a/tests/kernels/test_rand.py +++ b/tests/kernels/test_rand.py @@ -5,8 +5,10 @@ from 
vllm.model_executor.layers.ops.rand import seeded_uniform from vllm.model_executor.utils import set_random_seed +from vllm.utils import is_hpu +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) @pytest.mark.parametrize("use_3d", [True, False]) diff --git a/tests/kernels/test_sampler.py b/tests/kernels/test_sampler.py index e28f809309ec5..4bab8caedbf62 100644 --- a/tests/kernels/test_sampler.py +++ b/tests/kernels/test_sampler.py @@ -10,6 +10,7 @@ sample) from vllm.model_executor.sampling_metadata import SamplingTensors from vllm.model_executor.utils import set_random_seed +from vllm.utils import is_hpu SINGLE_SPLIT_VOCAB_SIZE = 32000 # llama/mistral/mixtral vocab size MULTI_SPLIT_VOCAB_SIZE = MAX_TRITON_N_COLS + 100 @@ -30,6 +31,7 @@ def _uniform_to_exponential_kernel(input, output, n: tl.constexpr): tl.store(output + idx, y) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_uniform_to_exponential(): """Test that we can convert uniform to exponential without div by 0.""" input = torch.tensor([0.0, 1.0 - torch.finfo(torch.float32).eps], @@ -42,6 +44,7 @@ def test_uniform_to_exponential(): assert torch.all(torch.isfinite(torch.full_like(output, 1.0) / output)) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("random_sampling", [True, False, "mixed"]) @pytest.mark.parametrize("max_best_of", [1, 2, 3, 4, 5]) @pytest.mark.parametrize("modify_greedy_probs", [True, False]) @@ -121,6 +124,7 @@ def test_sample_decoding_only(random_sampling, max_best_of, assert sampled_logprobs is None +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("random_sampling", [True, False, "mixed"]) @pytest.mark.parametrize("max_best_of", [1, 2, 3, 4, 5]) @pytest.mark.parametrize("modify_greedy_probs", [True, False]) diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index 5ab863eea94b3..18fc5dfb55367 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -2,6 +2,7 @@ import vllm from vllm.lora.request import LoRARequest +from vllm.utils import is_hpu from .conftest import cleanup @@ -39,6 +40,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: return generated_texts +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_baichuan_lora(baichuan_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, @@ -61,6 +63,7 @@ def test_baichuan_lora(baichuan_lora_files): assert output2[i] == expected_lora_output[i] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skip("Requires multiple GPUs") def test_baichuan_tensor_parallel_equality(baichuan_lora_files): # Cannot use as it will initialize torch.cuda too early... 
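The diffs above and below repeat one gating pattern across the test suite: select devices with `is_hpu()`, seed whichever backend is present, and skip the tests HPU cannot run. A minimal sketch of that pattern, assuming `vllm.utils.is_hpu()` and `torch.hpu` are available as introduced by this patch series (the test names and tensor shapes below are illustrative only, not taken from the suite):

```python
# Minimal sketch of the HPU gating used throughout these test diffs.
# Assumes vllm.utils.is_hpu() and torch.hpu exist, as added by this patch
# series; the test bodies below are illustrative placeholders.
import pytest
import torch

from vllm.utils import is_hpu

SEED = 0
# Device parametrization: a single logical "hpu" device, otherwise the
# existing CUDA device list.
if is_hpu():
    DEVICES = ["hpu"]
else:
    DEVICES = [
        f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
    ]


@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU")
def test_cuda_only_feature() -> None:
    # Pattern 1: features without HPU support are skipped wholesale.
    ...


@pytest.mark.parametrize("device", DEVICES)
@torch.inference_mode()
def test_runs_on_both_backends(device: str) -> None:
    # Pattern 2: tests that run on both backends seed whichever backend
    # is actually present before allocating tensors on `device`.
    torch.random.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
    elif is_hpu():
        torch.hpu.manual_seed(SEED)
    torch.set_default_device(device)
    x = torch.randn(8, 16, dtype=torch.bfloat16)
    assert x.device.type in ("cuda", "hpu")
```

The same `is_hpu()` gate also drives the conditional imports seen in `tests/kernels/test_attention.py` and `tests/kernels/test_prefix_prefill.py`, where `vllm.hpu.xops` and `vllm.hpu.attn_bias` stand in for their xformers equivalents.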
diff --git a/tests/lora/test_chatglm3.py b/tests/lora/test_chatglm3.py index bd8cc98ef8ca0..34528c9a6bdcd 100644 --- a/tests/lora/test_chatglm3.py +++ b/tests/lora/test_chatglm3.py @@ -1,5 +1,8 @@ +import pytest + import vllm from vllm.lora.request import LoRARequest +from vllm.utils import is_hpu MODEL_PATH = "THUDM/chatglm3-6b" @@ -35,6 +38,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: return generated_texts +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_chatglm3_lora(chatglm3_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 0082c6e74e888..07219b4502822 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -1,5 +1,8 @@ +import pytest + import vllm from vllm.lora.request import LoRARequest +from vllm.utils import is_hpu MODEL_PATH = "google/gemma-7b" @@ -26,6 +29,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: return generated_texts +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_gemma_lora(gemma_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, diff --git a/tests/lora/test_layer_variation.py b/tests/lora/test_layer_variation.py index 7d37aa6474adc..a4f56a20ce838 100644 --- a/tests/lora/test_layer_variation.py +++ b/tests/lora/test_layer_variation.py @@ -8,6 +8,7 @@ import vllm from vllm.lora.request import LoRARequest +from vllm.utils import is_hpu from .conftest import cleanup @@ -70,6 +71,7 @@ def do_sample(llm, # step 1: init a base model and serve with LoRA to get the reference results # step 2: merge the same LoRA to the base model, serve the merged model # step 3: compare the results from step 1 and step 2 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("tp_size", [1]) @pytest.mark.parametrize("target_modules", TARGET_MODULES_LIST) @pytest.mark.parametrize("rank", [8, 16, 32, 64]) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 0eb04f4ccd133..db482c9821c73 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -32,6 +32,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.utils import set_random_seed +from vllm.utils import is_hpu from .utils import DummyLoRAManager @@ -171,6 +172,7 @@ def create_random_inputs( return inputs, index_mapping, prompt_mapping +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) @@ -265,6 +267,7 @@ def create_random_embedding_layer(): atol=atol) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() # @pytest.mark.skip( # reason="Fails when loras are in any slot other than the first.") @@ -402,6 +405,7 @@ def create_random_embedding_layer(): atol=atol) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) @@ -525,6 +529,7 @@ def _pretest(): atol=atol) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("orientation", ["row", "column"]) @@ -636,6 +641,7 @@ def create_random_linear_parallel_layer(): atol=atol) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() 
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("repeats", [1, 2, 3]) diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index f5a571e81acba..c12fc1a1cc213 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -3,6 +3,7 @@ import vllm from vllm.lora.request import LoRARequest +from vllm.utils import is_hpu from .conftest import cleanup @@ -36,6 +37,7 @@ def do_sample(llm, lora_path: str, lora_id: int): return generated_texts +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("tp_size", [1]) def test_llama_lora(sql_lora_files, tp_size): # Cannot use as it will initialize torch.cuda too early... @@ -80,6 +82,7 @@ def test_llama_lora(sql_lora_files, tp_size): print("removing lora") +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skip("Requires multiple GPUs") def test_llama_tensor_parallel_equality(sql_lora_files): # Cannot use as it will initialize torch.cuda too early... @@ -121,6 +124,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files): assert output_tp1 == output_tp4 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_llama_lora_warmup(sql_lora_files): """Test that the LLM initialization works with a warmup LORA path and is more conservative""" diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py index 3415d36b7e341..90363305e137c 100644 --- a/tests/lora/test_lora.py +++ b/tests/lora/test_lora.py @@ -2,6 +2,7 @@ import torch from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice +from vllm.utils import is_hpu from .utils import DummyLoRAManager @@ -21,6 +22,7 @@ } +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("m", TENSOR_SIZES) @pytest.mark.parametrize("n", TENSOR_SIZES) @pytest.mark.parametrize("k", BATCH_SIZES) @@ -71,6 +73,7 @@ def test_apply_lora(m, n, k, rank, dtype) -> None: manager.reset_lora() +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("m", TENSOR_SIZES) @pytest.mark.parametrize("n", TENSOR_SIZES) @pytest.mark.parametrize("k", BATCH_SIZES) @@ -140,6 +143,7 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: manager.reset_lora() +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("qkv", QKV_TENSOR_SIZES) @pytest.mark.parametrize("n", TENSOR_SIZES) @pytest.mark.parametrize("k", BATCH_SIZES) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index c08eee9910149..5146e22f77d57 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -17,6 +17,7 @@ from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager, WorkerLoRAManager) from vllm.model_executor.layers.linear import RowParallelLinear +from vllm.utils import is_hpu EMBEDDING_MODULES = { "embed_tokens": "input_embeddings", @@ -26,6 +27,7 @@ EMBEDDING_PADDING_MODULES = ["lm_head"] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_from_lora_tensors(sql_lora_files): tensors = load_file( os.path.join(sql_lora_files, "adapter_model.safetensors")) @@ -98,6 +100,7 @@ def create_packed_lora( return LoRAModel(lora_id, 8, loras) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_replace_submodules(dist_init, dummy_model): model = dummy_model model.supported_lora_modules = ["dense1", "layer1.dense2"] @@ -116,6 +119,7 @@ def test_replace_submodules(dist_init, dummy_model): RowParallelLinearWithLoRA) +@pytest.mark.skipif(is_hpu(), 
reason="Skipping test on HPU") def test_lora_model_manager(dist_init, dummy_model): model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] @@ -162,6 +166,7 @@ def test_lora_model_manager(dist_init, dummy_model): assert manager.lora_index_to_id[1] == 2 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_lora_lru_cache_model_manager(dist_init, dummy_model): model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] @@ -211,6 +216,7 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model): assert manager.lora_index_to_id[1] == 3 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_lru_lora_model_manager(dist_init, dummy_model): # This tests just the LRU cache functionality, everything else is # tested in test_lora_model_manager @@ -289,6 +295,7 @@ def test_lru_lora_model_manager(dist_init, dummy_model): assert all(x is None for x in manager.lora_index_to_id) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_lru_cache_worker_lora_manager(llama_2_7b_model_extra_embeddings, sql_lora_files): lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) @@ -362,6 +369,7 @@ def test_lru_cache_worker_lora_manager(llama_2_7b_model_extra_embeddings, ], mapping) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_worker_lora_manager(llama_2_7b_model_extra_embeddings, sql_lora_files): # Should remove every LoRA not specified in the request. @@ -432,6 +440,7 @@ def test_worker_lora_manager(llama_2_7b_model_extra_embeddings, ], mapping) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_packed_loras(dist_init, dummy_model_gate_up): model = dummy_model_gate_up model.supported_lora_modules = ["gate_up_proj"] diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index fd2a1b75f460c..f2d62d5fca0bb 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -4,6 +4,7 @@ import torch import vllm.lora.punica as punica +from vllm.utils import is_hpu def assert_close(a, b): @@ -102,6 +103,7 @@ def _lora_ref_impl( ] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) @pytest.mark.parametrize("h1", H1) @pytest.mark.parametrize("r", R) @@ -144,6 +146,7 @@ def test_lora_a_extra_shapes(dtype_str, h1, r, seed): assert_close(y_ref, y_our) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) @pytest.mark.parametrize("h1", H1) @pytest.mark.parametrize("h2", H2) @@ -178,6 +181,7 @@ def test_lora_correctness(dtype_str, h1, h2, seed, device): assert_close(y_ref, y_our) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) @pytest.mark.parametrize("h1", H1) @pytest.mark.parametrize("h2", H2) diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 3d86a4366aa57..3e8440fd4c25f 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -7,6 +7,7 @@ import vllm from vllm.lora.request import LoRARequest +from vllm.utils import is_hpu from .conftest import cleanup @@ -54,6 +55,7 @@ def format_prompt_tuples(prompt): return generated_texts +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tp_size", [1]) def test_quant_model_lora(tinyllama_lora_files, model, tp_size): diff --git 
a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 732e91a52c0a9..943a9170605c2 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -3,13 +3,17 @@ import tempfile from unittest.mock import patch +import pytest + from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig) from vllm.lora.models import LoRAMapping from vllm.lora.request import LoRARequest from vllm.worker.worker import Worker +from vllm.utils import is_hpu +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @patch.dict(os.environ, {"RANK": "0"}) def test_worker_apply_lora(sql_lora_files): worker = Worker( diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index e0aa14f165c2d..f8a4da4349a76 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -7,12 +7,14 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.sampling_params import SamplingParams +from vllm.utils import is_hpu MODELS = [ "facebook/opt-125m", ] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [128]) @@ -46,6 +48,7 @@ def test_metric_counter_prompt_tokens( f"metric: {metric_count!r}") +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [128]) @@ -78,6 +81,7 @@ def test_metric_counter_generation_tokens( f"metric: {metric_count!r}") +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize( diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index a7abc011f57d7..4425fc87e59b9 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -7,11 +7,15 @@ import torch from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.utils import is_hpu -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] -aqlm_not_supported = (capability < - QUANTIZATION_METHODS["aqlm"].get_min_capability()) +if not is_hpu(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + aqlm_not_supported = (capability < + QUANTIZATION_METHODS["aqlm"].get_min_capability()) +else: + aqlm_not_supported = False # In this test we hardcode prompts and generations for the model so we don't # need to require the AQLM package as a dependency @@ -63,7 +67,7 @@ 'The early bird catches the worm.\nThe early bird catches the' ] - +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(aqlm_not_supported, reason="AQLM is not supported on this GPU type.") @pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"]) @@ -78,7 +82,6 @@ def test_models( max_tokens: int, num_logprobs: int, ) -> None: - vllm_model = vllm_runner(model, dtype=dtype) vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts, max_tokens, diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py index 3dde498bcd639..f8425598cd756 100644 --- a/tests/models/test_big_models.py +++ b/tests/models/test_big_models.py @@ -5,6 +5,7 @@ Run `pytest tests/models/test_big_models.py`. 
""" import pytest +from vllm.utils import is_hpu MODELS = [ "meta-llama/Llama-2-7b-hf", @@ -17,6 +18,7 @@ ] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) @@ -44,7 +46,7 @@ def test_models( assert hf_output_ids == vllm_output_ids, ( f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") - +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_model_print( diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index e87a1783a83f1..00bdcf578ed15 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -10,6 +10,7 @@ from vllm import LLM, SamplingParams from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.utils import is_hpu os.environ["TOKENIZERS_PARALLELISM"] = "true" @@ -43,12 +44,16 @@ ], } -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] -fp8_not_supported = (capability < - QUANTIZATION_METHODS["fp8"].get_min_capability()) +if not is_hpu(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + fp8_not_supported = (capability < + QUANTIZATION_METHODS["fp8"].get_min_capability()) +else: + fp8_not_supported = True +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(fp8_not_supported, reason="fp8 is not supported on this GPU type.") @pytest.mark.parametrize("model_name", MODELS) diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index 4d73843f970c4..37930e2708eaa 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -17,15 +17,19 @@ from tests.models.utils import check_logprobs_close from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.utils import is_hpu os.environ["TOKENIZERS_PARALLELISM"] = "true" MAX_MODEL_LEN = 1024 -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] -gptq_marlin_not_supported = ( - capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability()) +if not is_hpu(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + gptq_marlin_not_supported = ( + capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability()) +else: + gptq_marlin_not_supported = True MODELS = [ # act_order==False, group_size=channelwise @@ -49,6 +53,7 @@ ] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.flaky(reruns=2) @pytest.mark.skipif(gptq_marlin_not_supported, reason="gptq_marlin is not supported on this GPU type.") diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index f86cd3fa88f5d..1d2e99cd566e5 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -8,6 +8,7 @@ from transformers import AutoTokenizer from vllm.config import VisionLanguageConfig +from vllm.utils import is_hpu model_and_vl_config = [ ("llava-hf/llava-1.5-7b-hf", @@ -62,6 +63,7 @@ def sanitize_vllm_output(vllm_output: Tuple[List[int], str], return sanitized_input_ids, sanitzied_output_str +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("worker_use_ray", [False]) @pytest.mark.parametrize("model_and_config", model_and_vl_config) @pytest.mark.parametrize("dtype", ["half"]) diff --git 
a/tests/models/test_marlin.py b/tests/models/test_marlin.py index fa846d43d0e88..9b3d4bdba775c 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -17,11 +17,15 @@ from tests.models.utils import check_logprobs_close from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.utils import is_hpu -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] -marlin_not_supported = (capability < - QUANTIZATION_METHODS["marlin"].get_min_capability()) +if not is_hpu(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + marlin_not_supported = (capability < + QUANTIZATION_METHODS["marlin"].get_min_capability()) +else: + marlin_not_supported = True @dataclass @@ -40,6 +44,7 @@ class ModelPair: ] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.flaky(reruns=2) @pytest.mark.skipif(marlin_not_supported, reason="Marlin is not supported on this GPU type.") diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py index 7aeff3a913098..2500d572ebefa 100644 --- a/tests/models/test_mistral.py +++ b/tests/models/test_mistral.py @@ -3,12 +3,14 @@ Run `pytest tests/models/test_mistral.py`. """ import pytest +from vllm.utils import is_hpu MODELS = [ "mistralai/Mistral-7B-Instruct-v0.1", ] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) diff --git a/tests/models/test_models.py b/tests/models/test_models.py index e4609620387fa..9fc62f29ed0c9 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -6,6 +6,7 @@ Run `pytest tests/models/test_models.py`.
""" import pytest +from vllm.utils import is_hpu MODELS = [ "facebook/opt-125m", @@ -20,6 +21,7 @@ ] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [96]) @@ -51,6 +53,7 @@ def test_models( f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) def test_model_print( diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index 50ab06631500b..f03c657dac4a2 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -1,3 +1,4 @@ +import pytest import torch from vllm import LLM, ModelRegistry, SamplingParams diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index 6820b2728e3c9..b125de8906b9b 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -8,7 +8,7 @@ import pytest from vllm.config import ModelConfig - +from vllm.utils import is_hpu @dataclass class ModelPair: @@ -53,7 +53,8 @@ class ModelPair: @pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES) def test_auto_gptq(model_arg_exptype: str) -> None: model_path, quantization_arg, expected_type = model_arg_exptype - + if is_hpu() and model_path in ('TheBloke/Llama-2-7B-Chat-GPTQ', 'LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit'): + pytest.skip("Skipping test on HPU") try: model_config = ModelConfig(model_path, model_path, diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 607544a1c8394..c9ee2d5d05fa4 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -7,11 +7,15 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod +from vllm.utils import is_hpu -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] - +if not is_hpu(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] +else: + capability = 0 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif( capability < QUANTIZATION_METHODS["fp8"].get_min_capability(), reason="FP8 is not supported on this GPU type.") diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index 2682f284505bd..176371b5cd166 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -6,6 +6,7 @@ import pytest import torch +from vllm.utils import is_hpu # FIXME(zhuohan): The test can not pass if we: # 1. Increase max_tokens to 256. 
@@ -16,6 +17,7 @@ MODELS = ["facebook/opt-125m"] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", MAX_TOKENS) diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index 3788e9e9752ff..7e95d0fa60c3d 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -2,10 +2,12 @@ import torch from vllm import SamplingParams +from vllm.utils import is_hpu MODELS = ["facebook/opt-125m"] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_logits_processor_force_generate( diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 57d6d2a410ee5..3d2597ab33cce 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -3,10 +3,12 @@ from tests.conftest import VllmRunner from vllm import SamplingParams +from vllm.utils import is_hpu MODELS = ["facebook/opt-125m"] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1]) diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py index 5e93238d709ec..81d78ed987dac 100644 --- a/tests/samplers/test_ranks.py +++ b/tests/samplers/test_ranks.py @@ -1,10 +1,12 @@ import pytest from vllm import SamplingParams +from vllm.utils import is_hpu MODELS = ["facebook/opt-125m"] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_ranks( diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 13b5b80cccfdc..c96862f3231a3 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -7,10 +7,14 @@ from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.model_executor.utils import set_random_seed +from vllm.utils import is_hpu -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +if is_hpu(): + DEVICES = ["hpu"] +else: + DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) + ] def mock_causal_accepted_tensor( @@ -38,11 +42,12 @@ def mock_causal_accepted_tensor( return accepted +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", list(range(10))) @pytest.mark.parametrize( "which_tokens_accepted", ["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"]) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_correct_output_format(which_tokens_accepted: str, seed: int, device: str): @@ -124,10 +129,11 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int, assert torch.all(output_token_ids[subsequent_mask] == -1) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("k", list(range(1, 6))) @pytest.mark.parametrize("vocab_size", [30_000, 50_000]) @pytest.mark.parametrize("batch_size", list(range(1, 32))) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_no_crash_with_varying_dims(k: int, 
vocab_size: int, batch_size: int, device: str): @@ -150,10 +156,11 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, draft_token_ids) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"]) @pytest.mark.parametrize("which_token_ids", ["bonus_token_ids", "draft_token_ids"]) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_raises_when_vocab_oob(above_or_below_vocab_range: str, which_token_ids: str, device: str): @@ -198,6 +205,7 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str, draft_token_ids) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("draft_and_target_probs_equal", [True, False]) @pytest.mark.parametrize("seed", list(range(5))) @torch.inference_mode() diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index e4fea165a4d46..570cbe420f592 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -11,7 +11,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_random_seed from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import Counter +from vllm.utils import Counter, is_hpu from vllm.worker.model_runner import ModelRunner @@ -44,9 +44,12 @@ def _prepare_test( VOCAB_SIZE = 32000 RANDOM_SEEDS = list(range(128)) -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +if is_hpu(): + DEVICES = ["hpu"] +else: + DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) + ] def _do_sample( @@ -80,7 +83,7 @@ def _do_sample( @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_sampler_all_greedy(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) @@ -100,7 +103,7 @@ def test_sampler_all_greedy(seed: int, device: str): @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_sampler_all_random(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) @@ -125,8 +128,9 @@ def test_sampler_all_random(seed: int, device: str): del model_runner +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_sampler_all_random_seed(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) @@ -151,8 +155,9 @@ def test_sampler_all_random_seed(seed: int, device: str): del model_runner +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_sampler_all_random_seed_deterministic(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) @@ -176,7 +181,7 @@ def test_sampler_all_random_seed_deterministic(seed: int, device: str): @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_sampler_all_beam(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) 
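The rejection-sampler and sampler tests above swap the hard-coded CUDA_DEVICES list for a DEVICES list that resolves to ["hpu"] on Gaudi, so device-parametrized cases run unmodified on either backend. A self-contained sketch of the pattern (the test name and tensor shape are illustrative, not part of this patch):

    import pytest
    import torch
    from vllm.utils import is_hpu

    # Same selection logic as in the tests above: a single "hpu" entry on Gaudi,
    # otherwise one or two visible CUDA devices. On Gaudi, habana_frameworks.torch
    # must already be imported for the "hpu" device type to be registered.
    DEVICES = ["hpu"] if is_hpu() else [
        f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
    ]

    @pytest.mark.parametrize("device", DEVICES)
    def test_default_device_follows_parametrization(device: str):
        torch.set_default_device(device)
        x = torch.zeros(8)
        assert x.device.type == device.split(":")[0]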
@@ -198,7 +203,7 @@ def test_sampler_all_beam(seed: int, device: str): @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_sampler_min_tokens_penalty(seed: int, device: str): seq_id_counter = Counter(start=random.randint(0, 100)) set_random_seed(seed) @@ -486,8 +491,9 @@ def run_test_case(*, run_test_case(**test_case) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_sampler_mixed(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) @@ -589,7 +595,7 @@ def test_sampling(model_runner: ModelRunner): @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_sampler_top_k_top_p(seed: int, device: str): set_random_seed(seed) batch_size = random.randint(1, 256) diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index 3cd659cef58da..72a792c2d757f 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -10,6 +10,7 @@ from vllm import SamplingParams from vllm.model_executor.utils import set_random_seed +from vllm.utils import is_hpu MODEL = "facebook/opt-125m" RANDOM_SEEDS = list(range(5)) @@ -22,6 +23,7 @@ def vllm_model(vllm_runner): del vllm_model +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", RANDOM_SEEDS) def test_random_sample_with_seed( vllm_model, diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py index 60c20ed7db7a3..7786e864d2a4f 100644 --- a/tests/spec_decode/e2e/test_compatibility.py +++ b/tests/spec_decode/e2e/test_compatibility.py @@ -1,10 +1,12 @@ import pytest from vllm import SamplingParams +from vllm.utils import is_hpu from .conftest import get_output_from_llm_generator +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -55,6 +57,7 @@ def test_spec_decode_xfail_ray(test_llm_generator): ray.shutdown() +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -94,6 +97,7 @@ def test_spec_decode_xfail_chunked_prefill(test_llm_generator): sampling_params) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -146,6 +150,7 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator): sampling_params) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("common_llm_kwargs", [{ "model": "JackFram/llama-68m", "speculative_model": "JackFram/llama-68m", diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py index 9572aac7df6e0..882cb8dd9dbac 100644 --- a/tests/spec_decode/e2e/test_logprobs.py +++ b/tests/spec_decode/e2e/test_logprobs.py @@ -4,10 +4,12 @@ import pytest from vllm import SamplingParams +from vllm.utils import is_hpu from .conftest import get_logprobs_from_llm_generator +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -45,6 +47,7 @@ def test_logprobs_equality(baseline_llm_generator, test_llm_generator, force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") 
@pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -86,6 +89,7 @@ def test_diff_num_logprobs(baseline_llm_generator, test_llm_generator, logprob_rank=num_logprobs) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -125,6 +129,7 @@ def test_logprobs_different_k(baseline_llm_generator, test_llm_generator, force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -168,6 +173,7 @@ def test_logprobs_when_skip_speculation(baseline_llm_generator, force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index f15fcc4746d20..565936dd50c5d 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -34,11 +34,13 @@ from transformers import AutoTokenizer from vllm import SamplingParams +from vllm.utils import is_hpu from .conftest import (get_output_from_llm_generator, run_greedy_equality_correctness_test) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -110,6 +112,7 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator, assert actual_tokens.strip() == expected_tokens.strip() +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -148,6 +151,7 @@ def test_spec_decode_e2e_with_async_engine(test_llm_generator, force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -202,6 +206,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -253,6 +258,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -299,6 +305,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( force_output_len=False) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -343,6 +350,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1( force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -387,6 +395,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -434,6 +443,7 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption( force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -487,6 +497,7 @@ def test_spec_decode_different_block_size(baseline_llm_generator, force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -536,6 +547,7 @@ def test_skip_speculation(baseline_llm_generator, test_llm_generator, force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( 
"common_llm_kwargs", [{ diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py index 44ef400c91d34..bf2641ff2ffbc 100644 --- a/tests/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/spec_decode/e2e/test_ngram_correctness.py @@ -26,9 +26,12 @@ import pytest +from vllm.utils import is_hpu + from .conftest import run_greedy_equality_correctness_test +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -70,6 +73,7 @@ def test_ngram_e2e_greedy_correctness(baseline_llm_generator, force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -119,6 +123,7 @@ def test_ngram_e2e_greedy_correctness_with_preemption(baseline_llm_generator, force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index 43cfd78ddb0cc..6013ce17b608f 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -2,6 +2,7 @@ import torch from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer +from vllm.utils import is_hpu from .utils import create_seq_group_metadata_from_prompts, mock_worker @@ -27,6 +28,7 @@ def test_create_target_seq_id_iterator(num_target_seq_ids: int): assert next(iterator) > max_seq_id +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.skip_global_cleanup def test_get_token_ids_to_score(k: int): diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py index 312878804b86e..d85f264c33099 100644 --- a/tests/spec_decode/test_metrics.py +++ b/tests/spec_decode/test_metrics.py @@ -5,8 +5,10 @@ import torch from vllm.spec_decode.metrics import AsyncMetricsCollector +from vllm.utils import is_hpu +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_initial_call_returns_none(): """Expect first call to get metrics to return None. """ @@ -25,6 +27,7 @@ def test_initial_call_returns_none(): assert maybe_metrics is None +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_second_call_returns_metrics(): """Expect second call to not return None. """ @@ -52,6 +55,7 @@ def test_second_call_returns_metrics(): assert metrics is not None +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("rank", [1, 2, 3, 4]) def test_nonzero_rank_noop(rank): """Verify nonzero ranks don't collect metrics. @@ -72,6 +76,7 @@ def test_nonzero_rank_noop(rank): assert metrics is None +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_noop_until_time(): """Verify metrics aren't collected until enough time passes. """ @@ -105,6 +110,7 @@ def test_noop_until_time(): assert metrics is not None +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("has_data", [True, False]) def test_initial_metrics_has_correct_values(has_data: bool): """Test correctness of metrics data. 
diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index cb2de97a4af94..b907365d6261d 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -8,6 +8,7 @@ from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.top1_proposer import Top1Proposer +from vllm.utils import is_hpu from vllm.worker.worker import Worker from .utils import (assert_logprobs_dict_allclose, create_batch, @@ -68,6 +69,7 @@ def test_assert_enough_kv_space(num_steps: int): seq_group_metadata.block_tables = original_block_tables +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() def test_same_output_for_single_step(): """Verify the multi step worker produces the same output as the normal @@ -150,6 +152,7 @@ def test_same_output_for_single_step(): assert_logprobs_dict_allclose(actual_logprobs, expected_logprobs) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() def test_same_output_for_multi_step(): """Verify the multi-step worker produces the same output as the normal @@ -269,6 +272,7 @@ def test_same_output_for_multi_step(): single_step_logprobs) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() def test_draft_proposals_full_speculation_len(): """Verify Top1Proposer correctly handles case where all sequences @@ -321,6 +325,7 @@ def test_draft_proposals_full_speculation_len(): assert proposals.proposal_lens.tolist() == [k for _ in range(batch_size)] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() def test_draft_proposals_no_speculations(): """Verify Top1Proposer correctly handles case where no sequences @@ -358,6 +363,7 @@ def test_draft_proposals_no_speculations(): assert proposals.proposal_lens.tolist() == [0 for _ in range(batch_size)] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() def test_draft_proposals_mixed_k(): """Verify Top1Proposer correctly handles case some sequences can diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py index de305c4030aa9..931a7176b4555 100644 --- a/tests/spec_decode/test_ngram_worker.py +++ b/tests/spec_decode/test_ngram_worker.py @@ -1,12 +1,15 @@ import torch +import pytest from vllm.sequence import ExecuteModelRequest from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.top1_proposer import Top1Proposer +from vllm.utils import is_hpu from .utils import create_seq_group_metadata_from_prompts, create_worker +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_ngram_algo_correctness_for_single_no_match(): """Verify our ngram algo find the right candidate in the prompt @@ -63,6 +66,7 @@ def test_ngram_algo_correctness_for_single_no_match(): assert proposals.proposal_lens.tolist() == [0] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_ngram_algo_correctness_for_batches_not_match_all(): """Verify our ngram algo find the right candidate in the prompt @@ -139,6 +143,7 @@ def test_ngram_algo_correctness_for_batches_not_match_all(): assert proposals.proposal_token_ids[4][i] == -1 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_ngram_algo_correctness_for_batches_match_all(): """Verify our ngram algo find the right candidate in the prompt diff --git a/tests/spec_decode/test_spec_decode_worker.py 
b/tests/spec_decode/test_spec_decode_worker.py index ef9d32f73d668..9076ed3ce6eb0 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -14,10 +14,11 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker, split_num_cache_blocks_evenly) +from vllm.utils import is_hpu from .utils import create_batch, create_sampler_output_list, mock_worker - +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.parametrize('batch_size', [1, 2, 32]) @torch.inference_mode() @@ -50,6 +51,7 @@ def test_correctly_calls_draft_model(k: int, batch_size: int): assert actual_execute_model_data == execute_model_req +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.parametrize('batch_size', [1, 2, 32]) @torch.inference_mode() @@ -130,6 +132,7 @@ def test_correctly_calls_target_model(k: int, batch_size: int): assert expected_seen_contexts == seen_contexts +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.parametrize('batch_size', [1, 2, 32]) @torch.inference_mode() @@ -218,6 +221,7 @@ def test_correctly_calls_rejection_sampler(k: int, batch_size: int): assert torch.equal(actual.draft_probs, proposal_probs) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.parametrize('batch_size', [1, 2, 32]) @torch.inference_mode() @@ -340,6 +344,7 @@ def test_correctly_formats_output(k: int, batch_size: int): i].output_token +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [1, 2]) @pytest.mark.parametrize('batch_size', [1]) @pytest.mark.parametrize('returns_metrics', [True, False]) @@ -436,6 +441,7 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool): assert args[0] == k or kwargs.get('k', -1) == k +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [0]) @pytest.mark.parametrize('batch_size', [1, 2, 32]) @torch.inference_mode() @@ -476,6 +482,7 @@ def test_k_equals_zero(k: int, batch_size: int): target_worker.execute_model.assert_called_once_with(execute_model_req) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [0, 5]) @pytest.mark.parametrize('batch_size', [0]) @torch.inference_mode() diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index df1db4e6c4001..9426dc2e6d45f 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -14,6 +14,7 @@ from vllm.model_executor.model_loader.tensorizer import ( EncryptionParams, TensorizerConfig, TensorSerializer, is_vllm_serialized_tensorizer, load_with_tensorizer, open_stream) +from vllm.utils import is_hpu prompts = [ "Hello, my name is", @@ -74,6 +75,7 @@ def test_is_vllm_model_without_vllm_in_uri(tensorizer_config): assert result is False +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_deserialized_vllm_model_has_same_outputs(vllm_runner, tmp_path): vllm_model = vllm_runner(model_ref) model_path = tmp_path / (model_ref + ".tensors") @@ -99,6 +101,7 @@ def test_deserialized_vllm_model_has_same_outputs(vllm_runner, tmp_path): assert outputs == deserialized_outputs +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") 
@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_can_deserialize_s3(vllm_runner): model_ref = "EleutherAI/pythia-1.4b" @@ -118,6 +121,7 @@ def test_can_deserialize_s3(vllm_runner): assert deserialized_outputs +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_deserialized_encrypted_vllm_model_has_same_outputs( vllm_runner, tmp_path): @@ -151,6 +155,7 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs( assert outputs == deserialized_outputs +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, tmp_path): hf_model = hf_runner(model_ref) @@ -176,6 +181,7 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, assert outputs == deserialized_outputs +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): from huggingface_hub import snapshot_download @@ -217,6 +223,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): assert loaded_vllm_model +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_load_without_tensorizer_load_format(vllm_runner): with pytest.raises(ValueError): vllm_runner(model_ref, @@ -224,6 +231,7 @@ def test_load_without_tensorizer_load_format(vllm_runner): tensorizer_uri="test", vllm_tensorized=False)) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_tensorize_vllm_model(tmp_path): # Test serialize command @@ -251,6 +259,7 @@ def test_tensorize_vllm_model(tmp_path): f"\n{result.stdout}\n{result.stderr}") +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_openai_apiserver_with_tensorizer(tmp_path): ## Serialize model @@ -301,6 +310,7 @@ def test_openai_apiserver_with_tensorizer(tmp_path): completion_tokens=5, prompt_tokens=6, total_tokens=11) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_raise_value_error_on_invalid_load_format(vllm_runner): with pytest.raises(ValueError): vllm_runner(model_ref, diff --git a/tests/test_config.py b/tests/test_config.py index 19db10630bbae..5b642666e7cca 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,9 @@ +import pytest from vllm.config import ModelConfig +from vllm.utils import is_hpu +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_get_sliding_window(): TEST_SLIDING_WINDOW = 4096 # Test that the sliding window is correctly computed. 
diff --git a/tests/test_logits_processor.py b/tests/test_logits_processor.py index 179e8d25a341b..536ba3dc1d6fa 100644 --- a/tests/test_logits_processor.py +++ b/tests/test_logits_processor.py @@ -9,6 +9,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_random_seed from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata +from vllm.utils import is_hpu from vllm.worker.model_runner import ModelRunner @@ -48,13 +49,17 @@ def _prepare_test( RANDOM_SEEDS = list(range(128)) -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +if is_hpu(): + DEVICES = ["hpu"] +else: + DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) + ] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_logits_processors(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 9bc9becb2a6f1..b3cd37a43a8ef 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -7,6 +7,7 @@ from vllm.transformers_utils.detokenizer import (Detokenizer, detokenize_incrementally) from vllm.transformers_utils.tokenizer_group import get_tokenizer_group +from vllm.utils import is_hpu TRUTH = [ "Hello here, this is a simple test", @@ -55,6 +56,8 @@ def _run_incremental_decode(tokenizer, all_input_ids, @pytest.mark.parametrize("skip_special_tokens", (True, False)) def test_decode_streaming(tokenizer_id, truth, with_prompt, skip_special_tokens): + if is_hpu() and tokenizer_id == "meta-llama/Llama-2-7b-hf": + pytest.skip("Skipping test on HPU") tokenizer = AutoTokenizer.from_pretrained(tokenizer_id) if with_prompt: truth_tokens = tokenizer(truth, add_special_tokens=False)["input_ids"] @@ -114,6 +117,8 @@ def detokenizer(tokenizer_name: str) -> Detokenizer: @pytest.fixture(name="complete_sequence_token_ids") def create_complete_sequence_token_ids(complete_sequence: str, tokenizer_name: str) -> List[int]: + if is_hpu() and tokenizer_name == "meta-llama/Llama-2-7b-hf": + pytest.skip("Skipping test on HPU") tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) complete_sequence_token_ids = tokenizer(complete_sequence)["input_ids"] return complete_sequence_token_ids @@ -145,6 +150,8 @@ def test_decode_sequence_logprobs(complete_sequence: str, detokenizer: Detokenizer, skip_special_tokens: bool): """Verify Detokenizer decodes logprobs correctly.""" + if is_hpu() and detokenizer == "meta-llama/Llama-2-7b-hf": + pytest.skip("Skipping test on HPU") sampling_params = SamplingParams(skip_special_tokens=skip_special_tokens, logprobs=2) @@ -181,6 +188,8 @@ def test_decode_prompt_logprobs(complete_sequence: str, detokenizer: Detokenizer, skip_special_tokens: bool): """Verify Detokenizer decodes prompt logprobs correctly.""" + if is_hpu() and detokenizer == "meta-llama/Llama-2-7b-hf": + pytest.skip("Skipping test on HPU") sampling_params = SamplingParams(skip_special_tokens=skip_special_tokens, prompt_logprobs=1) diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index e7975d0ef48b9..e54071c8b7dca 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -5,10 +5,11 @@ from vllm.distributed.parallel_state import 
init_distributed_environment from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import get_open_port +from vllm.utils import get_open_port, is_hpu from vllm.worker.model_runner import ModelRunner, _get_graph_batch_size +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("batch_size", list(range(1, 257))) def test_prepare_prompt(batch_size): scheduler_config = SchedulerConfig(100000, @@ -121,6 +122,7 @@ def test_prepare_prompt(batch_size): torch.testing.assert_close(actual, expected) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("batch_size", list(range(1, 257))) def test_prepare_decode_cuda_graph(batch_size): model_config = ModelConfig( @@ -212,6 +214,7 @@ def test_prepare_decode_cuda_graph(batch_size): torch.testing.assert_close(actual, expected) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_empty_seq_group(): """Verify prepare prompt and decode returns empty output.""" model_config = ModelConfig( @@ -257,6 +260,7 @@ def distributed_init(): local_rank=0) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("batch_size", list(range(2, 128))) @pytest.mark.parametrize("enforce_eager", [True, False]) def test_hybrid_batches(batch_size, enforce_eager, distributed_init): diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index 4d2d3add27d59..36f9cdb1e59e1 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -1,11 +1,13 @@ import torch +import pytest from vllm.engine.arg_utils import EngineArgs from vllm.sequence import ExecuteModelRequest -from vllm.utils import get_distributed_init_method, get_ip, get_open_port +from vllm.utils import get_distributed_init_method, get_ip, get_open_port, is_hpu from vllm.worker.worker import Worker +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_swap() -> None: # Configure the engine. 
engine_args = EngineArgs(model="facebook/opt-125m", diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index 759fdb65e08ed..4b2e5c9201d30 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -56,8 +56,8 @@ def reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, dtype, i def swap_blocks(src, dst, block_mapping): - index_src = torch.zeros((1,), dtype=torch.int32, device=key_caches[0].device) - index_dst = torch.zeros((1,), dtype=torch.int32, device=key_caches[0].device) + index_src = torch.zeros((1,), dtype=torch.int32, device=src.device) + index_dst = torch.zeros((1,), dtype=torch.int32, device=dst.device) for src_idx, dst_idx in block_mapping.items(): index_src[0] = src_idx index_dst[0] = dst_idx diff --git a/vllm/utils.py b/vllm/utils.py index 6d6d3d4f4590d..19786035cb18e 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -409,14 +409,18 @@ def create_kv_caches_with_random( torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) scale = head_size**-0.5 - x = 16 // torch.tensor([], dtype=torch_dtype).element_size() - key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) + if is_hpu(): + key_cache_shape = (num_blocks, num_heads, head_size, block_size) + else: + x = 16 // torch.tensor([], dtype=torch_dtype).element_size() + key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) key_caches = [] for _ in range(num_layers): key_cache = torch.empty(size=key_cache_shape, dtype=torch_dtype, device=device) - if cache_dtype in ["auto", "half", "bfloat16", "float"]: + cache_dtype = str(cache_dtype) + if cache_dtype in ["auto", "half", "float16", "torch.float16", "torch.bfloat16", "torch.float32"]: key_cache.uniform_(-scale, scale) elif cache_dtype == 'fp8': _generate_random_fp8(key_cache, -scale, scale) @@ -431,7 +435,7 @@ def create_kv_caches_with_random( value_cache = torch.empty(size=value_cache_shape, dtype=torch_dtype, device=device) - if cache_dtype in ["auto", "half", "bfloat16", "float"]: + if cache_dtype in ["auto", "half", "torch.float16", "torch.bfloat16", "torch.float32"]: value_cache.uniform_(-scale, scale) elif cache_dtype == 'fp8': _generate_random_fp8(value_cache, -scale, scale) From eaa6c06c2cd354880d3bc8b3921a21fdbabc489f Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 7 Jun 2024 12:02:29 +0200 Subject: [PATCH 031/341] Update ops.py (#54) --- vllm/hpu/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 10e53312378ad..a93508a50d7a8 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -13,7 +13,7 @@ import vllm.hpu.utils as hpu_utils -PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '0') == '1') +PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '1') == '1') def silu_and_mul(output, input): From 45fb692a259909ab6f0cb6e2e015e3f2a7228f5a Mon Sep 17 00:00:00 2001 From: jkaniecki <153085639+jkaniecki@users.noreply.github.com> Date: Mon, 10 Jun 2024 16:55:00 +0200 Subject: [PATCH 032/341] Add syncs in mixtral weight loader (#55) * Add hpu syncs in model loader to prevent memory peak after loading weights * Remove spaces * Fix typo --- vllm/model_executor/models/mixtral.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 4b602203cee79..e725c4d7cde8a 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -591,7 +591,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): 
weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) - + + if is_hpu(): + torch.hpu.synchronize() def all_close_1d(x: torch.Tensor) -> bool: assert len(x.shape) == 1 From 2825ddec9c69ec72b17cd6fb600afaf33ebe2103 Mon Sep 17 00:00:00 2001 From: madamczykhabana <110973826+madamczykhabana@users.noreply.github.com> Date: Tue, 11 Jun 2024 09:24:25 +0200 Subject: [PATCH 033/341] HPU: Change KV-cache layout (#56) * HPU: Change KV-cache layout to (num_blocks, block_size, num_heads, head_size) * Fix UTs * Fix UTs - part 2 --- tests/kernels/test_attention.py | 18 ++++++--- tests/kernels/test_cache.py | 20 ++++------ vllm/attention/ops/habana_paged_attn.py | 4 +- vllm/hpu/cache_ops.py | 50 ++++--------------------- vllm/hpu/ops.py | 19 +++++----- vllm/utils.py | 7 +++- vllm/worker/habana_model_runner.py | 3 +- 7 files changed, 45 insertions(+), 76 deletions(-) diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index f17a51b0ccf78..b034fd6d8ce32 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -74,9 +74,14 @@ def ref_single_query_cached_kv_attention( alibi_slopes: Optional[torch.Tensor], ) -> None: num_query_heads = query.shape[1] - num_kv_heads = value_cache.shape[1] - head_size = value_cache.shape[2] - block_size = value_cache.shape[3] + if not is_hpu(): + num_kv_heads = value_cache.shape[1] + head_size = value_cache.shape[2] + block_size = value_cache.shape[3] + else: + block_size = value_cache.shape[1] + num_kv_heads = value_cache.shape[2] + head_size = value_cache.shape[3] num_seqs = query.shape[0] block_tables = block_tables.cpu().tolist() @@ -93,13 +98,16 @@ def ref_single_query_cached_kv_attention( block_offset = j % block_size if is_hpu(): - k = key_cache[block_number, :, :, block_offset] + k = key_cache[block_number, block_offset, :, :] else: k = key_cache[block_number, :, :, block_offset, :] k = k.reshape(num_kv_heads, head_size) keys.append(k) - v = value_cache[block_number, :, :, block_offset] + if is_hpu(): + v = value_cache[block_number, block_offset, :, :] + else: + v = value_cache[block_number, :, :, block_offset] values.append(v) keys = torch.stack(keys, dim=0) values = torch.stack(values, dim=0) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index f8163deb05223..db1a8b556f47e 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -102,7 +102,6 @@ def test_copy_blocks( if is_hpu(): tmp_block_mapping_dict = {} for src, dst in block_mapping: - print(src, dst, tmp_block_mapping_dict) if not tmp_block_mapping_dict.get(src): tmp_block_mapping_dict[src] = [dst] continue @@ -191,17 +190,11 @@ def test_reshape_and_cache( kv_scale = 1.0 # Call the reshape_and_cache kernel. - if is_hpu(): - cache_ops.reshape_and_cache(key, value, key_cache, value_cache, - slot_mapping.view((1, -1)), "auto", False) - else: - cache_ops.reshape_and_cache(key, value, key_cache, value_cache, - slot_mapping, "auto") + cache_ops.reshape_and_cache(key, value, key_cache, value_cache, + slot_mapping, "auto") # Run the reference implementation. 
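With the new (num_blocks, block_size, num_heads, head_size) layout introduced in this commit, writing a token's key or value into the cache reduces to an integer division and a modulo on its flat slot index. A small worked example with made-up sizes (block_size=16, 8 blocks, 4 KV heads, head size 64), mirroring the index_put_-based reshape_and_cache rewrite further below:

    import torch

    block_size = 16
    key_cache = torch.zeros(8, block_size, 4, 64)  # (num_blocks, block_size, num_heads, head_size)
    key = torch.randn(3, 4, 64)                    # one (num_heads, head_size) entry per new token
    slot_mapping = torch.tensor([37, 38, 64])      # flat slot assigned to each token

    indices = torch.div(slot_mapping, block_size, rounding_mode="floor")  # -> [2, 2, 4]
    offsets = torch.fmod(slot_mapping, block_size)                        # -> [5, 6, 0]
    key_cache.index_put_((indices, offsets), key)  # e.g. slot 37 lands in key_cache[2, 5]

The value cache uses the same shape and the same indexing, which is also why fetch_from_cache in ops.py now permutes the per-block tensors back into head-major order before the matmuls.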
- if is_hpu(): - reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0].shape) - else: + if not is_hpu(): reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") block_indices = block_indices.cpu().tolist() @@ -211,10 +204,13 @@ def test_reshape_and_cache( block_idx = block_indices[i] block_offset = block_offsets[i] if is_hpu(): - cloned_key_cache[block_idx, :, :, block_offset] = reshaped_key[i] + cloned_key_cache[block_idx, block_offset, :, :] = key[i] else: cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] - cloned_value_cache[block_idx, :, :, block_offset] = value[i] + if is_hpu(): + cloned_value_cache[block_idx, block_offset, :, :] = value[i] + else: + cloned_value_cache[block_idx, :, :, block_offset] = value[i] if kv_cache_dtype == "fp8": assert torch.allclose(result_key_cache, diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py index bd6a58684f567..c8ed500f7af1c 100644 --- a/vllm/attention/ops/habana_paged_attn.py +++ b/vllm/attention/ops/habana_paged_attn.py @@ -41,7 +41,7 @@ def get_kv_cache_shape( num_kv_heads: int, head_size: int, ) -> Tuple[int, ...]: - return (num_blocks, num_kv_heads, head_size, block_size) + return (num_blocks, block_size, num_kv_heads, head_size) @staticmethod def split_kv_cache( @@ -86,7 +86,7 @@ def forward_decode( alibi_slopes: Optional[torch.Tensor], kv_scale: float, ) -> torch.Tensor: - block_size = value_cache.shape[3] + block_size = value_cache.shape[1] return ops.paged_attention_v1( query, key_cache, diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index 4b2e5c9201d30..56aafd2a4d0a9 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -10,49 +10,13 @@ import habana_frameworks.torch as htorch -def pad_to_full_block(data, block_size, pad_value): - seq_dim = 1 - pad_shape = list(data.shape) - remainder = pad_shape[seq_dim] % block_size - if remainder == 0: - return data - pad_shape[seq_dim] = block_size - remainder - pad = torch.full(pad_shape, pad_value, dtype=data.dtype, device=data.device) - return torch.cat([data, pad], dim=seq_dim) - - -def initialize_cache(data, indices, cache): - block_size = cache.size(-1) - data = data.unflatten(0, (-1, block_size)).permute(0, 2, 3, 1) - indices = indices.unflatten(0, (-1, block_size))[:,0] - cache.index_copy_(0, indices, data) - - -def update_cache(data, indices, offsets, cache): - prev = cache.index_select(0, indices) - idx = offsets.view(-1, 1, 1, 1).expand(-1, data.size(1), data.size(2), -1) - prev.scatter_(-1, idx, data.unsqueeze(-1)) - cache.index_copy_(0, indices, prev) - - -def reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, dtype, is_prompt): - block_size = key_cache.size(-1) - assert slot_mapping.dim() == 2, 'This implementation requires unflattened slot_mapping!' 
- - if is_prompt: - block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") - batch_size, seq_length = block_indices.shape - key = pad_to_full_block(key.unflatten(0, (batch_size, seq_length)), block_size, 0).flatten(0, 1) - value = pad_to_full_block(value.unflatten(0, (batch_size, seq_length)), block_size, 0).flatten(0, 1) - block_indices = pad_to_full_block(block_indices, block_size, -1).flatten(0, 1) - initialize_cache(key, block_indices, key_cache) - initialize_cache(value, block_indices, value_cache) - else: - slot_mapping = slot_mapping.flatten() - block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") - block_offsets = torch.fmod(slot_mapping, block_size) - update_cache(key, block_indices, block_offsets, key_cache) - update_cache(value, block_indices, block_offsets, value_cache) +def reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, dtype, is_prompt=False): + block_size = key_cache.size(1) + slot_mapping = slot_mapping.flatten() + indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + offsets = torch.fmod(slot_mapping, block_size) + key_cache.index_put_((indices, offsets), key) + value_cache.index_put_((indices, offsets), value) def swap_blocks(src, dst, block_mapping): diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index a93508a50d7a8..d4b4c488b1bf2 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -31,23 +31,24 @@ def gelu_fast(output, input): raise NotImplementedError -def fetch_from_cache(cache, blocks): - return [cache.index_select(0, blocks[:, i]) for i in range(blocks.size(1))] +def fetch_from_cache(cache, blocks, permutations): + return [cache.index_select(0, blocks[:, i]).permute(permutations) for i in range(blocks.size(1))] @hpu_utils.with_mark_steps def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block_tables, context_lens, block_size, alibi_slopes, kv_cache_dtype=None) -> None: seq_len = block_tables.size(1) batch_size, query_heads, _ = query.shape - _, kv_heads, _, _ = key_cache.shape + _, _, kv_heads, _ = key_cache.shape min_inf = torch.finfo(query.dtype).min mask = (torch.arange(0, seq_len * block_size, dtype=torch.int32, device=key_cache.device) .view(1, -1) .expand(batch_size, -1) .ge(context_lens.view(-1, 1)) .view(batch_size, 1, 1, -1)) + query.mul_(scale) query = query.unsqueeze(-2) - keys = fetch_from_cache(key_cache, block_tables) + keys = fetch_from_cache(key_cache, block_tables, (0, 2, 3, 1)) if query_heads != kv_heads: query = query.unflatten(1, (kv_heads, -1)) keys = [k.unflatten(1, (kv_heads, 1)) for k in keys] @@ -55,24 +56,22 @@ def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block attn_weights = [torch.matmul(query, k) for k in keys] attn_weights = (torch.cat(attn_weights, dim=-1) - .mul_(scale) .masked_fill(mask, min_inf) .softmax(dim=-1)) - values = fetch_from_cache(value_cache, block_tables) + values = fetch_from_cache(value_cache, block_tables, (0, 2, 1, 3)) if PA_SPLIT_VALUE: attn_weights = attn_weights.split(block_size, dim=-1) else: - values = [torch.cat(values, dim=-1)] + values = [torch.cat(values, dim=-2)] attn_weights = [attn_weights] if query_heads != kv_heads: values = [v.unflatten(1, (kv_heads, 1)) for v in values] - attn_weights = [torch.matmul(a, v.transpose(-1, -2)).squeeze(-2) for a, v in zip(attn_weights, values)] + attn_weights = [torch.matmul(a, v) for a, v in zip(attn_weights, values)] if query_heads != kv_heads: attn_weights = [a.flatten(1, 2) for a in attn_weights] attn_weights = sum(attn_weights) - - 
return attn_weights + return attn_weights.squeeze(-2) def rms_norm(out, hidden_states, weight, eps): diff --git a/vllm/utils.py b/vllm/utils.py index 19786035cb18e..e7a2cde3e0f5d 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -410,7 +410,7 @@ def create_kv_caches_with_random( scale = head_size**-0.5 if is_hpu(): - key_cache_shape = (num_blocks, num_heads, head_size, block_size) + key_cache_shape = (num_blocks, block_size, num_heads, head_size) else: x = 16 // torch.tensor([], dtype=torch_dtype).element_size() key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) @@ -429,7 +429,10 @@ def create_kv_caches_with_random( f"Does not support key cache of type {cache_dtype}") key_caches.append(key_cache) - value_cache_shape = (num_blocks, num_heads, head_size, block_size) + if is_hpu(): + value_cache_shape = (num_blocks, block_size, num_heads, head_size) + else: + value_cache_shape = (num_blocks, num_heads, head_size, block_size) value_caches = [] for _ in range(num_layers): value_cache = torch.empty(size=value_cache_shape, diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 78290fd59b10a..0eaf6dd9f0b93 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -976,8 +976,7 @@ def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, available_mem): total_batch_seq += batch_seq graphed = list(c[:2] for c in self.graphed_buckets if c[2] == is_prompt) logger.info(f'{phase} captured:{len(graphed)} ({100 * len(graphed) / num_candidates:.1f}%) used_mem:{format_bytes(total_mem)} buckets:{sorted(list(graphed))}') - - + @torch.inference_mode() def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: if os.environ.get('VLLM_SKIP_WARMUP', 'false').lower() == 'true': From b2d0e1e3327a9951bf9eef5452d0829e7380de23 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 11 Jun 2024 12:18:14 +0200 Subject: [PATCH 034/341] Add more detailed event names to profiler (#57) --- vllm/worker/habana_model_runner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 0eaf6dd9f0b93..fa3c113c45bf1 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -851,14 +851,14 @@ def execute_model( htorch.core.mark_step() if self.is_driver_worker: - model_event_name = f'model_{base_event_name}_eager_bs{real_batch_size}' + model_event_name = f"model_{'prompt' if is_prompt else 'decode'}_bs{batch_size}_seq{seq_len}_graphs{'T' if use_graphs else 'F'}" else: model_event_name = 'model_executable' with self.profiler.record_event('internal', model_event_name): hidden_states = self.model.forward(**execute_model_kwargs, selected_token_indices=sampling_metadata.selected_token_indices, bypass_hpu_graphs=not use_graphs) # Compute the logits. - with self.profiler.record_event('internal', 'compute_logits'): + with self.profiler.record_event('internal', f'compute_logits_{"prompt" if is_prompt else "decode"}_bs{batch_size}_seq{seq_len}'): sampling_metadata.selected_token_indices = None logits = self.model.compute_logits(hidden_states, sampling_metadata) htorch.core.mark_step() @@ -868,7 +868,7 @@ def execute_model( return None # Sample the next token. 
- with self.profiler.record_event('internal', 'sample'): + with self.profiler.record_event('internal', f'sample_{"prompt" if is_prompt else "decode"}_bs{batch_size}_seq{seq_len}'): output = self.model.sample( logits=logits, sampling_metadata=sampling_metadata, From 47c0c5b95c4d8f52f9991a495e3d021e73ea957e Mon Sep 17 00:00:00 2001 From: madamczykhabana <110973826+madamczykhabana@users.noreply.github.com> Date: Tue, 11 Jun 2024 14:05:07 +0200 Subject: [PATCH 035/341] Disable value splitting on G3 (#58) --- vllm/hpu/ops.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index d4b4c488b1bf2..1f2e07bd59ccb 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -9,11 +9,14 @@ import torch.nn as nn import torch.nn.functional as F import habana_frameworks.torch as htorch +import habana_frameworks.torch.utils.experimental as htexp from typing import List, Optional, Tuple import vllm.hpu.utils as hpu_utils -PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '1') == '1') +# FIXME: For some reason splitting value causes DFAs on G3. This needs to be debugged +PA_SPLIT_VALUE_DEFAULT = '0' if (htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi3) else '1' +PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', PA_SPLIT_VALUE_DEFAULT) == '1') def silu_and_mul(output, input): From 628869caae26df9363eaf644d7e8cbfc57a633dd Mon Sep 17 00:00:00 2001 From: Tomasz Zielinski <85164140+tzielinski-habana@users.noreply.github.com> Date: Thu, 13 Jun 2024 16:17:03 +0200 Subject: [PATCH 036/341] Fix for OOM in Llama 70b (#60) --- vllm/worker/habana_model_runner.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index fa3c113c45bf1..6a9cb6f066ea1 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -919,8 +919,10 @@ def create_dummy_seq_group_metadata(self, group_id, seq_len, is_prompt): def profile_run(self) -> None: num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers - seq_len = self.max_model_len // self.max_num_seqs - self.warmup_scenario(self.max_num_seqs, seq_len, True, kv_caches) + max_batch_size = self.prompt_bs_bucket_cfg[-1] + max_seq_len = self.prompt_seq_bucket_cfg[-1] + + self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches) def warmup_scenario(self, batch_size, seq_len, is_prompt, kv_caches) -> None: use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) From ad890f1f51fcc87c0b04ed2ee3a1ab94eeb97baf Mon Sep 17 00:00:00 2001 From: Damian Szwichtenberg Date: Mon, 17 Jun 2024 13:27:18 +0200 Subject: [PATCH 037/341] Enable high-level profiler on multiple instances (#61) --- vllm/worker/profiler.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/worker/profiler.py b/vllm/worker/profiler.py index 2d47f4349d45a..34221d2553909 100644 --- a/vllm/worker/profiler.py +++ b/vllm/worker/profiler.py @@ -10,12 +10,12 @@ from contextlib import contextmanager from vllm.logger import init_logger +from vllm.utils import get_vllm_instance_id logger = init_logger(__name__) class FileWriter(threading.Thread): - def __init__(self, filename, event_queue): super().__init__() self.filename = filename @@ -48,13 +48,15 @@ def run(self): class Profiler: profiling_trace_events = queue.Queue() event_tid = {'counter': 1, 'external': 2, 'internal': 3} - filename = 'server_events.json' + vllm_instance_id = get_vllm_instance_id() + filename = 
f'server_events_{vllm_instance_id}.json' event_cache = [] def __init__(self): self.enabled = os.getenv('VLLM_PROFILER_ENABLED', 'false').lower() == 'true' and int( os.getenv('RANK', '0')) == 0 + logger.info(f'Profiler enabled for: {self.vllm_instance_id}') if self.enabled: # initialize the trace file (JSON Array Format) with open(self.filename, 'w') as outfile: From 11f047c1f122710f58929857981d5f270f4a4121 Mon Sep 17 00:00:00 2001 From: jkaniecki <153085639+jkaniecki@users.noreply.github.com> Date: Mon, 24 Jun 2024 15:35:14 +0200 Subject: [PATCH 038/341] Add mark steps to prevent oom in static moe op (#65) --- vllm/hpu/ops.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 1f2e07bd59ccb..fa9d5ff521a6a 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -125,7 +125,6 @@ def silu_and_mul_wrapper(x: torch.Tensor) -> torch.Tensor: return out -@hpu_utils.with_mark_steps def static_fused_moe(hidden_states, w1, w2, score, topk): B, D = hidden_states.shape num_experts = w1.shape[0] @@ -142,6 +141,8 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): padded_weights = padded_weights.reshape(-1, B, w1.shape[0]) padded_weights = padded_weights.permute(2, 0, 1).unsqueeze(-1) + htorch.core.mark_step() + for expert_idx in range(num_experts): padded_weight = padded_weights[expert_idx] current_state_static = hidden_states.reshape(-1, D) @@ -149,5 +150,6 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): w_output = torch.matmul(w_output, w2[expert_idx].transpose(0, 1)) current_hidden_states_static = w_output * padded_weight final_hidden_states += current_hidden_states_static + htorch.core.mark_step() return final_hidden_states.view(-1, D) From fc6d4b4198ad20e9072780b7c87d8b862f80c180 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 24 Jun 2024 19:08:29 +0300 Subject: [PATCH 039/341] post-rebase api adjustments --- vllm/attention/backends/habana_attn.py | 28 +++++---- vllm/engine/llm_engine.py | 11 +++- vllm/model_executor/custom_op.py | 7 ++- vllm/model_executor/layers/layernorm.py | 36 ++++++++++++ vllm/worker/habana_model_runner.py | 78 ++++++++++++++++--------- vllm/worker/habana_worker.py | 3 +- 6 files changed, 118 insertions(+), 45 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 017cf9c8933e5..518cbae81f465 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -3,7 +3,7 @@ ############################################################################### from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple, Type +from typing import Any, Dict, List, Optional, Tuple, Type import torch import math @@ -12,8 +12,7 @@ LowerTriangularMaskWithTensorBias) from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionMetadata, - AttentionMetadataPerStage) + AttentionMetadata) from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention, HabanaPagedAttentionMetadata) from vllm.logger import init_logger @@ -58,7 +57,7 @@ def copy_blocks( @dataclass -class HabanaAttentionMetadata(AttentionMetadataPerStage, HabanaPagedAttentionMetadata): +class HabanaAttentionMetadata(AttentionMetadata, HabanaPagedAttentionMetadata): """Metadata for HabanaAttentionbackend. 
NOTE: Any python object stored here is not updated when it is @@ -133,10 +132,13 @@ def __init__( num_heads: int, head_size: int, scale: float, - num_kv_heads: Optional[int] = None, - alibi_slopes: Optional[List[float]] = None, - sliding_window: Optional[int] = None, + num_kv_heads: int, + alibi_slopes: Optional[List[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + blocksparse_params: Optional[Dict[str, Any]] = None, ) -> None: + self.kv_cache_dtype = kv_cache_dtype self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) @@ -161,7 +163,7 @@ def forward( key: torch.Tensor, value: torch.Tensor, kv_cache: Optional[torch.Tensor], - attn_metadata: AttentionMetadata[HabanaAttentionMetadata], + attn_metadata: HabanaAttentionMetadata, kv_scale: float, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. @@ -191,10 +193,11 @@ def forward( HabanaPagedAttention.write_to_paged_cache(key, value, key_cache, value_cache, attn_metadata.slot_mapping, - attn_metadata.kv_cache_dtype, + self.kv_cache_dtype, attn_metadata.prefill_metadata is not None) - if prefill_meta := attn_metadata.prefill_metadata: + if attn_metadata.num_prefills > 0: + prefill_meta = attn_metadata # Prompt run. if kv_cache is None or prefill_meta.block_tables.numel() == 0: # TODO: move this outside of model @@ -225,7 +228,8 @@ def forward( prefill_meta.max_query_len, self.alibi_slopes, ) - if decode_meta := attn_metadata.decode_metadata: + if attn_metadata.num_decode_tokens > 0: + decode_meta = attn_metadata # Decoding run. output = HabanaPagedAttention.forward_decode( query, @@ -233,7 +237,7 @@ def forward( value_cache, decode_meta.block_tables, decode_meta.seq_lens_tensor, - attn_metadata.kv_cache_dtype, + self.kv_cache_dtype, self.num_kv_heads, self.scale, self.alibi_slopes, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 3dfd38ce60b91..810e64a873647 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -348,7 +348,7 @@ def from_engine_args( from vllm.executor.cpu_executor import CPUExecutor executor_class = CPUExecutor elif engine_config.device_config.device_type == "hpu": - if engine_config.parallel_config.worker_use_ray: + if distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) from vllm.executor.ray_habana_executor import RayHabanaExecutor executor_class = RayHabanaExecutor @@ -796,7 +796,6 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: request_outputs = self._process_model_outputs( output, scheduler_outputs.scheduled_seq_groups, scheduler_outputs.ignored_seq_groups, seq_group_metadata_list) - # Log stats. self.do_log_stats(scheduler_outputs, output) @@ -808,6 +807,14 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: # queued control plane messages, such as add/remove lora adapters. 
self.model_executor.stop_remote_worker_execution_loop() + out_prompt = [ro.prompt for ro in request_outputs] + out_indices = [ro.outputs[-1].index for ro in request_outputs] + out_text = [f'{ro.outputs[-1].text!r}' for ro in request_outputs] + for idx, (p, i, t) in enumerate(zip(out_prompt, out_indices, out_text)): + logger.info(f'\tPROMPT ({idx}): {p}') + logger.info(f'\tGEN IDX ({idx}): {i}') + logger.info(f'\tGEN TXT ({idx}): {t}') + logger.info('') return request_outputs def do_log_stats( diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 0db72d8d95f24..5276ada2a3086 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -1,6 +1,6 @@ import torch.nn as nn -from vllm.utils import is_cpu, is_hip, is_tpu, is_xpu +from vllm.utils import is_cpu, is_hip, is_hpu, is_tpu, is_xpu class CustomOp(nn.Module): @@ -31,6 +31,9 @@ def forward_hip(self, *args, **kwargs): def forward_xpu(self, *args, **kwargs): raise NotImplementedError + def forward_hpu(self, *args, **kwargs): + return self.forward_cuda(*args, **kwargs) + def forward_cpu(self, *args, **kwargs): # By default, we assume that CPU ops are compatible with CUDA ops. return self.forward_cuda(*args, **kwargs) @@ -54,6 +57,8 @@ def dispatch_forward(self): return self.forward_hip elif is_cpu(): return self.forward_cpu + elif is_hpu(): + return self.forward_hpu elif is_tpu(): return self.forward_tpu elif is_xpu(): diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index d8b25ea9566e3..43015068b6685 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -73,6 +73,41 @@ def forward_cuda( ) return out + def forward_hpu( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + from vllm._ipex_ops import ipex_ops as ops + + if residual is not None: + if x.device.type == "hpu" and FusedRMSNorm: + orig_dtype = x.dtype + orig_shape = x.shape + residual += x.view(residual.shape) + # Note: FusedRMSNorm requires 3D tensors as inputs + x = FusedRMSNorm.apply(residual.float(), self.weight.float(), self.variance_epsilon) + return x.to(orig_dtype).view(orig_shape), residual + ops.fused_add_rms_norm( + x, + residual, + self.weight.data, + self.variance_epsilon, + ) + return x, residual + if x.device.type == "hpu" and FusedRMSNorm: + orig_dtype = x.dtype + x = FusedRMSNorm.apply(x.float(), self.weight.float(), self.variance_epsilon) + return x.to(orig_dtype) + out = torch.empty_like(x) + ops.rms_norm( + out, + x, + self.weight.data, + self.variance_epsilon, + ) + return out + def forward_xpu( self, x: torch.Tensor, @@ -108,6 +143,7 @@ def forward_xpu( ) return out + def extra_repr(self) -> str: s = f"hidden_size={self.weight.data.size(0)}" s += f", eps={self.variance_epsilon}" diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 6a9cb6f066ea1..56eaaa490b025 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -15,12 +15,11 @@ import torch import habana_frameworks.torch as htorch -from vllm.attention import (AttentionMetadata, AttentionMetadataPerStage, - get_attn_backend) +from vllm.attention import (AttentionMetadata, get_attn_backend) from vllm.config import (DeviceConfig, LoadConfig, CacheConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) from vllm.distributed import broadcast_tensor_dict -from 
vllm.distributed.parallel_state import get_cpu_world_group +#from vllm.distributed.parallel_state import get_cpu_world_group from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest @@ -98,13 +97,14 @@ def subtuple(obj: object, typename: str, to_copy: List[str], to_override: Dict[s def align_workers(value, op): - group = get_cpu_world_group() - world_size = torch.distributed.get_world_size() - if world_size <= 1: - return value - value_t = torch.tensor(value, device='cpu') - torch.distributed.all_reduce(value_t, op=op, group=group) - return value_t.item() + #group = get_cpu_world_group() + #world_size = torch.distributed.get_world_size() + #if world_size <= 1: + # return value + #value_t = torch.tensor(value, device='cpu') + #torch.distributed.all_reduce(value_t, op=op, group=group) + #return value_t.item() + return 0 class HpuModelAdapter(): @@ -112,7 +112,7 @@ def __init__(self, model): self.model = model def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): - prefill_metadata = attn_metadata.prefill_metadata + prefill_metadata = attn_metadata if prefill_metadata is None: return attn_metadata #FIXME: Restore alibi support @@ -132,8 +132,9 @@ def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): .masked_fill_(mask, -math.inf)) #FIXME: Restore sliding window support #if self.sliding_window is not None: - prefill_metadata = prefill_metadata._replace(attn_bias=attn_bias) - attn_metadata = attn_metadata._replace(prefill_metadata=prefill_metadata) + #prefill_metadata = prefill_metadata._replace(attn_bias=attn_bias) +# attn_metadata = attn_metadata._replace(prefill_metadata=prefill_metadata) + attn_metadata.attn_bias = attn_bias return attn_metadata else: # FIXME: This needs updating... 
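For context, a minimal standalone sketch of how a causal attention bias for a padded prefill batch can be derived from per-sequence lengths, in the spirit of the masked_fill_(mask, -math.inf) pattern used in _set_attn_bias above. This is illustrative only and not part of the patch; the helper name make_prefill_attn_bias and the (batch, 1, seq_len, seq_len) bias shape are assumptions.

import math
import torch

def make_prefill_attn_bias(seq_lens: torch.Tensor, seq_len: int,
                           dtype: torch.dtype) -> torch.Tensor:
    # seq_lens holds the real (unpadded) length of each prompt in the batch.
    batch_size = seq_lens.size(0)
    # True wherever the key position falls inside the padding region.
    len_mask = (torch.arange(seq_len, dtype=torch.int32)
                .view(1, seq_len)
                .ge(seq_lens.unsqueeze(-1))
                .view(batch_size, 1, 1, seq_len))
    # True above the diagonal: a query token may not attend to future keys.
    causal_mask = torch.triu(
        torch.ones(batch_size, 1, seq_len, seq_len, dtype=torch.bool),
        diagonal=1)
    mask = causal_mask.logical_or(len_mask)
    return torch.zeros(mask.shape, dtype=dtype).masked_fill_(mask, -math.inf)

bias = make_prefill_attn_bias(torch.tensor([3, 5]), seq_len=8,
                              dtype=torch.bfloat16)
print(bias.shape)  # torch.Size([2, 1, 8, 8])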
@@ -149,6 +150,7 @@ def forward(self, *args, **kwargs): kwargs.pop('bypass_hpu_graphs') # required for PT eager input_ids = kwargs['input_ids'] kwargs['attn_metadata'] = self._set_attn_bias(kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), input_ids.device, torch.bfloat16) + import pdb; pdb.set_trace() hidden_states = self.model(*args, **kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) hidden_states = hidden_states.index_select(0, selected_token_indices) @@ -164,7 +166,7 @@ def sample(self, *args, **kwargs): class PreparePromptMetadata(NamedTuple): input_tokens: List[int] input_positions: List[int] - attn_metadata: Optional[AttentionMetadataPerStage] + attn_metadata: Optional[AttentionMetadata] seq_lens: List[int] query_lens: List[int] lora_index_mapping: List[int] @@ -241,6 +243,7 @@ def __init__( self.scheduler_config = scheduler_config self.lora_config = lora_config self.load_config = load_config + self.cache_config = cache_config self.is_driver_worker = is_driver_worker self.profiler = Profiler() @@ -261,7 +264,14 @@ def __init__( self.vision_language_config = vision_language_config self.attn_backend = get_attn_backend( - self.model_config.dtype if model_config is not None else None) + self.model_config.get_num_attention_heads(self.parallel_config), + self.model_config.get_head_size(), + self.model_config.get_num_kv_heads(self.parallel_config), + self.model_config.get_sliding_window(), + self.model_config.dtype, + self.kv_cache_dtype, + self.block_size, + ) # Lazy initialization self.lora_manager: LRUCacheWorkerLoRAManager = None @@ -280,6 +290,7 @@ def load_model(self) -> None: vision_language_config=self.vision_language_config, parallel_config=self.parallel_config, scheduler_config=self.scheduler_config, + cache_config=self.cache_config ) logger.info(f"Pre-loading model weights on {next(self.model.parameters()).device} took {m_getmodel.get_summary_string()}") @@ -447,6 +458,8 @@ def _prepare_prompt( slot_mapping[-1].append(slot) max_query_len = max(query_lens) + sum_query_len = sum(query_lens) + real_num_seqs = len(query_lens) assert max_query_len > 0 context_lens_tensor = torch.tensor(context_lens, @@ -514,6 +527,10 @@ def _prepare_prompt( context_lens_tensor=context_lens_tensor, block_tables=block_tables, use_cuda_graph=False, + num_prefills=real_num_seqs, + num_prefill_tokens=sum_query_len, + num_decode_tokens=0, + slot_mapping=slot_mapping ) return PreparePromptMetadata( input_tokens=input_tokens, @@ -593,7 +610,7 @@ def _prepare_decode( seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.int, device=self.device) - + num_decode_tokens = sum(seq_lens) max_block_table_len = max( len(block_table) for block_table in block_tables) block_tables = make_tensor_with_pad( @@ -613,6 +630,10 @@ def _prepare_decode( context_lens_tensor=None, block_tables=block_tables, use_cuda_graph=False, + num_prefills=0, + num_prefill_tokens=0, + num_decode_tokens=num_decode_tokens, + slot_mapping=slot_mapping ) return PrepareDecodeMetadata( input_tokens=input_tokens, @@ -772,25 +793,26 @@ def prepare_input_tensors( decode_attn_metadata = self.attn_backend.make_metadata( **metadata_dict) - attn_metadata = AttentionMetadata( - num_prefills=num_prefills, - slot_mapping=slot_mapping, - num_prefill_tokens=num_prefill_tokens, - num_decode_tokens=num_decode_tokens, - prefill_metadata=prefill_attn_metadata, - decode_metadata=decode_attn_metadata, - kv_cache_dtype=self.kv_cache_dtype, - ) + attn_metadata = prefill_attn_metadata if prefill_attn_metadata is not None else 
decode_attn_metadata +# attn_metadata = AttentionMetadata( +# num_prefills=num_prefills, +# slot_mapping=slot_mapping, +# num_prefill_tokens=num_prefill_tokens, +# num_decode_tokens=num_decode_tokens, +# prefill_metadata=prefill_attn_metadata, +# decode_metadata=decode_attn_metadata, +# kv_cache_dtype=self.kv_cache_dtype, +# ) return (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping, multi_modal_input) def _seq_len(self, attn_metadata): - if attn_metadata.prefill_metadata: + if attn_metadata.num_prefills != 0: return attn_metadata.slot_mapping.size(1) else: - return attn_metadata.decode_metadata.block_tables.size(1) * self.block_size + return attn_metadata.block_tables.size(1) * self.block_size def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: prefill_metadata = subtuple(metadata.prefill_metadata, @@ -844,7 +866,7 @@ def execute_model( "input_ids": input_tokens, "positions": input_positions, "kv_caches": kv_caches, - "attn_metadata": self.trim_attn_metadata(attn_metadata), + "attn_metadata": attn_metadata, } if self.vision_language_config: execute_model_kwargs.update({"image_input": multi_modal_input}) diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index e253e4479a855..7abaa155708c6 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -15,7 +15,6 @@ VisionLanguageConfig) from vllm.distributed import (broadcast_tensor_dict, ensure_model_parallel_initialized, - get_tensor_model_parallel_cpu_group, init_distributed_environment) from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed @@ -159,7 +158,7 @@ def initialize_cache(self, num_gpu_blocks: int, def _init_cache_engine(self) -> None: assert self.cache_config.num_gpu_blocks is not None self.cache_engine = CacheEngine(self.cache_config, self.model_config, - self.parallel_config) + self.parallel_config, self.device_config) self.hpu_cache = self.cache_engine.gpu_cache htorch.hpu.synchronize() # we want to materialize cache tensors before we proceed with graph capture/execution From 07a200e60e8dc5bce66e240948cb13a6ff121ede Mon Sep 17 00:00:00 2001 From: Michal Szutenberg <37601244+szutenberg@users.noreply.github.com> Date: Mon, 24 Jun 2024 18:13:15 +0200 Subject: [PATCH 040/341] Add Mistal&Mixtral supported configurations (#64) --- README_GAUDI.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README_GAUDI.md b/README_GAUDI.md index 24d3fe0761f54..3b72ad71069c4 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -100,6 +100,10 @@ The following configurations have been validated to be function with Gaudi devic - [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) with tensor parallelism 8x HPU, BF16 datatype with random or greedy sampling +- [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) + on single HPU or with tensor parallelism 2x HPU, BF16 datatype with random or greedy sampling +- [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) + with tensor parallelism 2x HPU, BF16 datatype with random or greedy sampling From d12bff7f0d68084b9c851616933254db1ecab901 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 25 Jun 2024 15:40:06 +0300 Subject: [PATCH 041/341] add pin_lora to habana components --- vllm/executor/habana_executor.py | 5 ++++- vllm/worker/habana_worker.py | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git 
a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index cfad194bf9cca..cbb30e39e11a4 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -113,7 +113,7 @@ def execute_model( output = self.driver_worker.execute_model(execute_model_req) return output - + def add_lora(self, lora_request: LoRARequest) -> bool: raise NotImplementedError("LoRA is not implemented for HPU backend.") @@ -123,6 +123,9 @@ def remove_lora(self, lora_id: int) -> bool: def list_loras(self) -> List[int]: raise NotImplementedError("LoRA is not implemented for HPU backend.") + def pin_lora(self) -> List[int]: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + def check_health(self) -> None: # GPUExecutor will always be healthy as long as # it's running. diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 7abaa155708c6..1a82aa9ef7738 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -235,6 +235,9 @@ def remove_lora(self, lora_id: int) -> bool: def list_loras(self) -> Set[int]: raise NotImplementedError("LoRA is not implemented for HPU backend.") + def pin_lora(self, lora_id: int) -> bool: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + @property def max_model_len(self) -> int: return self.model_config.max_model_len From efce3c48f3752c652d3a17504a56c7df4ed34f6a Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 25 Jun 2024 15:57:10 +0300 Subject: [PATCH 042/341] add WA for model loader --- vllm/engine/llm_engine.py | 16 ++++++++-------- vllm/model_executor/model_loader/loader.py | 3 ++- vllm/worker/habana_model_runner.py | 1 - 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 4c76649a2d862..aadb8e08de5d4 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -832,14 +832,14 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: # queued control plane messages, such as add/remove lora adapters. self.model_executor.stop_remote_worker_execution_loop() - out_prompt = [ro.prompt for ro in request_outputs] - out_indices = [ro.outputs[-1].index for ro in request_outputs] - out_text = [f'{ro.outputs[-1].text!r}' for ro in request_outputs] - for idx, (p, i, t) in enumerate(zip(out_prompt, out_indices, out_text)): - logger.info(f'\tPROMPT ({idx}): {p}') - logger.info(f'\tGEN IDX ({idx}): {i}') - logger.info(f'\tGEN TXT ({idx}): {t}') - logger.info('') +# out_prompt = [ro.prompt for ro in request_outputs] +# out_indices = [ro.outputs[-1].index for ro in request_outputs] +# out_text = [f'{ro.outputs[-1].text!r}' for ro in request_outputs] +# for idx, (p, i, t) in enumerate(zip(out_prompt, out_indices, out_text)): +# logger.info(f'\tPROMPT ({idx}): {p}') +# logger.info(f'\tGEN IDX ({idx}): {i}') +# logger.info(f'\tGEN TXT ({idx}): {t}') +# logger.info('') return request_outputs def do_log_stats( diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index d3babcf9c3451..9f41b3e28e6ed 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -257,7 +257,7 @@ def load_model(self, *, model_config: ModelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module: with set_default_torch_dtype(model_config.dtype): - with torch.device(device_config.device): + with torch.device('cpu'): # FIXME(kzawora): this is a nasty workaround!!! 
model = _initialize_model(model_config, self.load_config, lora_config, vision_language_config, cache_config) @@ -277,6 +277,7 @@ def load_model(self, *, model_config: ModelConfig, # to use quant_method. if hasattr(module, "process_weights_after_loading"): module.process_weights_after_loading() + model = model.to('hpu') # FIXME(kzawora): this is a nasty workaround!!! return model.eval() diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 56eaaa490b025..4c6b6600397b4 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -150,7 +150,6 @@ def forward(self, *args, **kwargs): kwargs.pop('bypass_hpu_graphs') # required for PT eager input_ids = kwargs['input_ids'] kwargs['attn_metadata'] = self._set_attn_bias(kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), input_ids.device, torch.bfloat16) - import pdb; pdb.set_trace() hidden_states = self.model(*args, **kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) hidden_states = hidden_states.index_select(0, selected_token_indices) From c1e758927a4735f3c12bbe65e40a29317563305a Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 25 Jun 2024 17:24:43 +0300 Subject: [PATCH 043/341] fix api mismatches with ray --- vllm/executor/ray_habana_executor.py | 8 ++++++-- vllm/worker/habana_model_runner.py | 4 ---- vllm/worker/habana_worker.py | 3 ++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index a17f509f11658..21d7c5ffceff2 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -31,7 +31,7 @@ def _init_executor(self) -> None: assert (not self.speculative_config ), "Speculative decoding not yet supported for RayGPU backend." - assert self.parallel_config.worker_use_ray + assert self.parallel_config.distributed_executor_backend == "ray" placement_group = self.parallel_config.placement_group # Disable Ray usage stats collection. @@ -146,7 +146,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", max_concurrent_workers=self.parallel_config. max_parallel_loading_workers) - def execute_model( + def _driver_execute_model( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: all_outputs = self._run_workers( @@ -273,6 +273,10 @@ def _check_if_any_actor_is_dead(self): raise RuntimeError("At least one Worker is dead. " f"Dead Workers: {dead_actors}. 
") + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + ray.get(parallel_worker_tasks) class RayHabanaExecutorAsync(RayHabanaExecutor, DistributedGPUExecutorAsync): diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 4c6b6600397b4..93a44654f5375 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -758,15 +758,11 @@ def prepare_input_tensors( metadata_dict = broadcast_tensor_dict(src=0) input_tokens = metadata_dict.pop("input_tokens") input_positions = metadata_dict.pop("input_positions") - slot_mapping = metadata_dict.pop("slot_mapping") - num_prefills = metadata_dict.pop("num_prefills") selected_token_indices = metadata_dict.pop( "selected_token_indices") lora_mapping = metadata_dict.pop("lora_mapping") lora_requests = metadata_dict.pop("lora_requests") multi_modal_input = metadata_dict.pop("multi_modal_input") - num_prefill_tokens = metadata_dict.pop("num_prefill_tokens") - num_decode_tokens = metadata_dict.pop("num_decode_tokens") batch_type = metadata_dict.pop("batch_type") # Create an attention metadata. diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 1a82aa9ef7738..0d42304d3b47a 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -12,7 +12,7 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, - VisionLanguageConfig) + VisionLanguageConfig, SpeculativeConfig) from vllm.distributed import (broadcast_tensor_dict, ensure_model_parallel_initialized, init_distributed_environment) @@ -45,6 +45,7 @@ def __init__( distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, vision_language_config: Optional[VisionLanguageConfig] = None, + speculative_config: Optional[SpeculativeConfig] = None, is_driver_worker: bool = False, ) -> None: self.model_config = model_config From 58bd037c7ac570c5139aa4f9e36578955f92ac8b Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 25 Jun 2024 18:21:13 +0300 Subject: [PATCH 044/341] tensor parallel fixes --- vllm/_custom_ops.py | 12 +-- vllm/executor/ray_habana_executor.py | 82 +++++++++---------- .../model_executor/layers/logits_processor.py | 2 +- vllm/worker/habana_worker.py | 34 ++++++++ 4 files changed, 82 insertions(+), 48 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index e899c57ce0e86..57a96c9f988b5 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -46,10 +46,12 @@ def wrapper(*args, **kwargs): return wrapper _ops = torch.ops._C +_cache_ops = torch.ops._C_cache_ops if importlib.util.find_spec('habana_frameworks') is not None: from vllm.hpu import ops as vllm_ops from vllm.hpu import cache_ops as vllm_cache_ops _ops = vllm_ops + _cache_ops = vllm_cache_ops # activation ops def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: @@ -369,7 +371,7 @@ def reshape_and_cache( kv_cache_dtype: str, kv_scale: float, ) -> None: - torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache, + _cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype, kv_scale) @@ -382,7 +384,7 @@ def reshape_and_cache_flash( slot_mapping: torch.Tensor, kv_cache_dtype: str, ) -> None: - torch.ops._C_cache_ops.reshape_and_cache_flash(key, value, key_cache, + _cache_ops.reshape_and_cache_flash(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype) 
@@ -390,19 +392,19 @@ def reshape_and_cache_flash( def copy_blocks(key_caches: List[torch.Tensor], value_caches: List[torch.Tensor], block_mapping: torch.Tensor) -> None: - torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping) + _cache_ops.copy_blocks(key_caches, value_caches, block_mapping) def swap_blocks(src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor) -> None: - torch.ops._C_cache_ops.swap_blocks(src, dst, block_mapping) + _cache_ops.swap_blocks(src, dst, block_mapping) def convert_fp8(output: torch.Tensor, input: torch.Tensor, scale: float = 1.0, kv_dtype: str = "fp8") -> None: - torch.ops._C_cache_ops.convert_fp8(output, input, scale, kv_dtype) + _cache_ops.convert_fp8(output, input, scale, kv_dtype) def get_device_attribute(attribute: int, device: int) -> int: diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index 21d7c5ffceff2..b9c800e85728b 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -147,22 +147,22 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", max_parallel_loading_workers) def _driver_execute_model( - self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - all_outputs = self._run_workers( - "execute_model", - driver_kwargs={"execute_model_req": execute_model_req}, - use_ray_compiled_dag=USE_RAY_COMPILED_DAG) + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + """Run execute_model in the driver worker. - # Only the driver worker returns the sampling results. - return all_outputs[0] + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. + """ + return self.driver_worker.execute_method("execute_model", + execute_model_req) def _run_workers( self, method: str, *args, - driver_args: Optional[Tuple[Any, ...]] = None, - driver_kwargs: Optional[Dict[str, Any]] = None, + async_run_remote_workers_only: bool = False, all_args: Optional[List[Tuple[Any, ...]]] = None, all_kwargs: Optional[List[Dict[str, Any]]] = None, use_dummy_driver: bool = False, @@ -173,9 +173,11 @@ def _run_workers( """Runs the given method on all workers. Can be used in the following ways: + - async_run_remote_workers_only: If True the method will be run only + in the remote workers, not the driver worker. It will also be + run asynchronously and return a list of futures rather than blocking + on the results. - args/kwargs: All workers share the same args/kwargs - - args/kwargs and driver_args/driver_kwargs: Driver worker has - different args - all_args/all_kwargs: args/kwargs for each worker are specified individually """ @@ -184,11 +186,6 @@ def _run_workers( raise NotImplementedError( "max_concurrent_workers is not supported yet.") - if driver_args is None: - driver_args = args if all_args is None else all_args[0] - if driver_kwargs is None: - driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] - count = len(self.workers) all_worker_args = repeat(args, count) if all_args is None \ else islice(all_args, 1, None) @@ -200,6 +197,7 @@ def _run_workers( # input. TODO(sang): Fix it. assert self.forward_dag is not None output_channels = self.forward_dag.execute(1) + ray_worker_outputs = [] else: # Start the ray workers first. 
ray_worker_outputs = [ @@ -209,6 +207,13 @@ def _run_workers( ) in zip(self.workers, all_worker_args, all_worker_kwargs) ] + if async_run_remote_workers_only: + # Just return futures + return ray_worker_outputs + + driver_args = args if all_args is None else all_args[0] + driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] + # Start the driver worker after all the ray workers. if not use_dummy_driver: driver_worker_output = self.driver_worker.execute_method( @@ -235,6 +240,11 @@ def _run_workers( return [driver_worker_output] + ray_worker_outputs + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + ray.get(parallel_worker_tasks) + def _compiled_ray_dag(self): import pkg_resources required_version = "2.9" @@ -282,30 +292,18 @@ class RayHabanaExecutorAsync(RayHabanaExecutor, DistributedGPUExecutorAsync): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.driver_executor = make_async(self.driver_worker.execute_method) + self.driver_exec_method = make_async(self.driver_worker.execute_method) - async def _run_workers_async( + async def _driver_execute_model_async( self, - method: str, - *args, - driver_args: Optional[Tuple[Any, ...]] = None, - driver_kwargs: Optional[Dict[str, Any]] = None, - **kwargs, - ) -> Any: - """Runs the given method on all workers.""" - coros = [] - - if driver_args is None: - driver_args = args - if driver_kwargs is None: - driver_kwargs = kwargs - - coros.append( - self.driver_executor(method, *driver_args, **driver_kwargs)) - - # Run the ray workers asynchronously. - for worker in self.workers: - coros.append(worker.execute_method.remote(method, *args, **kwargs)) - - all_outputs = await asyncio.gather(*coros) - return all_outputs + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + return await self.driver_exec_method("execute_model", + execute_model_req) + + async def _start_worker_execution_loop(self): + coros = [ + worker.execute_method.remote("start_worker_execution_loop") + for worker in self.workers + ] + return await asyncio.gather(*coros) diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 607f93d8b335d..de0f9d36dce87 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -54,7 +54,7 @@ def forward( # NOTE(kzawora): allgather on HPU will cause logits to be not None, # and we need to guard against applying logits processors on non-driver worker #if logits is not None and sampling_metadata.seq_groups is not None: - if logits is not None: + if logits is not None and sampling_metadata.seq_groups is not None: if self.scale != 1.0: logits *= self.scale diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 0d42304d3b47a..e1c374124633f 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -227,6 +227,40 @@ def execute_model( self.hpu_cache) return [output] + @torch.inference_mode() + def start_worker_execution_loop(self) -> None: + """Execute model loop in parallel worker. + + You can stop the loop by executing a driver worker with an empty output. + See `stop_remote_worker_execution_loop` for more details. + """ + while self._execute_model_non_driver(): + pass + + def _execute_model_non_driver(self) -> bool: + """Execute model in parallel worker. 
+ + Returns True iff there are remaining sequences to process. + """ + assert not self.is_driver_worker + data = broadcast_tensor_dict(src=0) + if not data: + return False + + num_seq_groups = data.get("num_seq_groups", 0) + blocks_to_swap_in = data.get("blocks_to_swap_in") + blocks_to_swap_out = data.get("blocks_to_swap_out") + blocks_to_copy = data.get("blocks_to_copy") + self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) + + # If there is no input, we don't need to execute the model. + if num_seq_groups == 0: + return False + + self.model_runner.execute_model(None, self.hpu_cache) + return True + + def add_lora(self, lora_request: LoRARequest) -> bool: raise NotImplementedError("LoRA is not implemented for HPU backend.") From 1d6409b0e327cb24abaf044b7367323e8d7b3309 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 25 Jun 2024 18:41:33 +0300 Subject: [PATCH 045/341] workers cpu alignment fix --- vllm/worker/habana_model_runner.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 93a44654f5375..2c243ade40a23 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -19,7 +19,7 @@ from vllm.config import (DeviceConfig, LoadConfig, CacheConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) from vllm.distributed import broadcast_tensor_dict -#from vllm.distributed.parallel_state import get_cpu_world_group +from vllm.distributed.parallel_state import get_world_group from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest @@ -97,14 +97,13 @@ def subtuple(obj: object, typename: str, to_copy: List[str], to_override: Dict[s def align_workers(value, op): - #group = get_cpu_world_group() - #world_size = torch.distributed.get_world_size() - #if world_size <= 1: - # return value - #value_t = torch.tensor(value, device='cpu') - #torch.distributed.all_reduce(value_t, op=op, group=group) - #return value_t.item() - return 0 + group = get_world_group().cpu_group + world_size = torch.distributed.get_world_size() + if world_size <= 1: + return value + value_t = torch.tensor(value, device='cpu') + torch.distributed.all_reduce(value_t, op=op, group=group) + return value_t.item() class HpuModelAdapter(): From 952b7c4ffa5a208b8f82de701f8d4a4c12913ec5 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 25 Jun 2024 20:11:05 +0300 Subject: [PATCH 046/341] prefill/decode metadata fixes --- vllm/attention/backends/habana_attn.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 518cbae81f465..7d5fb5146cc2f 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -194,21 +194,20 @@ def forward( value_cache, attn_metadata.slot_mapping, self.kv_cache_dtype, - attn_metadata.prefill_metadata is not None) + attn_metadata.num_prefills > 0) if attn_metadata.num_prefills > 0: - prefill_meta = attn_metadata # Prompt run. - if kv_cache is None or prefill_meta.block_tables.numel() == 0: + if kv_cache is None or attn_metadata.block_tables.numel() == 0: # TODO: move this outside of model - assert prefill_meta.attn_bias is not None, 'attn_bias must be set before calling model.forward!' + assert attn_metadata.attn_bias is not None, 'attn_bias must be set before calling model.forward!' 
query_shape = (batch_size, seq_len, self.num_heads, self.head_size) kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, self.head_size) out = xops.prompt_attention( query.view(query_shape), key.view(kv_shape), value.view(kv_shape), - attn_bias=prefill_meta.attn_bias, + attn_bias=attn_metadata.attn_bias, p=0.0, scale=self.scale, ) @@ -221,22 +220,21 @@ def forward( value, key_cache, value_cache, - prefill_meta.block_tables, - prefill_meta.subquery_start_loc, - prefill_meta.seq_lens_tensor, - prefill_meta.context_lens_tensor, - prefill_meta.max_query_len, + attn_metadata.block_tables, + attn_metadata.subquery_start_loc, + attn_metadata.seq_lens_tensor, + attn_metadata.context_lens_tensor, + attn_metadata.max_query_len, self.alibi_slopes, ) if attn_metadata.num_decode_tokens > 0: - decode_meta = attn_metadata # Decoding run. output = HabanaPagedAttention.forward_decode( query, key_cache, value_cache, - decode_meta.block_tables, - decode_meta.seq_lens_tensor, + attn_metadata.block_tables, + attn_metadata.seq_lens_tensor, self.kv_cache_dtype, self.num_kv_heads, self.scale, From cf04c81db857d73028d4959d3053229d7015467d Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 25 Jun 2024 20:25:00 +0300 Subject: [PATCH 047/341] re-enable attn metadata trimming --- vllm/worker/habana_model_runner.py | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 2c243ade40a23..da6dfb847a6ed 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -131,9 +131,7 @@ def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): .masked_fill_(mask, -math.inf)) #FIXME: Restore sliding window support #if self.sliding_window is not None: - #prefill_metadata = prefill_metadata._replace(attn_bias=attn_bias) -# attn_metadata = attn_metadata._replace(prefill_metadata=prefill_metadata) - attn_metadata.attn_bias = attn_bias + attn_metadata = attn_metadata._replace(attn_bias=attn_bias) return attn_metadata else: # FIXME: This needs updating... 
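As background for the _replace call restored above, a small generic-Python sketch (not vLLM code) of why the result has to be reassigned: _replace on a namedtuple-style metadata object returns a new instance instead of mutating the original, so plain attribute assignment and _replace are not interchangeable.

from collections import namedtuple

Meta = namedtuple('Meta', ['attn_bias', 'num_prefills'])

m = Meta(attn_bias=None, num_prefills=2)
m2 = m._replace(attn_bias='causal_bias')

print(m.attn_bias)   # None -- the original instance is unchanged
print(m2.attn_bias)  # 'causal_bias'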
@@ -809,22 +807,15 @@ def _seq_len(self, attn_metadata): return attn_metadata.block_tables.size(1) * self.block_size def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: - prefill_metadata = subtuple(metadata.prefill_metadata, - 'TrimmedPrefillMetadata', + prefill_metadata = subtuple(metadata, + 'TrimmedAttentionMetadata', ['block_tables', 'seq_lens_tensor', - 'attn_bias']) - decode_metadata = subtuple(metadata.decode_metadata, - 'TrimmedDecodeMetadata', - ['block_tables', - 'seq_lens_tensor', - ]) - return subtuple(metadata, - 'TrimmedMetadata', - ['slot_mapping', - 'kv_cache_dtype'], - {'prefill_metadata': prefill_metadata, - 'decode_metadata': decode_metadata}) + 'attn_bias', + 'num_prefills', + 'num_decode_tokens', + 'slot_mapping']) + return prefill_metadata @torch.inference_mode() def execute_model( @@ -860,7 +851,7 @@ def execute_model( "input_ids": input_tokens, "positions": input_positions, "kv_caches": kv_caches, - "attn_metadata": attn_metadata, + "attn_metadata": self.trim_attn_metadata(attn_metadata), } if self.vision_language_config: execute_model_kwargs.update({"image_input": multi_modal_input}) From 2b850fe749946c132524eba90d2a63995b22b52e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 25 Jun 2024 20:47:34 +0300 Subject: [PATCH 048/341] worker_use_ray fix --- vllm/engine/async_llm_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 84d8b7913b4e0..a35820d36d322 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -390,7 +390,7 @@ def from_engine_args( from vllm.executor.cpu_executor import CPUExecutorAsync executor_class = CPUExecutorAsync elif engine_config.device_config.device_type == "hpu": - if engine_config.parallel_config.worker_use_ray or engine_args.engine_use_ray: + if distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) from vllm.executor.ray_habana_executor import RayHabanaExecutorAsync executor_class = RayHabanaExecutorAsync From 27285993ca35682d72275011c0c3308a2fa62961 Mon Sep 17 00:00:00 2001 From: jkaniecki <153085639+jkaniecki@users.noreply.github.com> Date: Wed, 26 Jun 2024 15:27:10 +0200 Subject: [PATCH 049/341] Update ops.py (#72) --- vllm/hpu/ops.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index fa9d5ff521a6a..54dd4332902a3 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -130,6 +130,7 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): num_experts = w1.shape[0] routing_weights = F.softmax(score, dim=1, dtype=torch.float32) routing_weights, selected_experts = torch.topk(routing_weights, topk, dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) routing_weights = routing_weights.to(hidden_states.dtype) final_hidden_states = torch.zeros( (1, B, D), dtype=hidden_states.dtype, device=hidden_states.device From 4a45bbfd28417da67acbea4e5ac7eb5d673be7a8 Mon Sep 17 00:00:00 2001 From: Tomasz Zielinski <85164140+tzielinski-habana@users.noreply.github.com> Date: Thu, 27 Jun 2024 14:05:21 +0200 Subject: [PATCH 050/341] Revert "Disable value splitting on G3 (#58)" (#74) This reverts commit 47c0c5b95c4d8f52f9991a495e3d021e73ea957e. 
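For reference, the value splitting that PA_SPLIT_VALUE toggles is numerically equivalent to a single matmul over the concatenated value blocks; the knob only changes how the work is chunked. A small illustrative sketch (shapes are arbitrary and not taken from the kernel):

import torch

torch.manual_seed(0)
block_size, num_blocks = 4, 3
attn = torch.randn(1, 8, 1, block_size * num_blocks).softmax(dim=-1)
values = torch.randn(1, 8, block_size * num_blocks, 64)

full = torch.matmul(attn, values)            # single matmul over all blocks
split = sum(torch.matmul(a, v)               # per-block matmuls, summed
            for a, v in zip(attn.split(block_size, dim=-1),
                            values.split(block_size, dim=-2)))
print(torch.allclose(full, split, atol=1e-6))  # True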
--- vllm/hpu/ops.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 54dd4332902a3..b66f6709977c8 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -9,14 +9,11 @@ import torch.nn as nn import torch.nn.functional as F import habana_frameworks.torch as htorch -import habana_frameworks.torch.utils.experimental as htexp from typing import List, Optional, Tuple import vllm.hpu.utils as hpu_utils -# FIXME: For some reason splitting value causes DFAs on G3. This needs to be debugged -PA_SPLIT_VALUE_DEFAULT = '0' if (htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi3) else '1' -PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', PA_SPLIT_VALUE_DEFAULT) == '1') +PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '1') == '1') def silu_and_mul(output, input): From 1fd06cc516f218b59e0342bf7b29f60e3e1f3149 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 27 Jun 2024 17:28:34 +0300 Subject: [PATCH 051/341] add collective crash WA --- vllm/distributed/communication_op.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index 32394a07b00b9..eb3046105f820 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -5,9 +5,14 @@ from .parallel_state import get_tp_group +from vllm.utils import is_hpu +if is_hpu(): + import habana_frameworks.torch as htorch def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: """All-reduce the input tensor across model parallel group.""" + if is_hpu(): + htorch.core.mark_step() return get_tp_group().all_reduce(input_) From 940f5250de17711a6a903f3fbd8695a85530c077 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 27 Jun 2024 17:34:15 +0300 Subject: [PATCH 052/341] add comment to the weird mark_step --- vllm/distributed/communication_op.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index eb3046105f820..233be75b47f5a 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -12,6 +12,9 @@ def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: """All-reduce the input tensor across model parallel group.""" if is_hpu(): + # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge + # occuring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used + # (which is required for tensor parallel HPUGraph inference) htorch.core.mark_step() return get_tp_group().all_reduce(input_) From 20eafe9fae7d55ecf5a4802b1c4480158e18f60f Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 28 Jun 2024 11:15:03 +0200 Subject: [PATCH 053/341] Add more metrics to high level profiler (#63) * Add more detailed event names to profiler * Add more profiler stats * separate prompt and decode batch utilization * Add more metrics * revert engine/metrics.py changes * un-singletonify (what a funny word) habana profiler * formatting * add batch block utilization metric * fix division by zero * fix batch_block_utilization formula * minor refactors --- vllm/worker/habana_model_runner.py | 82 ++++++++++++++++++++++++++---- 1 file changed, 72 insertions(+), 10 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 6a9cb6f066ea1..1a9206a314d5c 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -241,6 +241,7 @@ def __init__( self.scheduler_config = 
scheduler_config self.lora_config = lora_config self.load_config = load_config + self.cache_config = cache_config self.is_driver_worker = is_driver_worker self.profiler = Profiler() @@ -267,6 +268,9 @@ def __init__( self.lora_manager: LRUCacheWorkerLoRAManager = None self.model: torch.nn.Module = None + # Profiler stats + self.profiler_counter_helper = HabanaProfilerCounterHelper() + self._setup_buckets() def load_model(self) -> None: @@ -876,19 +880,18 @@ def execute_model( output.outputs = output.outputs[:real_batch_size] htorch.core.mark_step() - if self.is_driver_worker: + if self.is_driver_worker and self.profiler.enabled: # Stop recording 'execute_model' event self.profiler.end() event_end = self.profiler.get_timestamp_us() - duration = event_end - event_start - throughput = batch_size_padded / (duration / 1e6) - throughput_effective = real_batch_size / (duration / 1e6) - counters = { - 'batch_size': batch_size_padded, - 'batch_size_effective': real_batch_size, - 'throughput': throughput, - 'throughput_effective': throughput_effective - } + counters = self.profiler_counter_helper.get_counter_dict( + cache_config=self.cache_config, + duration=event_end-event_start, + seq_len=seq_len, + batch_size_padded=batch_size_padded, + real_batch_size=real_batch_size, + seq_group_metadata_list=seq_group_metadata_list, + is_prompt=is_prompt) self.profiler.record_counter(event_start, counters) return output @@ -1014,3 +1017,62 @@ def vocab_size(self) -> int: def _maybe_wrap_in_hpu_graph(model): return htorch.hpu.wrap_in_hpu_graph(HpuModelAdapter(model)) if htorch.utils.internal.is_lazy() else HpuModelAdapter(model) + + +class HabanaProfilerCounterHelper(): + def __init__(self): + self.niter = 0 + self.average_real_throughput = None + self.logged_once = False + + def get_counter_dict(self, cache_config, duration, seq_len, batch_size_padded, real_batch_size, seq_group_metadata_list, is_prompt): + throughput = batch_size_padded / (duration / 1e6) + throughput_effective = real_batch_size / (duration / 1e6) + real_seq_lens = [len(seq_data.prompt_token_ids) + len(seq_data.output_token_ids) for seq_group_metadata in seq_group_metadata_list for seq_data in seq_group_metadata.seq_data.values()] + real_max_seq_len = max(real_seq_lens) + real_num_tokens = sum(real_seq_lens) + padded_num_tokens = batch_size_padded * seq_len + batch_token_utilization = real_num_tokens / padded_num_tokens + if self.average_real_throughput is None: + self.average_real_throughput = throughput_effective + else: # https://www.heikohoffmann.de/htmlthesis/node134.html + self.average_real_throughput = self.average_real_throughput + 1/(self.niter+1) * (throughput_effective-self.average_real_throughput) + phase = "prompt" if is_prompt else "decode" + counters = { + f'{phase}_bucket_batch_size': batch_size_padded, + f'{phase}_batch_size': real_batch_size, + f'{phase}_bucket_seq_len': seq_len, + f'{phase}_seq_len': real_max_seq_len, + f'{phase}_bucket_gen_throughput': throughput, + f'{phase}_real_gen_throughput': throughput_effective, + f'{phase}_batch_token_utilization': batch_token_utilization, + 'average_real_throughput': self.average_real_throughput, + 'engine_iteration': self.niter, + } + self.niter += 1 + if is_prompt: + prompt_seq_lens = [len(seq_data.prompt_token_ids) for seq_group_metadata in seq_group_metadata_list for seq_data in seq_group_metadata.seq_data.values()] + prompt_bucket_in_throughput = (seq_len*batch_size_padded) / (duration / 1e6) + prompt_real_in_throughput = sum(prompt_seq_lens) / (duration / 1e6) + 
counters[f'{phase}_bucket_in_throughput'] = prompt_bucket_in_throughput + counters[f'{phase}_real_in_throughput'] = prompt_real_in_throughput + + # KV cache might not be created yet (e.g. for profiling run) + if cache_config.num_gpu_blocks is not None and cache_config.num_gpu_blocks != 0: + cache_num_blocks_used = [math.ceil(sl/cache_config.block_size) for sl in real_seq_lens] + cache_total_num_blocks_used = sum(cache_num_blocks_used) + num_cache_blocks = cache_config.num_gpu_blocks + cache_total_num_free_blocks = num_cache_blocks - cache_total_num_blocks_used + cache_computed_utilization = cache_total_num_blocks_used / num_cache_blocks + max_blocks_per_seq = math.ceil(seq_len/cache_config.block_size) + batch_block_utilization = cache_total_num_blocks_used / (batch_size_padded * max_blocks_per_seq) + counters['cache_num_blocks_used'] = cache_total_num_blocks_used + counters['cache_num_free_blocks'] = cache_total_num_free_blocks + counters['cache_computed_utilization'] = cache_computed_utilization + counters[f'{phase}_batch_block_utilization'] = batch_block_utilization + if not self.logged_once: + counters['const_cache_num_blocks'] = cache_config.num_gpu_blocks + counters['const_gpu_memory_utilization'] = cache_config.gpu_memory_utilization + counters['const_block_size'] = cache_config.block_size + self.logged_once = True + return counters From a3ac366a2cc9b47d5f167573b4f4baa3f8424c04 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 13:25:09 +0300 Subject: [PATCH 054/341] Revert test changes --- tests/async_engine/test_api_server.py | 2 - tests/async_engine/test_openapi_server_ray.py | 3 - .../test_basic_correctness.py | 3 +- .../basic_correctness/test_chunked_prefill.py | 2 - tests/basic_correctness/test_preemption.py | 6 - tests/core/block/e2e/test_correctness.py | 7 - tests/core/test_chunked_prefill_scheduler.py | 7 - tests/core/test_scheduler.py | 8 - tests/distributed/test_pynccl.py | 20 +- tests/engine/test_computed_prefix_blocks.py | 2 - tests/engine/test_skip_tokenizer_init.py | 2 - tests/engine/test_stop_reason.py | 2 - tests/engine/test_stop_strings.py | 3 - tests/entrypoints/openai/test_chat.py | 2 - .../openai/test_oot_registration.py | 3 +- tests/kernels/test_activation.py | 25 +-- tests/kernels/test_attention.py | 87 +++------ tests/kernels/test_cache.py | 176 +++--------------- tests/kernels/test_layernorm.py | 18 +- tests/kernels/test_moe.py | 3 - tests/kernels/test_pos_encoding.py | 19 +- tests/kernels/test_prefix_prefill.py | 10 +- tests/kernels/test_rand.py | 2 - tests/kernels/test_sampler.py | 4 - tests/lora/test_baichuan.py | 3 - tests/lora/test_chatglm3.py | 3 - tests/lora/test_gemma.py | 3 - tests/lora/test_layer_variation.py | 2 - tests/lora/test_layers.py | 6 - tests/lora/test_llama.py | 6 +- tests/lora/test_lora.py | 4 - tests/lora/test_lora_manager.py | 9 - tests/lora/test_punica.py | 4 - tests/lora/test_quant_model.py | 2 - tests/lora/test_worker.py | 4 - tests/metrics/test_metrics.py | 4 - tests/models/test_big_models.py | 4 +- tests/models/test_llava.py | 1 - tests/models/test_mistral.py | 2 - tests/models/test_models.py | 3 - tests/models/test_oot_registration.py | 1 - tests/quantization/test_configs.py | 5 +- tests/samplers/test_beam_search.py | 2 - tests/samplers/test_logits_processor.py | 2 - tests/samplers/test_logprobs.py | 2 - tests/samplers/test_ranks.py | 2 - tests/samplers/test_rejection_sampler.py | 18 +- tests/samplers/test_sampler.py | 30 ++- tests/samplers/test_seeded_generate.py | 2 - tests/spec_decode/e2e/test_compatibility.py 
| 4 - tests/spec_decode/e2e/test_logprobs.py | 6 - .../e2e/test_multistep_correctness.py | 12 -- .../spec_decode/e2e/test_ngram_correctness.py | 5 - tests/spec_decode/test_batch_expansion.py | 2 - tests/spec_decode/test_metrics.py | 6 - tests/spec_decode/test_multi_step_worker.py | 6 - tests/spec_decode/test_ngram_worker.py | 5 - tests/spec_decode/test_spec_decode_worker.py | 9 +- tests/tensorizer_loader/test_tensorizer.py | 10 +- tests/test_config.py | 3 +- tests/test_logits_processor.py | 13 +- tests/tokenization/test_detokenize.py | 9 - tests/worker/test_model_runner.py | 7 +- tests/worker/test_swap.py | 4 +- 64 files changed, 107 insertions(+), 534 deletions(-) diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 8b0e79cf9a6ee..7f57d5cf9b182 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -6,7 +6,6 @@ import pytest import requests -from vllm.utils import is_hpu def _query_server(prompt: str, max_tokens: int = 5) -> dict: @@ -45,7 +44,6 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool, uvicorn_process.terminate() -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("tokenizer_pool_size", [0, 2]) @pytest.mark.parametrize("worker_use_ray", [False, True]) @pytest.mark.parametrize("engine_use_ray", [False, True]) diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index 50fb22901f957..332937b874e93 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -3,7 +3,6 @@ # using Ray for overall ease of process management, parallel requests, # and debugging. import ray -from vllm.utils import is_hpu from ..utils import RemoteOpenAIServer @@ -20,8 +19,6 @@ def ray_ctx(): @pytest.fixture(scope="module") def server(ray_ctx): - if is_hpu(): - pytest.skip("Skipping test on HPU") return RemoteOpenAIServer([ "--model", MODEL_NAME, diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 72f36b0df98ac..a7b0fef533ccb 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -5,7 +5,6 @@ import weakref import pytest -from vllm.utils import is_hpu from vllm import LLM @@ -26,7 +25,7 @@ def test_vllm_gc_ed(): # because llm instance is not GC'ed. assert weak_llm() is None -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") + @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [5]) diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index d559537baa9be..767e0628765bd 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -7,7 +7,6 @@ Run `pytest tests/models/test_chunked_prefill.py`. 
""" import pytest -from vllm.utils import is_hpu from ..models.utils import check_outputs_equal @@ -17,7 +16,6 @@ ] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 30d9e3b36fedf..d60cc95d75433 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -11,7 +11,6 @@ from vllm import SamplingParams from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT, ENABLE_ARTIFICIAL_PREEMPT) -from vllm.utils import is_hpu from ..models.utils import check_outputs_equal @@ -25,7 +24,6 @@ "tests/basic_correctness/test_preemption.py`") -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [96]) @@ -70,7 +68,6 @@ def test_chunked_prefill_recompute( f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [96]) @@ -122,7 +119,6 @@ def test_preemption( assert total_preemption == total_recorded_preemption -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [96]) @@ -181,7 +177,6 @@ def test_swap( assert total_preemption == total_recorded_preemption -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [96]) @@ -226,7 +221,6 @@ def test_swap_infeasible( assert req_outputs[0].outputs[0].finish_reason == "length" -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [96]) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index 0d728f6b10047..8502eab0f8da0 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -3,12 +3,10 @@ import pytest from vllm import SamplingParams -from vllm.utils import is_hpu from .conftest import get_token_ids_from_llm_generator -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -87,7 +85,6 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator, assert baseline_token_ids == test_token_ids -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -155,7 +152,6 @@ def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator, assert baseline_token_ids == test_token_ids -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -255,7 +251,6 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, assert baseline_token_ids == test_token_ids -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [ @@ -326,7 +321,6 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator, assert baseline_token_ids == 
test_token_ids -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -408,7 +402,6 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption( assert baseline_token_ids == test_token_ids -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index 9347472a64a3a..a3b76327e0a53 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -7,7 +7,6 @@ from vllm.core.interfaces import AllocStatus from vllm.core.scheduler import Scheduler from vllm.sequence import Logprob, SequenceGroup -from vllm.utils import is_hpu from .utils import create_dummy_prompt @@ -28,7 +27,6 @@ def schedule_and_update_computed_tokens(scheduler): return metas, out -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_simple(): """Verify basic scheduling works.""" block_size = 4 @@ -71,7 +69,6 @@ def test_simple(): assert len(seq_group_meta) == num_seq_group -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_chunk(): """Verify prefills are chunked properly.""" block_size = 4 @@ -116,7 +113,6 @@ def test_chunk(): assert out.num_batched_tokens == 57 -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_complex(): block_size = 4 max_seqs = 60 @@ -180,7 +176,6 @@ def test_complex(): assert running[2].is_prefill() -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_maximal_decoding(): """Verify decoding requests are prioritized.""" block_size = 4 @@ -374,7 +369,6 @@ def cannot_append_second_group(seq_group, num_lookahead_slots): assert out.blocks_to_swap_out == [] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_running_prefill_prioritized_over_swap(): block_size = 4 max_seqs = 30 @@ -523,7 +517,6 @@ def cannot_append_second_group2(seq_group, num_lookahead_slots): assert out.num_batched_tokens == max_num_batched_tokens -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_chunked_prefill_max_seqs(): block_size = 4 max_seqs = 2 diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 3377d735f21b7..bae958211cb7b 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -11,7 +11,6 @@ from vllm.core.scheduler import Scheduler, SchedulingBudget from vllm.lora.request import LoRARequest from vllm.sequence import Logprob, SequenceGroup, SequenceStatus -from vllm.utils import is_hpu from .utils import create_dummy_prompt @@ -78,7 +77,6 @@ def test_scheduler_abort_seq_group(): assert scheduler.get_num_unfinished_seq_groups() == 0 -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_scheduler_schedule_simple(): block_size = 4 num_seq_group = 4 @@ -146,7 +144,6 @@ def test_scheduler_prefill_prioritized(): assert get_sequence_groups(out) == [seq_group_b] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_scheduler_schedule_preempt_abort(): block_size = 4 max_model_len = 16 @@ -196,7 +193,6 @@ def test_scheduler_schedule_preempt_abort(): assert scheduler.get_num_unfinished_seq_groups() == 1 -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_scheduler_max_seqs(): block_size = 4 num_seq_group = 4 @@ -238,7 +234,6 @@ def test_scheduler_max_seqs(): assert set(get_sequence_groups(out)) == set([all_seq_groups[1]]) -@pytest.mark.skipif(is_hpu(), 
reason="Skipping test on HPU") def test_scheduler_delay_factor(): block_size = 4 scheduler_config = SchedulerConfig(100, 64, 16, delay_factor=0.5) @@ -276,7 +271,6 @@ def test_scheduler_delay_factor(): append_new_token(out, 1) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_swapped_out_prioritized(): scheduler = initialize_scheduler(max_num_seqs=6) # best_of=2 * 3 == 6 sequences. @@ -578,7 +572,6 @@ def cannot_append_second_group(seq_group, num_lookahead_slots): assert output.blocks_to_copy == [] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_decode_swap_beam_search(): """ Test best_of > 1 swap out blocks @@ -629,7 +622,6 @@ def cannot_append_second_group(seq_group, num_lookahead_slots): assert output.blocks_to_copy == [] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_schedule_decode_blocks_to_copy_update(): """ Verify blocks_to_copy is updated. diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index cb404fef15797..e0e424439e3a5 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -8,13 +8,13 @@ from vllm.distributed.communication_op import ( # noqa tensor_model_parallel_all_reduce) -from vllm.utils import update_environment_variables, is_hpu -if not is_hpu(): - from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator - from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary - from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, - get_world_group, graph_capture, - init_distributed_environment) +from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator +from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary +from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, + get_world_group, graph_capture, + init_distributed_environment) +from vllm.utils import update_environment_variables + def distributed_run(fn, world_size): number_of_processes = world_size @@ -65,7 +65,6 @@ def worker_fn(): assert result == pynccl_comm.world_size -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test.") def test_pynccl(): @@ -95,7 +94,6 @@ def multiple_allreduce_worker_fn(): assert result == 2 -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test.") def test_pynccl_multiple_allreduce(): @@ -122,7 +120,6 @@ def multiple_allreduce_with_vllm_worker_fn(): assert result == 2 -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test.") def test_pynccl_multiple_allreduce_with_vllm(): @@ -153,7 +150,6 @@ def worker_fn_with_cudagraph(): assert a.mean().cpu().item() == pynccl_comm.world_size**1 -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test.") def test_pynccl_with_cudagraph(): @@ -224,7 +220,7 @@ def multiple_send_recv_worker_fn(): else: assert result == 2 -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") + @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test.") def test_pynccl_multiple_send_recv(): diff --git a/tests/engine/test_computed_prefix_blocks.py 
b/tests/engine/test_computed_prefix_blocks.py index ec64cdd9749ff..ed35212cc3f11 100644 --- a/tests/engine/test_computed_prefix_blocks.py +++ b/tests/engine/test_computed_prefix_blocks.py @@ -3,10 +3,8 @@ from vllm.engine.arg_utils import EngineArgs from vllm.engine.llm_engine import LLMEngine from vllm.sampling_params import SamplingParams -from vllm.utils import is_hpu -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("block_size", [16]) def test_computed_prefix_blocks(model: str, block_size: int): diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py index 56faa15d14c3d..338b208723ba9 100644 --- a/tests/engine/test_skip_tokenizer_init.py +++ b/tests/engine/test_skip_tokenizer_init.py @@ -2,10 +2,8 @@ from vllm.entrypoints.llm import LLM from vllm.sampling_params import SamplingParams -from vllm.utils import is_hpu -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", ["facebook/opt-125m"]) def test_skip_tokenizer_initialization(model: str): # This test checks if the flag skip_tokenizer_init skips the initialization diff --git a/tests/engine/test_stop_reason.py b/tests/engine/test_stop_reason.py index d87ff3e39b3e3..b0bd6c4aa95d3 100644 --- a/tests/engine/test_stop_reason.py +++ b/tests/engine/test_stop_reason.py @@ -10,7 +10,6 @@ import transformers from vllm import SamplingParams -from vllm.utils import is_hpu MODEL = "facebook/opt-350m" STOP_STR = "." @@ -24,7 +23,6 @@ def vllm_model(vllm_runner): yield vllm_model -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_stop_reason(vllm_model, example_prompts): tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL) stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR) diff --git a/tests/engine/test_stop_strings.py b/tests/engine/test_stop_strings.py index 18afc02b88ba3..1584b85aeb064 100644 --- a/tests/engine/test_stop_strings.py +++ b/tests/engine/test_stop_strings.py @@ -3,7 +3,6 @@ import pytest from vllm import CompletionOutput, LLMEngine, SamplingParams -from vllm.utils import is_hpu MODEL = "meta-llama/llama-2-7b-hf" MAX_TOKENS = 200 @@ -11,8 +10,6 @@ @pytest.fixture(scope="session") def vllm_model(vllm_runner): - if is_hpu(): - pytest.skip("Skipping test on HPU") with vllm_runner(MODEL) as vllm_model: yield vllm_model diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 639f4d3fd6361..f4c0af1adfdf9 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -13,8 +13,6 @@ # downloading lora to test lora requests from huggingface_hub import snapshot_download from openai import BadRequestError -from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.utils import is_hpu from ...utils import RemoteOpenAIServer diff --git a/tests/entrypoints/openai/test_oot_registration.py b/tests/entrypoints/openai/test_oot_registration.py index fea991be6b913..dbbda6de1fa09 100644 --- a/tests/entrypoints/openai/test_oot_registration.py +++ b/tests/entrypoints/openai/test_oot_registration.py @@ -7,7 +7,7 @@ from vllm import ModelRegistry from vllm.model_executor.models.opt import OPTForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.utils import get_open_port, is_hpu +from vllm.utils import get_open_port class MyOPTForCausalLM(OPTForCausalLM): @@ -31,7 +31,6 @@ def server_function(port): 
runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_oot_registration_for_api_server(): port = get_open_port() ctx = torch.multiprocessing.get_context() diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index f5da4f55d9231..a4b9f91c7688b 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -5,7 +5,6 @@ from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul, NewGELU, SiluAndMul) -from vllm.utils import is_hpu from .allclose_default import get_default_atol, get_default_rtol @@ -13,12 +12,9 @@ NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing D = [512, 4096, 5120, 13824] # Arbitrary values for testing SEEDS = [0] -if is_hpu(): - DEVICES = ["hpu"] -else: - DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) - ] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] @pytest.mark.parametrize("activation", ["silu", "gelu", "gelu_tanh"]) @@ -26,7 +22,7 @@ @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_act_and_mul( activation: str, @@ -36,15 +32,9 @@ def test_act_and_mul( seed: int, device: str, ) -> None: - - if is_hpu() and activation != "silu": - pytest.skip("Only SiluAndMul supported on HPU.") - torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - elif is_hpu(): - torch.hpu.manual_seed(seed) torch.set_default_device(device) x = torch.randn(num_tokens, 2 * d, dtype=dtype) if activation == "silu": @@ -65,7 +55,7 @@ def test_act_and_mul( @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_activation( activation: Type[torch.nn.Module], @@ -75,14 +65,9 @@ def test_activation( seed: int, device: str, ) -> None: - if is_hpu(): - pytest.skip("GELU not supported on HPU.") - torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - elif is_hpu(): - torch.hpu.manual_seed(seed) torch.set_default_device(device) x = torch.randn(num_tokens, d, dtype=dtype) layer = activation() diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 5d141fb111407..f848ad51c7014 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -3,23 +3,18 @@ import pytest import torch +from xformers import ops as xops +from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask -from vllm.utils import get_max_shared_memory_bytes, is_hip, is_hpu -if is_hpu(): - from vllm.hpu import ops, cache_ops - from vllm.hpu import xops - from vllm.hpu.attn_bias import BlockDiagonalCausalMask -else: - from vllm._C import ops, cache_ops - from xformers import ops as xops - from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask +from vllm import _custom_ops as ops +from vllm.utils import get_max_shared_memory_bytes, is_hip from .allclose_default import get_default_atol, get_default_rtol FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. 
# - 512 as a buffer -MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512 if not is_hpu() else 128 +MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512 # There may not be enough gpu memory due to large NUM_BLOCKS. # Reduce NUM_BLOCKS when it happens. NUM_BLOCKS = 4321 # Arbitrary values for testing @@ -40,12 +35,9 @@ USE_ALIBI = [False, True] KV_CACHE_DTYPE = ["auto", "fp8"] SEEDS = [0] -if is_hpu(): - DEVICES = ["hpu"] -else: - DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) - ] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] def ref_masked_attention( @@ -75,14 +67,9 @@ def ref_single_query_cached_kv_attention( alibi_slopes: Optional[torch.Tensor], ) -> None: num_query_heads = query.shape[1] - if not is_hpu(): - num_kv_heads = value_cache.shape[1] - head_size = value_cache.shape[2] - block_size = value_cache.shape[3] - else: - block_size = value_cache.shape[1] - num_kv_heads = value_cache.shape[2] - head_size = value_cache.shape[3] + num_kv_heads = value_cache.shape[1] + head_size = value_cache.shape[2] + block_size = value_cache.shape[3] num_seqs = query.shape[0] block_tables_lst = block_tables.cpu().tolist() @@ -98,18 +85,12 @@ def ref_single_query_cached_kv_attention( block_number = int(block_table[j // block_size]) block_offset = j % block_size - if is_hpu(): - k = key_cache[block_number, block_offset, :, :] - else: - k = key_cache[block_number, :, :, block_offset, :] + k = key_cache[block_number, :, :, block_offset, :] k = k.reshape(num_kv_heads, head_size) keys_lst.append(k) - if is_hpu(): - v = value_cache[block_number, block_offset, :, :] - else: - v = value_cache[block_number, :, :, block_offset] - values.append(v) + v = value_cache[block_number, :, :, block_offset] + values_lst.append(v) keys = torch.stack(keys_lst, dim=0) values = torch.stack(values_lst, dim=0) if num_queries_per_kv > 1: @@ -139,7 +120,7 @@ def ref_single_query_cached_kv_attention( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_paged_attention( kv_cache_factory, version: str, @@ -153,24 +134,14 @@ def test_paged_attention( seed: int, device: str, ) -> None: - if is_hpu(): - if version != "v1": - pytest.skip("Paged attention v2 not supported on HPU") - if kv_cache_dtype != "auto": - pytest.skip("Only auto kv_cache_dtype supported on HPU") - if use_alibi: - pytest.skip("Alibi not supported on HPU") - random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - elif is_hpu(): - torch.hpu.manual_seed(seed) torch.set_default_device(device) scale = float(1.0 / (head_size**0.5)) num_query_heads, num_kv_heads = num_heads - query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype, device=device) + query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype) query.uniform_(-scale, scale) assert num_query_heads % num_kv_heads == 0 @@ -193,7 +164,8 @@ def test_paged_attention( for _ in range(max_num_blocks_per_seq) ] block_tables_lst.append(block_table) - block_tables = torch.tensor(block_tables_lst, dtype=torch.int, device=device) + + block_tables = torch.tensor(block_tables_lst, dtype=torch.int) # Create the KV caches. 
key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1, @@ -207,21 +179,7 @@ def test_paged_attention( # Call the paged attention kernel. output = torch.empty_like(query) - - if is_hpu(): - output = ops.paged_attention_v1( - query, - key_cache, - value_cache, - num_kv_heads, - scale, - block_tables, - seq_lens, - block_size, - alibi_slopes, - kv_cache_dtype, - ) - elif version == "v1": + if version == "v1": ops.paged_attention_v1( output, query, @@ -351,13 +309,12 @@ def ref_multi_query_kv_attention( # TODO(woosuk): Add tests for USE_ALIBI=True. -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_multi_query_kv_attention( num_seqs: int, @@ -384,8 +341,7 @@ def test_multi_query_kv_attention( qkv = torch.empty(num_tokens, num_query_heads + 2 * num_kv_heads, head_size, - dtype=dtype, - device=device) + dtype=dtype) qkv.uniform_(-scale, scale) query, key, value = qkv.split( [num_query_heads, num_kv_heads, num_kv_heads], dim=1) @@ -419,5 +375,4 @@ def test_multi_query_kv_attention( ) atol = get_default_atol(output) if is_hip() else 1e-3 rtol = get_default_rtol(output) if is_hip() else 1e-5 - assert torch.allclose(output, ref_output, atol=atol, rtol=rtol) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 83fa7e47bcfac..23b6baa60c05b 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -1,13 +1,12 @@ import random from typing import List, Tuple -import math import pytest import torch from vllm import _custom_ops as ops -from vllm.utils import is_hpu +COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing @@ -21,14 +20,11 @@ NUM_MAPPINGS = [256] # Arbitrary values for testing SEEDS = [0] -if is_hpu(): - COPYING_DIRECTION = [('hpu', 'cpu'), ('hpu', 'hpu'), ('cpu', 'hpu')] - DEVICES = ["hpu"] -else: - COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] - DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) - ] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] + +# We assume fp8 is always enabled for testing. KV_CACHE_DTYPE = ["auto", "fp8"] @@ -40,8 +36,8 @@ @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) -@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_copy_blocks( kv_cache_factory, @@ -56,15 +52,10 @@ def test_copy_blocks( kv_cache_dtype: str, device: str, ) -> None: - if is_hpu() and kv_cache_dtype != "auto": - pytest.skip("Only auto kv_cache_dtype supported on HPU") - random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - elif is_hpu(): - torch.hpu.manual_seed(seed) torch.set_default_device(device) # Generate random block mappings where each source block is mapped to two # destination blocks. 
@@ -87,24 +78,14 @@ def test_copy_blocks( dtype, seed, device) # Clone the KV caches. - cloned_key_caches = [key_cache.clone().to("cpu") for key_cache in key_caches] - cloned_value_caches = [value_cache.clone().to("cpu") for value_cache in value_caches] + cloned_key_caches = [key_cache.clone() for key_cache in key_caches] + cloned_value_caches = [value_cache.clone() for value_cache in value_caches] # Call the copy blocks kernel. block_mapping_tensor = torch.tensor(block_mapping, dtype=torch.int64, device=device).view(-1, 2) - if is_hpu(): - tmp_block_mapping_dict = {} - for src, dst in block_mapping: - if not tmp_block_mapping_dict.get(src): - tmp_block_mapping_dict[src] = [dst] - continue - tmp_block_mapping_dict[src].append(dst) - - ops.copy_blocks(key_caches, value_caches, tmp_block_mapping_dict) - else: - ops.copy_blocks(key_caches, value_caches, block_mapping_tensor) + ops.copy_blocks(key_caches, value_caches, block_mapping_tensor) # Run the reference implementation. for src, dst in block_mapping: @@ -128,7 +109,7 @@ def test_copy_blocks( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_reshape_and_cache( @@ -143,16 +124,11 @@ def test_reshape_and_cache( device: str, kv_cache_dtype: str, ) -> None: - if is_hpu() and kv_cache_dtype != "auto": - pytest.skip("Only auto kv_cache_dtype supported on HPU") random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - elif is_hpu(): - torch.hpu.manual_seed(seed) torch.set_default_device(device) - # Create a random slot mapping. num_slots = block_size * num_blocks slot_mapping_lst = random.sample(range(num_slots), num_tokens) @@ -182,8 +158,9 @@ def test_reshape_and_cache( kv_scale = 1.0 # Call the reshape_and_cache kernel. - cache_ops.reshape_and_cache(key, value, key_cache, value_cache, - slot_mapping, "auto") + ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, + kv_cache_dtype, kv_scale) + if kv_cache_dtype == "fp8": result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) ops.convert_fp8(result_key_cache, key_cache) @@ -191,23 +168,16 @@ def test_reshape_and_cache( ops.convert_fp8(result_value_cache, value_cache) # Run the reference implementation. 
- if not is_hpu(): - reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) - block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") - block_indices = block_indices.cpu().tolist() + reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) + block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") + block_indicies_lst = block_indicies.cpu().tolist() block_offsets = slot_mapping % block_size block_offsets_lst = block_offsets.cpu().tolist() for i in range(num_tokens): - block_idx = block_indices[i] - block_offset = block_offsets[i] - if is_hpu(): - cloned_key_cache[block_idx, block_offset, :, :] = key[i] - else: - cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] - if is_hpu(): - cloned_value_cache[block_idx, block_offset, :, :] = value[i] - else: - cloned_value_cache[block_idx, :, :, block_offset] = value[i] + block_idx = block_indicies_lst[i] + block_offset = block_offsets_lst[i] + cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] + cloned_value_cache[block_idx, :, :, block_offset] = value[i] if kv_cache_dtype == "fp8": assert torch.allclose(result_key_cache, @@ -223,7 +193,6 @@ def test_reshape_and_cache( assert torch.allclose(value_cache, cloned_value_cache) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @@ -231,7 +200,7 @@ def test_reshape_and_cache( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_reshape_and_cache_flash( @@ -312,7 +281,7 @@ def test_reshape_and_cache_flash( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_swap_blocks( @@ -328,23 +297,15 @@ def test_swap_blocks( device: str, kv_cache_dtype: str, ) -> None: - if is_hpu() and direction[0] == "hpu" and direction[1] == "cpu": - pytest.skip("Skipping test on HPU") if kv_cache_dtype == "fp8" and "cpu" in direction: pytest.skip() random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - elif is_hpu(): - torch.hpu.manual_seed(seed) - if is_hpu(): - src_device = device if direction[0] == "hpu" else 'cpu' - dst_device = device if direction[1] == "hpu" else 'cpu' - else: - src_device = device if direction[0] == "cuda" else 'cpu' - dst_device = device if direction[1] == "cuda" else 'cpu' + src_device = device if direction[0] == "cuda" else 'cpu' + dst_device = device if direction[1] == "cuda" else 'cpu' src_blocks = random.sample(range(num_blocks), num_mappings) # For the same device, mapping must not overlap @@ -385,14 +346,13 @@ def test_swap_blocks( dist_value_caches[0][dst].cpu()) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("block_size", BLOCK_SIZES) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", 
DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_fp8_e4m3_conversion( num_heads: int, @@ -420,87 +380,3 @@ def test_fp8_e4m3_conversion( ops.convert_fp8(converted_cache, cache_fp8) assert torch.allclose(cache, converted_cache, atol=0.001, rtol=0.1) - - -@pytest.mark.skipif(not is_hpu(), reason="This case is HPU-specific") -@pytest.mark.parametrize("num_tokens", NUM_TOKENS) -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("block_size", BLOCK_SIZES) -@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) -@torch.inference_mode() -def test_reshape_and_cache_prompt( - kv_cache_factory, - num_tokens: int, - num_heads: int, - head_size: int, - block_size: int, - num_blocks: int, - dtype: torch.dtype, - seed: int, - device: str, -) -> None: - random.seed(seed) - torch.random.manual_seed(seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed(seed) - elif is_hpu(): - torch.hpu.manual_seed(seed) - torch.set_default_device(device) - - # Create a random slot mapping. - num_block_indices_to_generate = math.ceil(num_tokens / block_size) - block_indices_ = random.sample(range(num_blocks), num_block_indices_to_generate) - block_offsets_ = [] - slot_mapping = [] - for i in block_indices_: - for j in range(block_size): - slot_mapping.append(i * block_size + j) - slot_mapping = slot_mapping[:num_tokens] - slot_mapping = torch.tensor(slot_mapping, dtype=torch.long) - - qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype) - _, key, value = qkv.unbind(dim=1) - - # Create the KV caches. - key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1, - num_heads, head_size, dtype, - None, seed, device) - key_cache, value_cache = key_caches[0], value_caches[0] - - # Clone the KV caches. - cloned_key_cache = key_cache.clone() - cloned_value_cache = value_cache.clone() - - # Call the reshape_and_cache kernel. - cache_ops.reshape_and_cache(key, value, key_cache, value_cache, - slot_mapping.view((1, -1)), "auto", True) - - # Run the reference implementation. - if is_hpu(): - reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0].shape) - else: - reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) - block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") - block_indices = block_indices.cpu().tolist() - block_offsets = slot_mapping % block_size - block_offsets = block_offsets.cpu().tolist() - for i in range(0, num_tokens): - block_idx = block_indices[i] - block_offset = block_offsets[i] - cloned_key_cache[block_idx, :, :, block_offset] = key[i, :, :] - cloned_value_cache[block_idx, :, :, block_offset] = value[i, :, :] - - # Note: only checking cache areas specified by the slot mapping because - # the implementation may initialize whole blocks even if some of the offsets of the block - # are not present in the slot mapping. 
- for i in range(0, num_tokens): - block_idx = block_indices[i] - block_offset = block_offsets[i] - assert torch.allclose(key_cache[block_idx, :, :, block_offset], - cloned_key_cache[block_idx, :, :, block_offset]) - assert torch.allclose(value_cache[block_idx, :, :, block_offset], - cloned_value_cache[block_idx, :, :, block_offset]) diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index 0cd33494f9a1e..a635e6c12c594 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -2,7 +2,6 @@ import torch from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.utils import is_hpu DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing @@ -10,12 +9,9 @@ 8199] # Arbitrary values for testing ADD_RESIDUAL = [False, True] SEEDS = [0] -if is_hpu(): - DEVICES = ["hpu"] -else: - DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) - ] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @@ -23,7 +19,7 @@ @pytest.mark.parametrize("add_residual", ADD_RESIDUAL) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_rms_norm( num_tokens: int, @@ -33,18 +29,14 @@ def test_rms_norm( seed: int, device: str, ) -> None: - if is_hpu() and dtype == torch.half and add_residual: - pytest.skip("Skipping test on HPU") torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - elif is_hpu(): - torch.hpu.manual_seed(seed) torch.set_default_device(device) layer = RMSNorm(hidden_size).to(dtype=dtype) layer.weight.data.normal_(mean=1.0, std=0.1) scale = 1 / (2 * hidden_size) - x = torch.randn(1, num_tokens, hidden_size, dtype=dtype, device=device) + x = torch.randn(num_tokens, hidden_size, dtype=dtype) x *= scale residual = torch.randn_like(x) * scale if add_residual else None diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 8d52fbaa6cc25..2356b9ec18b0d 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -10,7 +10,6 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.models.mixtral import MixtralMoE -from vllm.utils import is_hpu def torch_moe(a, w1, w2, score, topk): @@ -30,7 +29,6 @@ def torch_moe(a, w1, w2, score, topk): topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("m", [512, 222, 33, 1]) @pytest.mark.parametrize("n", [2048, 256, 1024]) @pytest.mark.parametrize("k", [128, 511, 1024]) @@ -55,7 +53,6 @@ def test_fused_moe( assert torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) @torch.inference_mode() diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 0e08055bf12fe..4c83659929d41 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -5,7 +5,6 @@ import torch from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.utils import is_hpu from .allclose_default import get_default_atol, get_default_rtol @@ -17,15 +16,11 
@@ BATCH_SIZES = [1, 5] # Arbitrary values for testing SEQ_LENS = [11, 8192] # Arbitrary values for testing SEEDS = [0] -if is_hpu(): - DEVICES = ["hpu"] -else: - DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) - ] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @pytest.mark.parametrize("batch_size", BATCH_SIZES) @pytest.mark.parametrize("seq_len", SEQ_LENS) @@ -34,7 +29,7 @@ @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_rotary_embedding( is_neox_style: bool, @@ -82,7 +77,6 @@ def test_rotary_embedding( rtol=get_default_rtol(out_key)) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @pytest.mark.parametrize("batch_size", BATCH_SIZES) @pytest.mark.parametrize("seq_len", SEQ_LENS) @@ -91,7 +85,7 @@ def test_rotary_embedding( @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_batched_rotary_embedding( is_neox_style: bool, @@ -145,7 +139,6 @@ def test_batched_rotary_embedding( rtol=get_default_rtol(out_key)) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @pytest.mark.parametrize("batch_size", BATCH_SIZES) @pytest.mark.parametrize("seq_len", SEQ_LENS) @@ -154,7 +147,7 @@ def test_batched_rotary_embedding( @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_batched_rotary_embedding_multi_lora( is_neox_style: bool, diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 1240411841d3f..99fda8364dc0e 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -4,16 +4,11 @@ import pytest import torch +from xformers import ops as xops +from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask from vllm.attention.backends.xformers import _make_alibi_bias from vllm.attention.ops.prefix_prefill import context_attention_fwd -from vllm.utils import is_hpu -if is_hpu(): - from vllm.hpu import xops - from vllm.hpu.attn_bias import BlockDiagonalCausalFromBottomRightMask -else: - from xformers import ops as xops - from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask NUM_HEADS = [64] NUM_QUERIES_PER_KV = [1, 8, 64] @@ -25,7 +20,6 @@ SLIDING_WINDOW = [0, 16, 64, 128, 256, 512, 2048] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV) @pytest.mark.parametrize("head_size", HEAD_SIZES) diff --git a/tests/kernels/test_rand.py b/tests/kernels/test_rand.py index a96a238834a49..a4242d22eb489 100644 --- a/tests/kernels/test_rand.py +++ b/tests/kernels/test_rand.py @@ -5,10 +5,8 @@ from 
vllm.model_executor.layers.ops.rand import seeded_uniform from vllm.model_executor.utils import set_random_seed -from vllm.utils import is_hpu -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) @pytest.mark.parametrize("use_3d", [True, False]) diff --git a/tests/kernels/test_sampler.py b/tests/kernels/test_sampler.py index 4bab8caedbf62..e28f809309ec5 100644 --- a/tests/kernels/test_sampler.py +++ b/tests/kernels/test_sampler.py @@ -10,7 +10,6 @@ sample) from vllm.model_executor.sampling_metadata import SamplingTensors from vllm.model_executor.utils import set_random_seed -from vllm.utils import is_hpu SINGLE_SPLIT_VOCAB_SIZE = 32000 # llama/mistral/mixtral vocab size MULTI_SPLIT_VOCAB_SIZE = MAX_TRITON_N_COLS + 100 @@ -31,7 +30,6 @@ def _uniform_to_exponential_kernel(input, output, n: tl.constexpr): tl.store(output + idx, y) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_uniform_to_exponential(): """Test that we can convert uniform to exponential without div by 0.""" input = torch.tensor([0.0, 1.0 - torch.finfo(torch.float32).eps], @@ -44,7 +42,6 @@ def test_uniform_to_exponential(): assert torch.all(torch.isfinite(torch.full_like(output, 1.0) / output)) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("random_sampling", [True, False, "mixed"]) @pytest.mark.parametrize("max_best_of", [1, 2, 3, 4, 5]) @pytest.mark.parametrize("modify_greedy_probs", [True, False]) @@ -124,7 +121,6 @@ def test_sample_decoding_only(random_sampling, max_best_of, assert sampled_logprobs is None -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("random_sampling", [True, False, "mixed"]) @pytest.mark.parametrize("max_best_of", [1, 2, 3, 4, 5]) @pytest.mark.parametrize("modify_greedy_probs", [True, False]) diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index 4a00abd15266a..56cec4db89e64 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -4,7 +4,6 @@ import vllm from vllm.lora.request import LoRARequest -from vllm.utils import is_hpu from .conftest import cleanup @@ -42,7 +41,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_baichuan_lora(baichuan_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, @@ -65,7 +63,6 @@ def test_baichuan_lora(baichuan_lora_files): assert output2[i] == expected_lora_output[i] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skip("Requires multiple GPUs") @pytest.mark.parametrize("fully_sharded", [True, False]) def test_baichuan_tensor_parallel_equality(baichuan_lora_files, fully_sharded): diff --git a/tests/lora/test_chatglm3.py b/tests/lora/test_chatglm3.py index c17ebac3e4543..de4cbea80924e 100644 --- a/tests/lora/test_chatglm3.py +++ b/tests/lora/test_chatglm3.py @@ -1,9 +1,7 @@ -import pytest from typing import List import vllm from vllm.lora.request import LoRARequest -from vllm.utils import is_hpu MODEL_PATH = "THUDM/chatglm3-6b" @@ -39,7 +37,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_chatglm3_lora(chatglm3_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 
6022b82e8a7cb..709246179bfe4 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -1,9 +1,7 @@ -import pytest from typing import List import vllm from vllm.lora.request import LoRARequest -from vllm.utils import is_hpu MODEL_PATH = "google/gemma-7b" @@ -30,7 +28,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_gemma_lora(gemma_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, diff --git a/tests/lora/test_layer_variation.py b/tests/lora/test_layer_variation.py index 8b6585e4cf76e..ec9776b77df76 100644 --- a/tests/lora/test_layer_variation.py +++ b/tests/lora/test_layer_variation.py @@ -8,7 +8,6 @@ import vllm from vllm.lora.request import LoRARequest -from vllm.utils import is_hpu from .conftest import cleanup @@ -71,7 +70,6 @@ def do_sample(llm: vllm.LLM, # step 1: init a base model and serve with LoRA to get the reference results # step 2: merge the same LoRA to the base model, serve the merged model # step 3: compare the results from step 1 and step 2 -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("tp_size", [1]) @pytest.mark.parametrize("target_modules", TARGET_MODULES_LIST) @pytest.mark.parametrize("rank", [8, 16, 32, 64]) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 2908bc7ee70af..2e51e95a38f2e 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -36,7 +36,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask) from vllm.model_executor.utils import set_random_seed -from vllm.utils import is_hpu from .utils import DummyLoRAManager @@ -179,7 +178,6 @@ def create_random_inputs( return inputs, index_mapping, prompt_mapping -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) @@ -274,7 +272,6 @@ def create_random_embedding_layer(): atol=atol) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() # @pytest.mark.skip( # reason="Fails when loras are in any slot other than the first.") @@ -412,7 +409,6 @@ def create_random_embedding_layer(): atol=atol) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) @@ -537,7 +533,6 @@ def _pretest(): atol=atol) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("orientation", ["row", "column"]) @@ -649,7 +644,6 @@ def create_random_linear_parallel_layer(): atol=atol) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("repeats", [1, 2, 3]) diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index 42d7e7ab78e16..ad8490353998f 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -5,7 +5,6 @@ import vllm from vllm.lora.request import LoRARequest -from vllm.utils import is_hpu from .conftest import cleanup @@ -38,7 +37,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") return generated_texts 
-@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") + @pytest.mark.parametrize("tp_size", [1, 2, 4]) def test_llama_lora(sql_lora_files, tp_size, num_gpus_available): if num_gpus_available < tp_size: @@ -81,7 +80,7 @@ def test_llama_lora(sql_lora_files, tp_size, num_gpus_available): print("removing lora") -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") + def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available): if num_gpus_available < 4: pytest.skip("Not enough GPUs for tensor parallelism 4") @@ -121,7 +120,6 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available): assert output_tp1 == output_tp4 -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_llama_lora_warmup(sql_lora_files): """Test that the LLM initialization works with a warmup LORA path and is more conservative""" diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py index 90363305e137c..3415d36b7e341 100644 --- a/tests/lora/test_lora.py +++ b/tests/lora/test_lora.py @@ -2,7 +2,6 @@ import torch from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice -from vllm.utils import is_hpu from .utils import DummyLoRAManager @@ -22,7 +21,6 @@ } -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("m", TENSOR_SIZES) @pytest.mark.parametrize("n", TENSOR_SIZES) @pytest.mark.parametrize("k", BATCH_SIZES) @@ -73,7 +71,6 @@ def test_apply_lora(m, n, k, rank, dtype) -> None: manager.reset_lora() -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("m", TENSOR_SIZES) @pytest.mark.parametrize("n", TENSOR_SIZES) @pytest.mark.parametrize("k", BATCH_SIZES) @@ -143,7 +140,6 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: manager.reset_lora() -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("qkv", QKV_TENSOR_SIZES) @pytest.mark.parametrize("n", TENSOR_SIZES) @pytest.mark.parametrize("k", BATCH_SIZES) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index f25e55e0b2ea3..2133bce14957b 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -17,7 +17,6 @@ from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager, WorkerLoRAManager) from vllm.model_executor.layers.linear import RowParallelLinear -from vllm.utils import is_hpu EMBEDDING_MODULES = { "embed_tokens": "input_embeddings", @@ -27,7 +26,6 @@ EMBEDDING_PADDING_MODULES = ["lm_head"] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_from_lora_tensors(sql_lora_files): tensors = load_file( os.path.join(sql_lora_files, "adapter_model.safetensors")) @@ -100,7 +98,6 @@ def create_packed_lora( return LoRAModel(lora_id, 8, loras) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_replace_submodules(dist_init, dummy_model): model = dummy_model model.supported_lora_modules = ["dense1", "layer1.dense2"] @@ -119,7 +116,6 @@ def test_replace_submodules(dist_init, dummy_model): RowParallelLinearWithLoRA) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_lora_model_manager(dist_init, dummy_model): model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] @@ -166,7 +162,6 @@ def test_lora_model_manager(dist_init, dummy_model): assert manager.lora_index_to_id[1] == 2 -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_lora_lru_cache_model_manager(dist_init, dummy_model): model = dummy_model model.supported_lora_modules 
= ["dense1", "dense2", "lm_head"] @@ -244,7 +239,6 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model): assert manager.pin_lora(3) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_lru_lora_model_manager(dist_init, dummy_model): # This tests just the LRU cache functionality, everything else is # tested in test_lora_model_manager @@ -359,7 +353,6 @@ def test_lru_lora_model_manager(dist_init, dummy_model): assert set(manager.list_loras()) == {1} -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_lru_cache_worker_lora_manager(llama_2_7b_model_extra_embeddings, sql_lora_files): lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) @@ -433,7 +426,6 @@ def test_lru_cache_worker_lora_manager(llama_2_7b_model_extra_embeddings, ], mapping) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_worker_lora_manager(llama_2_7b_model_extra_embeddings, sql_lora_files): # Should remove every LoRA not specified in the request. @@ -504,7 +496,6 @@ def test_worker_lora_manager(llama_2_7b_model_extra_embeddings, ], mapping) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_packed_loras(dist_init, dummy_model_gate_up): model = dummy_model_gate_up model.supported_lora_modules = ["gate_up_proj"] diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index fccfcb1864422..dbeb16cb21ad3 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -4,7 +4,6 @@ import torch import vllm.lora.punica as punica -from vllm.utils import is_hpu def assert_close(a, b): @@ -130,7 +129,6 @@ def _lora_ref_impl( ] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) @pytest.mark.parametrize("h1", H1) @pytest.mark.parametrize("r", R) @@ -173,7 +171,6 @@ def test_lora_a_extra_shapes(dtype_str, h1, r, seed): assert_close(y_ref, y_our) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) @pytest.mark.parametrize("h1", H1) @pytest.mark.parametrize("h2", H2) @@ -208,7 +205,6 @@ def test_lora_correctness(dtype_str, h1, h2, seed, device): assert_close(y_ref, y_our) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) @pytest.mark.parametrize("h1", H1) @pytest.mark.parametrize("h2", H2) diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 73d87bda255f4..8fd968c69e58f 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -7,7 +7,6 @@ import vllm from vllm.lora.request import LoRARequest -from vllm.utils import is_hpu from .conftest import cleanup @@ -58,7 +57,6 @@ def format_prompt_tuples(prompt): return generated_texts -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tp_size", [1]) def test_quant_model_lora(tinyllama_lora_files, model, tp_size): diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 943a9170605c2..732e91a52c0a9 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -3,17 +3,13 @@ import tempfile from unittest.mock import patch -import pytest - from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig) from vllm.lora.models import LoRAMapping from vllm.lora.request import LoRARequest from vllm.worker.worker import Worker -from vllm.utils 
import is_hpu -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @patch.dict(os.environ, {"RANK": "0"}) def test_worker_apply_lora(sql_lora_files): worker = Worker( diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 13e910e74fff7..0191d85194e33 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -7,14 +7,12 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.sampling_params import SamplingParams -from vllm.utils import is_hpu MODELS = [ "facebook/opt-125m", ] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [128]) @@ -50,7 +48,6 @@ def test_metric_counter_prompt_tokens( f"metric: {metric_count!r}") -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [128]) @@ -83,7 +80,6 @@ def test_metric_counter_generation_tokens( f"metric: {metric_count!r}") -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize( diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py index 4cec529a2f5c3..c3e48b56ee58f 100644 --- a/tests/models/test_big_models.py +++ b/tests/models/test_big_models.py @@ -6,7 +6,6 @@ """ import pytest import torch -from vllm.utils import is_hpu from .utils import check_outputs_equal @@ -26,7 +25,6 @@ target_dtype = "half" -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", [target_dtype]) @pytest.mark.parametrize("max_tokens", [32]) @@ -51,7 +49,7 @@ def test_models( name_1="vllm", ) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") + @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", [target_dtype]) def test_model_print( diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index 6c83f711b62af..b4220dc599551 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -4,7 +4,6 @@ from transformers import AutoTokenizer from vllm.config import VisionLanguageConfig -from vllm.utils import is_hpu from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets from .utils import check_outputs_equal diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py index f30384a85ed0d..6acc057fe588c 100644 --- a/tests/models/test_mistral.py +++ b/tests/models/test_mistral.py @@ -3,7 +3,6 @@ Run `pytest tests/models/test_mistral.py`. """ import pytest -from vllm.utils import is_hpu from .utils import check_logprobs_close @@ -13,7 +12,6 @@ ] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) diff --git a/tests/models/test_models.py b/tests/models/test_models.py index 927c5569a9a33..4cd2cb665c8f0 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -6,7 +6,6 @@ Run `pytest tests/models/test_models.py`. 
""" import pytest -from vllm.utils import is_hpu from .utils import check_outputs_equal @@ -24,7 +23,6 @@ ] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [96]) @@ -53,7 +51,6 @@ def test_models( ) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) def test_model_print( diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index f03c657dac4a2..50ab06631500b 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -1,4 +1,3 @@ -import pytest import torch from vllm import LLM, ModelRegistry, SamplingParams diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index 70ef3c2cfcbf0..b63a8d01d6621 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -9,7 +9,7 @@ import pytest from vllm.config import ModelConfig -from vllm.utils import is_hpu + @dataclass class ModelPair: @@ -54,8 +54,7 @@ class ModelPair: @pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES) def test_auto_gptq(model_arg_exptype: Tuple[str, None, str]) -> None: model_path, quantization_arg, expected_type = model_arg_exptype - if is_hpu() and model_path in ('TheBloke/Llama-2-7B-Chat-GPTQ', 'LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit'): - pytest.skip("Skipping test on HPU") + try: model_config = ModelConfig(model_path, model_path, diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index f2ac194be59d4..64f3ce94b7a83 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -4,7 +4,6 @@ """ import pytest -from vllm.utils import is_hpu # FIXME(zhuohan): The test can not pass if we: # 1. Increase max_tokens to 256. 
@@ -15,7 +14,6 @@ MODELS = ["facebook/opt-125m"] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", MAX_TOKENS) diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index 3a66a4a48772f..2979470120710 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -2,12 +2,10 @@ import torch from vllm import SamplingParams -from vllm.utils import is_hpu MODELS = ["facebook/opt-125m"] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_logits_processor_force_generate( diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index f121a809be380..02a953da04659 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -4,14 +4,12 @@ import torch from vllm import SamplingParams -from vllm.utils import is_hpu from ..conftest import VllmRunner MODELS = ["facebook/opt-125m"] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1]) diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py index 9b874722e3cfd..ed2fee1ae252e 100644 --- a/tests/samplers/test_ranks.py +++ b/tests/samplers/test_ranks.py @@ -1,12 +1,10 @@ import pytest from vllm import SamplingParams -from vllm.utils import is_hpu MODELS = ["facebook/opt-125m"] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_ranks( diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 41f095b18c8bd..6dd643bbea2bb 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -7,14 +7,10 @@ from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.model_executor.utils import set_random_seed -from vllm.utils import is_hpu -if is_hpu(): - DEVICES = ["hpu"] -else: - DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) - ] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] def mock_causal_accepted_tensor( @@ -42,7 +38,6 @@ def mock_causal_accepted_tensor( return accepted -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", list(range(10))) @pytest.mark.parametrize( "which_tokens_accepted", @@ -134,11 +129,10 @@ def test_correct_output_format(which_tokens_accepted: str, assert torch.all(output_token_ids[subsequent_mask] == -1) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("k", list(range(1, 6))) @pytest.mark.parametrize("vocab_size", [30_000, 50_000]) @pytest.mark.parametrize("batch_size", list(range(1, 32))) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, device: str): @@ -161,11 +155,10 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, draft_token_ids) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("above_or_below_vocab_range", 
["above", "below"]) @pytest.mark.parametrize("which_token_ids", ["bonus_token_ids", "draft_token_ids"]) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_raises_when_vocab_oob(above_or_below_vocab_range: str, which_token_ids: str, device: str): @@ -210,7 +203,6 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str, draft_token_ids) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("draft_and_target_probs_equal", [True, False]) @pytest.mark.parametrize("seed", list(range(5))) @torch.inference_mode() diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index d64f3c4e6fd13..9572588ce6e53 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -11,7 +11,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_random_seed from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import Counter, is_pin_memory_available, is_hpu +from vllm.utils import Counter, is_pin_memory_available class MockLogitsSampler(Sampler): @@ -37,12 +37,9 @@ def _prepare_test( VOCAB_SIZE = 32000 RANDOM_SEEDS = list(range(128)) -if is_hpu(): - DEVICES = ["hpu"] -else: - DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) - ] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] def _do_sample( @@ -75,7 +72,7 @@ def _do_sample( @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_sampler_all_greedy(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) @@ -92,7 +89,7 @@ def test_sampler_all_greedy(seed: int, device: str): @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_sampler_all_random(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) @@ -114,9 +111,8 @@ def test_sampler_all_random(seed: int, device: str): assert nth_output.output_token == i -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_sampler_all_random_seed(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) @@ -139,9 +135,8 @@ def test_sampler_all_random_seed(seed: int, device: str): assert nth_output.output_token == i -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_sampler_all_random_seed_deterministic(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) @@ -163,7 +158,7 @@ def test_sampler_all_random_seed_deterministic(seed: int, device: str): @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_sampler_all_beam(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) @@ -183,7 +178,7 @@ def test_sampler_all_beam(seed: int, device: str): @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", 
CUDA_DEVICES) def test_sampler_min_tokens_penalty(seed: int, device: str): seq_id_counter = Counter(start=random.randint(0, 100)) set_random_seed(seed) @@ -468,9 +463,8 @@ def run_test_case(*, expected_penalization: List[bool], run_test_case(**test_case) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_sampler_mixed(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) @@ -572,7 +566,7 @@ def test_sampling(): @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_sampler_top_k_top_p(seed: int, device: str): set_random_seed(seed) batch_size = random.randint(1, 256) diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index 6fed73ec7b3ce..88067f19c8f07 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -10,7 +10,6 @@ from vllm import SamplingParams from vllm.model_executor.utils import set_random_seed -from vllm.utils import is_hpu MODEL = "facebook/opt-125m" RANDOM_SEEDS = list(range(5)) @@ -22,7 +21,6 @@ def vllm_model(vllm_runner): yield vllm_model -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", RANDOM_SEEDS) def test_random_sample_with_seed( vllm_model, diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py index c39a143ba3371..81f91c5e10b0d 100644 --- a/tests/spec_decode/e2e/test_compatibility.py +++ b/tests/spec_decode/e2e/test_compatibility.py @@ -1,12 +1,10 @@ import pytest from vllm import SamplingParams -from vllm.utils import is_hpu from .conftest import get_output_from_llm_generator -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -46,7 +44,6 @@ def test_spec_decode_xfail_chunked_prefill(test_llm_generator): sampling_params) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -99,7 +96,6 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator): sampling_params) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("common_llm_kwargs", [{ "model": "JackFram/llama-68m", "speculative_model": "JackFram/llama-68m", diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py index 882cb8dd9dbac..9572aac7df6e0 100644 --- a/tests/spec_decode/e2e/test_logprobs.py +++ b/tests/spec_decode/e2e/test_logprobs.py @@ -4,12 +4,10 @@ import pytest from vllm import SamplingParams -from vllm.utils import is_hpu from .conftest import get_logprobs_from_llm_generator -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -47,7 +45,6 @@ def test_logprobs_equality(baseline_llm_generator, test_llm_generator, force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -89,7 +86,6 @@ def test_diff_num_logprobs(baseline_llm_generator, test_llm_generator, logprob_rank=num_logprobs) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -129,7 +125,6 @@ def test_logprobs_different_k(baseline_llm_generator, test_llm_generator, force_output_len=True) 
-@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -173,7 +168,6 @@ def test_logprobs_when_skip_speculation(baseline_llm_generator, force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index ca9158ec72a08..94cc36f22875a 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -40,13 +40,11 @@ from transformers import AutoTokenizer from vllm import SamplingParams -from vllm.utils import is_hpu from .conftest import (get_output_from_llm_generator, run_greedy_equality_correctness_test) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -118,7 +116,6 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator, assert actual_tokens.strip() == expected_tokens.strip() -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -157,7 +154,6 @@ def test_spec_decode_e2e_with_async_engine(test_llm_generator, force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -212,7 +208,6 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -264,7 +259,6 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -311,7 +305,6 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( force_output_len=False) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -356,7 +349,6 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1( force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -401,7 +393,6 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -449,7 +440,6 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption( force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -503,7 +493,6 @@ def test_spec_decode_different_block_size(baseline_llm_generator, force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -553,7 +542,6 @@ def test_skip_speculation(baseline_llm_generator, test_llm_generator, force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py index 179125891e74d..d475d37af6425 100644 --- a/tests/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/spec_decode/e2e/test_ngram_correctness.py @@ -26,12 +26,9 @@ import pytest -from vllm.utils import is_hpu - from .conftest import 
run_greedy_equality_correctness_test -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -73,7 +70,6 @@ def test_ngram_e2e_greedy_correctness(baseline_llm_generator, force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -123,7 +119,6 @@ def test_ngram_e2e_greedy_correctness_with_preemption(baseline_llm_generator, force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index 0fbd4fefbde7c..42dd90422ec47 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -4,7 +4,6 @@ import torch from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer -from vllm.utils import is_hpu from .utils import create_seq_group_metadata_from_prompts, mock_worker @@ -30,7 +29,6 @@ def test_create_target_seq_id_iterator(num_target_seq_ids: int): assert next(iterator) > max_seq_id -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.skip_global_cleanup def test_get_token_ids_to_score(k: int): diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py index adab972e610f8..2918fabddc900 100644 --- a/tests/spec_decode/test_metrics.py +++ b/tests/spec_decode/test_metrics.py @@ -5,10 +5,8 @@ import torch from vllm.spec_decode.metrics import AsyncMetricsCollector -from vllm.utils import is_hpu -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_initial_call_returns_none(): """Expect first call to get metrics to return None. """ @@ -27,7 +25,6 @@ def test_initial_call_returns_none(): assert maybe_metrics is None -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_second_call_returns_metrics(): """Expect second call to not return None. """ @@ -55,7 +52,6 @@ def test_second_call_returns_metrics(): assert metrics is not None -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("rank", [1, 2, 3, 4]) def test_nonzero_rank_noop(rank): """Verify nonzero ranks don't collect metrics. @@ -76,7 +72,6 @@ def test_nonzero_rank_noop(rank): assert metrics is None -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_noop_until_time(): """Verify metrics aren't collected until enough time passes. """ @@ -110,7 +105,6 @@ def test_noop_until_time(): assert metrics is not None -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("has_data", [True, False]) def test_initial_metrics_has_correct_values(has_data: bool): """Test correctness of metrics data. 
diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index 81c6763ebc64b..7744b2640fe94 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -10,7 +10,6 @@ from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.top1_proposer import Top1Proposer -from vllm.utils import is_hpu from vllm.worker.worker import Worker from .utils import (assert_logprobs_dict_allclose, create_batch, @@ -71,7 +70,6 @@ def test_assert_enough_kv_space(num_steps: int): seq_group_metadata.block_tables = original_block_tables -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() def test_same_output_for_single_step(): """Verify the multi step worker produces the same output as the normal @@ -155,7 +153,6 @@ def test_same_output_for_single_step(): assert_logprobs_dict_allclose(actual_logprobs, expected_logprobs) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() def test_same_output_for_multi_step(): """Verify the multi-step worker produces the same output as the normal @@ -280,7 +277,6 @@ def test_same_output_for_multi_step(): single_step_logprobs) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() def test_draft_proposals_full_speculation_len(): """Verify Top1Proposer correctly handles case where all sequences @@ -334,7 +330,6 @@ def test_draft_proposals_full_speculation_len(): assert proposals.proposal_lens.tolist() == [k for _ in range(batch_size)] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() def test_draft_proposals_no_speculations(): """Verify Top1Proposer correctly handles case where no sequences @@ -373,7 +368,6 @@ def test_draft_proposals_no_speculations(): assert proposals.proposal_lens.tolist() == [0 for _ in range(batch_size)] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() def test_draft_proposals_mixed_k(): """Verify Top1Proposer correctly handles case some sequences can diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py index e7968cf0c7737..b1537884f896e 100644 --- a/tests/spec_decode/test_ngram_worker.py +++ b/tests/spec_decode/test_ngram_worker.py @@ -1,15 +1,12 @@ import torch -import pytest from vllm.sequence import ExecuteModelRequest from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.top1_proposer import Top1Proposer -from vllm.utils import is_hpu from .utils import create_seq_group_metadata_from_prompts, create_worker -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_ngram_algo_correctness_for_single_no_match(): """Verify our ngram algo find the right candidate in the prompt @@ -67,7 +64,6 @@ def test_ngram_algo_correctness_for_single_no_match(): assert proposals.proposal_lens.tolist() == [0] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_ngram_algo_correctness_for_batches_not_match_all(): """Verify our ngram algo find the right candidate in the prompt @@ -146,7 +142,6 @@ def test_ngram_algo_correctness_for_batches_not_match_all(): assert proposals.proposal_token_ids[4][i] == -1 -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_ngram_algo_correctness_for_batches_match_all(): """Verify our ngram algo find the right candidate in the prompt diff --git 
a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 6b17af0b767b8..527e7eddd7e33 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -14,12 +14,11 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker, split_num_cache_blocks_evenly) -from vllm.utils import is_hpu from .test_utils import mock_spec_decode_sampler from .utils import create_batch, create_sampler_output_list, mock_worker -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") + @pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.parametrize('batch_size', [1, 2, 32]) @pytest.mark.parametrize("acceptance_sampler_method", @@ -54,7 +53,6 @@ def test_correctly_calls_draft_model(k: int, batch_size: int, assert actual_execute_model_data == execute_model_req -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.parametrize('batch_size', [1, 2, 32]) @pytest.mark.parametrize("acceptance_sampler_method", @@ -137,7 +135,6 @@ def test_correctly_calls_target_model(k: int, batch_size: int, assert expected_seen_contexts == seen_contexts -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.parametrize('batch_size', [1, 2, 32]) @pytest.mark.parametrize("acceptance_sampler_method", @@ -229,7 +226,6 @@ def test_correctly_calls_spec_decode_sampler(k: int, batch_size: int, assert torch.equal(actual.draft_probs, proposal_probs) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.parametrize('batch_size', [1, 2, 32]) @pytest.mark.parametrize("acceptance_sampler_method", @@ -358,7 +354,6 @@ def test_correctly_formats_output(k: int, batch_size: int, i].output_token -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [1, 2]) @pytest.mark.parametrize('batch_size', [1]) @pytest.mark.parametrize('returns_metrics', [True, False]) @@ -456,7 +451,6 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool, assert args[0] == k or kwargs.get('k', -1) == k -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [0]) @pytest.mark.parametrize('batch_size', [1, 2, 32]) @pytest.mark.parametrize("acceptance_sampler_method", @@ -501,7 +495,6 @@ def test_k_equals_zero(k: int, batch_size: int, target_worker.execute_model.assert_called_once_with(execute_model_req) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [0, 5]) @pytest.mark.parametrize('batch_size', [0]) @pytest.mark.parametrize("acceptance_sampler_method", diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 12c6d1d5c7b3f..c8f86133f41ac 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -20,7 +20,6 @@ open_stream, serialize_vllm_model, tensorize_vllm_model) -from vllm.utils import is_hpu from ..conftest import VllmRunner, cleanup from ..utils import RemoteOpenAIServer @@ -86,8 +85,7 @@ def test_load_with_tensorizer(mock_agent, tensorizer_config): assert result == mock_agent_instance.deserialize.return_value -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") -s@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") +@pytest.mark.skipif(not is_curl_installed(), 
reason="cURL is not installed") def test_can_deserialize_s3(vllm_runner): model_ref = "EleutherAI/pythia-1.4b" tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors" @@ -105,7 +103,6 @@ def test_can_deserialize_s3(vllm_runner): assert deserialized_outputs -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_deserialized_encrypted_vllm_model_has_same_outputs( vllm_runner, tmp_path): @@ -137,7 +134,6 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs( assert outputs == deserialized_outputs -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, tmp_path): with hf_runner(model_ref) as hf_model: @@ -161,7 +157,6 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, assert outputs == deserialized_outputs -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): from huggingface_hub import snapshot_download @@ -198,7 +193,6 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): assert loaded_vllm_model -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_load_without_tensorizer_load_format(vllm_runner): with pytest.raises(ValueError): vllm_runner( @@ -206,7 +200,6 @@ def test_load_without_tensorizer_load_format(vllm_runner): model_loader_extra_config=TensorizerConfig(tensorizer_uri="test")) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): ## Serialize model @@ -244,7 +237,6 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): completion_tokens=5, prompt_tokens=6, total_tokens=11) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_raise_value_error_on_invalid_load_format(vllm_runner): with pytest.raises(ValueError): vllm_runner( diff --git a/tests/test_config.py b/tests/test_config.py index 84ba9eec27969..6c8af9d7966b4 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,6 @@ import pytest + from vllm.config import ModelConfig -from vllm.utils import is_hpu MODEL_IDS_EXPECTED = [ ("Qwen/Qwen1.5-7B", 32768), @@ -25,7 +25,6 @@ def test_disable_sliding_window(model_id_expected): assert model_config.max_model_len == expected -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_get_sliding_window(): TEST_SLIDING_WINDOW = 4096 # Test that the sliding window is correctly computed. 
diff --git a/tests/test_logits_processor.py b/tests/test_logits_processor.py index dfae9c6ef61a5..4ee980505a3ab 100644 --- a/tests/test_logits_processor.py +++ b/tests/test_logits_processor.py @@ -9,7 +9,6 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_random_seed from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import is_hpu from vllm.utils import is_pin_memory_available @@ -43,17 +42,13 @@ def _prepare_test( RANDOM_SEEDS = list(range(128)) -if is_hpu(): - DEVICES = ["hpu"] -else: - DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) - ] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_logits_processors(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index c3bbc110fd69d..12e5ae85adea6 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -7,7 +7,6 @@ from vllm.transformers_utils.detokenizer import (Detokenizer, detokenize_incrementally) from vllm.transformers_utils.tokenizer_group import get_tokenizer_group -from vllm.utils import is_hpu TRUTH = [ "Hello here, this is a simple test", @@ -56,8 +55,6 @@ def _run_incremental_decode(tokenizer, all_input_ids, @pytest.mark.parametrize("skip_special_tokens", (True, False)) def test_decode_streaming(tokenizer_id, truth, with_prompt, skip_special_tokens): - if is_hpu() and tokenizer_id == "meta-llama/Llama-2-7b-hf": - pytest.skip("Skipping test on HPU") tokenizer = AutoTokenizer.from_pretrained(tokenizer_id) if with_prompt: truth_tokens = tokenizer(truth, add_special_tokens=False)["input_ids"] @@ -117,8 +114,6 @@ def detokenizer(tokenizer_name: str) -> Detokenizer: @pytest.fixture(name="complete_sequence_token_ids") def create_complete_sequence_token_ids(complete_sequence: str, tokenizer_name: str) -> List[int]: - if is_hpu() and tokenizer_name == "meta-llama/Llama-2-7b-hf": - pytest.skip("Skipping test on HPU") tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) complete_sequence_token_ids = tokenizer(complete_sequence)["input_ids"] return complete_sequence_token_ids @@ -152,8 +147,6 @@ def test_decode_sequence_logprobs(complete_sequence: str, detokenizer: Detokenizer, skip_special_tokens: bool): """Verify Detokenizer decodes logprobs correctly.""" - if is_hpu() and detokenizer == "meta-llama/Llama-2-7b-hf": - pytest.skip("Skipping test on HPU") sampling_params = SamplingParams(skip_special_tokens=skip_special_tokens, logprobs=2) @@ -190,8 +183,6 @@ def test_decode_prompt_logprobs(complete_sequence: str, detokenizer: Detokenizer, skip_special_tokens: bool): """Verify Detokenizer decodes prompt logprobs correctly.""" - if is_hpu() and detokenizer == "meta-llama/Llama-2-7b-hf": - pytest.skip("Skipping test on HPU") sampling_params = SamplingParams(skip_special_tokens=skip_special_tokens, prompt_logprobs=1) diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index c766c69874755..e1775790c0a03 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -8,7 +8,7 @@ from vllm.engine.arg_utils import EngineArgs from 
vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import get_open_port, is_hpu +from vllm.utils import get_open_port from vllm.worker.model_runner import ModelRunner, _get_graph_batch_size @@ -27,7 +27,7 @@ def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner: ) return model_runner -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") + @pytest.mark.parametrize("batch_size", list(range(1, 257))) def test_prepare_prompt(batch_size): model_runner = _create_model_runner( @@ -142,7 +142,6 @@ def test_prepare_prompt(batch_size): torch.testing.assert_close(actual, expected) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("batch_size", list(range(1, 257))) def test_prepare_decode_cuda_graph(batch_size): model_runner = _create_model_runner( @@ -253,7 +252,6 @@ def test_prepare_decode_cuda_graph(batch_size): torch.testing.assert_close(actual, expected) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_empty_seq_group(): """Verify prepare prompt and decode returns empty output.""" model_runner = _create_model_runner( @@ -298,7 +296,6 @@ def distributed_init(): ensure_model_parallel_initialized(1, 1) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("batch_size", list(range(2, 128))) @pytest.mark.parametrize("enforce_eager", [True, False]) def test_hybrid_batches(batch_size, enforce_eager, distributed_init): diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index b831370fe81ad..d941ffdb5588a 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -1,13 +1,11 @@ import torch -import pytest from vllm.engine.arg_utils import EngineArgs from vllm.sequence import ExecuteModelRequest -from vllm.utils import get_distributed_init_method, get_ip, get_open_port, is_hpu +from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.worker.worker import Worker -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_swap() -> None: # Configure the engine. 
engine_args = EngineArgs(model="facebook/opt-125m", From 85af27e06082a9dd9a3324806f67ca513db2b315 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 13:36:30 +0300 Subject: [PATCH 055/341] cleanup --- vllm/hpu/utils.py | 83 +---------------------------------------------- 1 file changed, 1 insertion(+), 82 deletions(-) diff --git a/vllm/hpu/utils.py b/vllm/hpu/utils.py index 8d7f388cf262a..4ce9e2591c6b9 100644 --- a/vllm/hpu/utils.py +++ b/vllm/hpu/utils.py @@ -15,85 +15,4 @@ def wrapped(*args, **kwargs): del kwargs htorch.core.mark_step() return result - return wrapped - - -def profile_reicpes(recipe_names): - from pathlib import Path - import numpy as np - import matplotlib.pyplot as plt - from sklearn.metrics import ConfusionMatrixDisplay - import tqdm - recipe_names_short = [name.replace('.graph_dumps/HabanaFusedOpLazy_', '') for name in recipe_names] - recipes = [Path(Path.cwd().joinpath(name + '-PostGraph-symbol.pbtxt')).open('r').read() for name in recipe_names] - - def generic_similarity_backend(recipes, similarity_func, backend_name=''): - num_recipes = len(recipes) - sim_tri = np.zeros((num_recipes, num_recipes)) - total = (num_recipes * (num_recipes + 1)) // 2 - num_recipes - backend_txt = f' with {backend_name}' if backend_name != '' else '' - with tqdm.tqdm(total=total, desc=f" computing similarity matrix{backend_txt}") as pbar: - for i in range(num_recipes): - for j in range(i): - sim_tri[i,j] = similarity_func(recipes[i], recipes[j]) - pbar.update(1) - sim = sim_tri.T + sim_tri - sim_idx = np.arange(sim_tri.shape[0]) - sim[sim_idx,sim_idx] = 1 - return sim - - def cosine_similarity_rad_backend(recipes): - from strsimpy.cosine import Cosine - s = Cosine(2) - return generic_similarity_backend(recipes, s.similarity, "Cosine (rad)"), "cosine similarity, 1 = max similarity" - - def cosine_similarity_deg_backend(recipes): - from strsimpy.cosine import Cosine - s = Cosine(2) - rad = generic_similarity_backend(recipes, s.similarity, "cosine similarity") - deg = np.degrees(np.arccos(rad)) - return deg, "cosine similarity (deviation in deg, 0 = max similarity)" - - def overlap_coefficient_backend(recipes): - from strsimpy.overlap_coefficient import OverlapCoefficient - s = OverlapCoefficient(2) - return generic_similarity_backend(recipes, s.similarity, OverlapCoefficient.__name__), OverlapCoefficient.__name__ - - def normalized_levenshtein_backend(recipes): - from strsimpy.normalized_levenshtein import NormalizedLevenshtein - s = NormalizedLevenshtein() - return generic_similarity_backend(recipes, s.similarity, NormalizedLevenshtein.__name__), NormalizedLevenshtein.__name__ - - def jaro_winkler_backend(recipes): - from strsimpy.jaro_winkler import JaroWinkler - s = JaroWinkler() - return generic_similarity_backend(recipes, s.similarity, JaroWinkler.__name__), JaroWinkler.__name__ - - def tfidf_weird_backend(recipes): - def tfidf_single_elem(x,y): - from sklearn.feature_extraction.text import TfidfVectorizer - vect = TfidfVectorizer() - tfidf = vect.fit_transform([x,y]) - sim_sparse = tfidf * tfidf.T - sim = sim_sparse.toarray() - return sim[0,1] - return generic_similarity_backend(recipes, tfidf_single_elem, 'TfidfVectorizer (weird)'), 'TfidfVectorizer (weird)' - - def tfidf_backend(recipes): - from sklearn.feature_extraction.text import TfidfVectorizer - vect = TfidfVectorizer() - tfidf = vect.fit_transform(recipes) - sim_sparse = tfidf * tfidf.T - sim = sim_sparse.toarray() - return sim, 'TfidfVectorizer' - - sim, backend_name = tfidf_backend(recipes) - 
plt.rcParams["figure.figsize"] = [16,16] - plt.rcParams["figure.dpi"] = 300 - cm = ConfusionMatrixDisplay(sim, display_labels=recipe_names_short) - cm.plot(xticks_rotation='vertical', text_kw={"fontsize":5}) - cm.ax_.set_xlabel("Target recipe number") - cm.ax_.set_ylabel("Source recipe number") - plt.title(f'Recipe similarity ({backend_name})') - return plt -# plt.savefig('similarity.png') \ No newline at end of file + return wrapped \ No newline at end of file From f856a85adb003e8fedeb69bf7cd811a0fff12ea0 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 13:38:06 +0300 Subject: [PATCH 056/341] llm engine cleanup --- vllm/engine/llm_engine.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f98fb58592a32..96ac0bdd59012 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -408,6 +408,7 @@ def from_engine_args( else: from vllm.executor.gpu_executor import GPUExecutor executor_class = GPUExecutor + # Create the LLM engine. engine = cls( **engine_config.to_dict(), @@ -838,6 +839,7 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: request_outputs = self._process_model_outputs( output, scheduler_outputs.scheduled_seq_groups, scheduler_outputs.ignored_seq_groups, seq_group_metadata_list) + # Log stats. self.do_log_stats(scheduler_outputs, output) @@ -852,14 +854,6 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: # queued control plane messages, such as add/remove lora adapters. self.model_executor.stop_remote_worker_execution_loop() -# out_prompt = [ro.prompt for ro in request_outputs] -# out_indices = [ro.outputs[-1].index for ro in request_outputs] -# out_text = [f'{ro.outputs[-1].text!r}' for ro in request_outputs] -# for idx, (p, i, t) in enumerate(zip(out_prompt, out_indices, out_text)): -# logger.info(f'\tPROMPT ({idx}): {p}') -# logger.info(f'\tGEN IDX ({idx}): {i}') -# logger.info(f'\tGEN TXT ({idx}): {t}') -# logger.info('') return request_outputs def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None: From b1f8b71e5bda8c71f0f9ebf806c9db36a47ffa13 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 13:39:20 +0300 Subject: [PATCH 057/341] utils.py cleanup --- vllm/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/utils.py b/vllm/utils.py index 4e745ab96bc4c..2fb77a0fc431c 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -517,7 +517,7 @@ def create_kv_caches_with_random( dtype=torch_dtype, device=device) cache_dtype = str(cache_dtype) - if cache_dtype in ["auto", "half", "float16", "torch.float16", "torch.bfloat16", "torch.float32"]: + if cache_dtype in ["auto", "half", "torch.float16", "torch.bfloat16", "torch.float32"]: key_cache.uniform_(-scale, scale) elif cache_dtype == 'fp8': _generate_random_fp8(key_cache, -scale, scale) From fb744547dc8a7d4e4d650516b95d826b68bf3e2d Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 13:54:40 +0300 Subject: [PATCH 058/341] custom ops refactor --- vllm/_custom_ops.py | 74 +++++++++++------------- vllm/hpu/ops.py | 16 ++++- vllm/model_executor/custom_op.py | 6 +- vllm/model_executor/layers/activation.py | 9 +++ 4 files changed, 57 insertions(+), 48 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index e4d07bae6dd11..479ea08e49072 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1,6 +1,5 @@ import contextlib import functools -import importlib from typing import List, 
Optional, Tuple, Type import torch @@ -45,33 +44,26 @@ def wrapper(*args, **kwargs): return wrapper -_ops = torch.ops._C -_cache_ops = torch.ops._C_cache_ops -if importlib.util.find_spec('habana_frameworks') is not None: - from vllm.hpu import ops as vllm_ops - from vllm.hpu import cache_ops as vllm_cache_ops - _ops = vllm_ops - _cache_ops = vllm_cache_ops # activation ops def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - _ops.silu_and_mul(out, x) + torch.ops._C.silu_and_mul(out, x) def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - _ops.gelu_and_mul(out, x) + torch.ops._C.gelu_and_mul(out, x) def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - _ops.gelu_tanh_and_mul(out, x) + torch.ops._C.gelu_tanh_and_mul(out, x) def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: - _ops.gelu_fast(out, x) + torch.ops._C.gelu_fast(out, x) def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: - _ops.gelu_new(out, x) + torch.ops._C.gelu_new(out, x) def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: @@ -99,7 +91,7 @@ def paged_attention_v1( blocksparse_block_size: int = 64, blocksparse_head_sliding_step: int = 0, ) -> None: - _ops.paged_attention_v1( + torch.ops._C.paged_attention_v1( out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, alibi_slopes, kv_cache_dtype, kv_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, @@ -129,7 +121,7 @@ def paged_attention_v2( blocksparse_block_size: int = 64, blocksparse_head_sliding_step: int = 0, ) -> None: - _ops.paged_attention_v2( + torch.ops._C.paged_attention_v2( out, exp_sum, max_logits, tmp_out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, alibi_slopes, kv_cache_dtype, kv_scale, tp_rank, @@ -146,7 +138,7 @@ def rotary_embedding( cos_sin_cache: torch.Tensor, is_neox: bool, ) -> None: - _ops.rotary_embedding(positions, query, key, head_size, + torch.ops._C.rotary_embedding(positions, query, key, head_size, cos_sin_cache, is_neox) @@ -155,7 +147,7 @@ def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor, cos_sin_cache: torch.Tensor, is_neox: bool, rot_dim: int, cos_sin_cache_offsets: torch.Tensor) -> None: - _ops.batched_rotary_embedding(positions, query, key, head_size, + torch.ops._C.batched_rotary_embedding(positions, query, key, head_size, cos_sin_cache, is_neox, rot_dim, cos_sin_cache_offsets) @@ -163,12 +155,12 @@ def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor, # layer norm ops def rms_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, epsilon: float) -> None: - _ops.rms_norm(out, input, weight, epsilon) + torch.ops._C.rms_norm(out, input, weight, epsilon) def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, epsilon: float) -> None: - _ops.fused_add_rms_norm(input, residual, weight, epsilon) + torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon) # quantization ops @@ -176,13 +168,13 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor, def awq_dequantize(qweight: torch.Tensor, scales: torch.Tensor, zeros: torch.Tensor, split_k_iters: int, thx: int, thy: int) -> torch.Tensor: - return _ops.awq_dequantize(qweight, scales, zeros, split_k_iters, + return torch.ops._C.awq_dequantize(qweight, scales, zeros, split_k_iters, thx, thy) def awq_gemm(input: torch.Tensor, qweight: torch.Tensor, qzeros: torch.Tensor, scales: torch.Tensor, split_k_iters: int) -> 
torch.Tensor: - return _ops.awq_gemm(input, qweight, qzeros, scales, split_k_iters) + return torch.ops._C.awq_gemm(input, qweight, qzeros, scales, split_k_iters) # gptq @@ -190,26 +182,26 @@ def gptq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, b_gptq_qzeros: torch.Tensor, b_gptq_scales: torch.Tensor, b_g_idx: torch.Tensor, use_exllama: bool, bit: int) -> torch.Tensor: - return _ops.gptq_gemm(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, + return torch.ops._C.gptq_gemm(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, use_exllama, bit) def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor, bit: int) -> None: - _ops.gptq_shuffle(q_weight, q_perm, bit) + torch.ops._C.gptq_shuffle(q_weight, q_perm, bit) # squeezellm def squeezellm_gemm(vec: torch.Tensor, mat: torch.Tensor, mul: torch.Tensor, lookup_table: torch.Tensor) -> None: - _ops.squeezellm_gemm(vec, mat, mul, lookup_table) + torch.ops._C.squeezellm_gemm(vec, mat, mul, lookup_table) # marlin def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, b_scales: torch.Tensor, workspace: torch.Tensor, size_m: int, size_n: int, size_k: int) -> torch.Tensor: - return _ops.marlin_gemm(a, b_q_weight, b_scales, workspace, size_m, + return torch.ops._C.marlin_gemm(a, b_q_weight, b_scales, workspace, size_m, size_n, size_k) @@ -218,7 +210,7 @@ def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, b_meta: torch.Tensor, b_scales: torch.Tensor, workspace: torch.Tensor, num_bits: int, size_m: int, size_n: int, size_k: int) -> torch.Tensor: - return _ops.gptq_marlin_24_gemm(a, b_q_weight, b_meta, b_scales, + return torch.ops._C.gptq_marlin_24_gemm(a, b_q_weight, b_meta, b_scales, workspace, num_bits, size_m, size_n, size_k) @@ -241,7 +233,7 @@ def cutlass_scaled_mm(a: torch.Tensor, n = b.shape[1] out = torch.empty((m, n), dtype=out_dtype, device=a.device) - _ops.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias) + torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias) return out @@ -251,13 +243,13 @@ def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor, codebooks: torch.Tensor, scales: torch.Tensor, codebook_partition_sizes: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: - return _ops.aqlm_gemm(input, codes, codebooks, scales, + return torch.ops._C.aqlm_gemm(input, codes, codebooks, scales, codebook_partition_sizes, bias) def aqlm_dequant(codes: torch.Tensor, codebooks: torch.Tensor, codebook_partition_sizes: torch.Tensor) -> torch.Tensor: - return _ops.aqlm_dequant(codes, codebooks, + return torch.ops._C.aqlm_dequant(codes, codebooks, codebook_partition_sizes) @@ -265,7 +257,7 @@ def aqlm_dequant(codes: torch.Tensor, codebooks: torch.Tensor, def gptq_marlin_repack(b_q_weight: torch.Tensor, perm: torch.Tensor, size_k: int, size_n: int, num_bits: int) -> torch.Tensor: - return _ops.gptq_marlin_repack(b_q_weight, perm, size_k, size_n, + return torch.ops._C.gptq_marlin_repack(b_q_weight, perm, size_k, size_n, num_bits) @@ -274,7 +266,7 @@ def gptq_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, perm: torch.Tensor, workspace: torch.Tensor, num_bits: int, size_m: int, size_n: int, size_k: int, is_k_full: bool) -> torch.Tensor: - return _ops.gptq_marlin_gemm(a, b_q_weight, b_scales, g_idx, perm, + return torch.ops._C.gptq_marlin_gemm(a, b_q_weight, b_scales, g_idx, perm, workspace, num_bits, size_m, size_n, size_k, is_k_full) @@ -313,9 +305,9 @@ def scaled_fp8_quant( output = torch.empty_like(input, dtype=torch.float8_e4m3fn) if scale is None: scale = torch.zeros(1, device=input.device, 
dtype=torch.float32) - _ops.dynamic_scaled_fp8_quant(output, input, scale) + torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale) else: - _ops.static_scaled_fp8_quant(output, input, scale) + torch.ops._C.static_scaled_fp8_quant(output, input, scale) return output, scale @@ -338,14 +330,14 @@ def scaled_int8_quant( output = torch.empty_like(input, dtype=torch.int8) if scale is not None: # static-per-tensor quantization. - _ops.static_scaled_int8_quant(output, input, scale) + torch.ops._C.static_scaled_int8_quant(output, input, scale) return output, scale # dynamic-per-token quantization. input_scales = torch.empty((input.numel() // input.shape[-1], 1), device=input.device, dtype=torch.float32) - _ops.dynamic_scaled_int8_quant(output, input, input_scales) + torch.ops._C.dynamic_scaled_int8_quant(output, input, input_scales) return output, input_scales @@ -354,7 +346,7 @@ def moe_align_block_size(topk_ids: torch.Tensor, num_experts: int, block_size: int, sorted_token_ids: torch.Tensor, experts_ids: torch.Tensor, num_tokens_post_pad: torch.Tensor) -> None: - _ops.moe_align_block_size(topk_ids, num_experts, block_size, + torch.ops._C.moe_align_block_size(topk_ids, num_experts, block_size, sorted_token_ids, experts_ids, num_tokens_post_pad) @@ -375,7 +367,7 @@ def reshape_and_cache( kv_cache_dtype: str, kv_scale: float, ) -> None: - _cache_ops.reshape_and_cache(key, value, key_cache, + torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype, kv_scale) @@ -388,7 +380,7 @@ def reshape_and_cache_flash( slot_mapping: torch.Tensor, kv_cache_dtype: str, ) -> None: - _cache_ops.reshape_and_cache_flash(key, value, key_cache, + torch.ops._C_cache_ops.reshape_and_cache_flash(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype) @@ -396,19 +388,19 @@ def reshape_and_cache_flash( def copy_blocks(key_caches: List[torch.Tensor], value_caches: List[torch.Tensor], block_mapping: torch.Tensor) -> None: - _cache_ops.copy_blocks(key_caches, value_caches, block_mapping) + torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping) def swap_blocks(src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor) -> None: - _cache_ops.swap_blocks(src, dst, block_mapping) + torch.ops._C_cache_ops.swap_blocks(src, dst, block_mapping) def convert_fp8(output: torch.Tensor, input: torch.Tensor, scale: float = 1.0, kv_dtype: str = "fp8") -> None: - _cache_ops.convert_fp8(output, input, scale, kv_dtype) + torch.ops._C_cache_ops.convert_fp8(output, input, scale, kv_dtype) def get_device_attribute(attribute: int, device: int) -> int: diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 1f2e07bd59ccb..c91f8c6a86afe 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -26,12 +26,24 @@ def silu_and_mul(output, input): output.copy_(silu(x) * y) +def gelu_and_mul(output, input): + raise NotImplementedError("gelu_and_mul is not implemented for HPU backend") + + +def gelu_tanh_and_mul(output, input): + raise NotImplementedError("gelu_tanh_and_mul is not implemented for HPU backend") + + def gelu_new(output, input): - raise NotImplementedError + raise NotImplementedError("gelu_new is not implemented for HPU backend") def gelu_fast(output, input): - raise NotImplementedError + raise NotImplementedError("gelu_fast is not implemented for HPU backend") + + +def gelu_quick(output, input): + raise NotImplementedError("gelu_quick is not implemented for HPU backend") def fetch_from_cache(cache, blocks, permutations): diff --git 
a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 5276ada2a3086..d474490b98797 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -31,9 +31,6 @@ def forward_hip(self, *args, **kwargs): def forward_xpu(self, *args, **kwargs): raise NotImplementedError - def forward_hpu(self, *args, **kwargs): - return self.forward_cuda(*args, **kwargs) - def forward_cpu(self, *args, **kwargs): # By default, we assume that CPU ops are compatible with CUDA ops. return self.forward_cuda(*args, **kwargs) @@ -44,10 +41,9 @@ def forward_tpu(self, *args, **kwargs): # NOTE(woosuk): This is a placeholder for future extensions. return self.forward_native(*args, **kwargs) - def forward_gaudi(self, *args, **kwargs): + def forward_hpu(self, *args, **kwargs): # By default, we assume that Gaudi ops are compatible with the # PyTorch-native implementation. - # NOTE(woosuk): This is a placeholder for future extensions. return self.forward_native(*args, **kwargs) def dispatch_forward(self): diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 5bfdba67b443d..69f889ed1a1b8 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -37,6 +37,15 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: ops.silu_and_mul(out, x) return out + def forward_hpu(self, x: torch.Tensor) -> torch.Tensor: + import vllm.hpu.ops as ops + + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: from vllm._ipex_ops import ipex_ops as ops From aae39b10aa480f5d3b969445c55c192eea8ae610 Mon Sep 17 00:00:00 2001 From: ChenWenbin Date: Mon, 1 Jul 2024 19:16:32 +0800 Subject: [PATCH 059/341] Add alibi support (#69) Signed-off-by: Wenbin Chen --- vllm/attention/backends/abstract.py | 1 + vllm/attention/backends/habana_attn.py | 74 ++++++++++++++------------ vllm/attention/layer.py | 3 +- vllm/hpu/ops.py | 7 ++- vllm/model_executor/models/mpt.py | 3 +- vllm/worker/habana_model_runner.py | 43 +++++++-------- 6 files changed, 67 insertions(+), 64 deletions(-) diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index b2b6e7ac810e3..9024c830c0fcb 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -111,6 +111,7 @@ def __init__( num_kv_heads: Optional[int] = None, alibi_slopes: Optional[List[float]] = None, sliding_window: Optional[int] = None, + max_seq_len : Optional[int] = 4096, ) -> None: raise NotImplementedError diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 017cf9c8933e5..5b31d9fc47ba8 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -136,16 +136,21 @@ def __init__( num_kv_heads: Optional[int] = None, alibi_slopes: Optional[List[float]] = None, sliding_window: Optional[int] = None, + max_seq_len : Optional[int] = 4096, ) -> None: self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads self.sliding_window = sliding_window + self.position_bias = None if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.bfloat16) + self.position_bias = 
_make_alibi_bias(alibi_slopes, + num_kv_heads, + alibi_slopes.dtype, + max_seq_len) self.alibi_slopes = alibi_slopes - assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads @@ -199,13 +204,17 @@ def forward( if kv_cache is None or prefill_meta.block_tables.numel() == 0: # TODO: move this outside of model assert prefill_meta.attn_bias is not None, 'attn_bias must be set before calling model.forward!' + attn_bias = prefill_meta.attn_bias + if self.alibi_slopes is not None: + attn_bias.add_(self.position_bias[:, :, -attn_bias.size(2):, -attn_bias.size(3):]) + query_shape = (batch_size, seq_len, self.num_heads, self.head_size) kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, self.head_size) out = xops.prompt_attention( query.view(query_shape), key.view(kv_shape), value.view(kv_shape), - attn_bias=prefill_meta.attn_bias, + attn_bias=attn_bias, p=0.0, scale=self.scale, ) @@ -236,10 +245,9 @@ def forward( attn_metadata.kv_cache_dtype, self.num_kv_heads, self.scale, - self.alibi_slopes, + self.position_bias, kv_scale ) - # Reshape the output tensor. return output.view(batch_size, seq_len, hidden_size) @@ -248,33 +256,29 @@ def _make_alibi_bias( alibi_slopes: torch.Tensor, num_kv_heads: int, dtype: torch.dtype, - seq_lens: List[int], -) -> LowerTriangularMaskWithTensorBias: - attn_biases = [] - for seq_len in seq_lens: - bias = torch.arange(seq_len, dtype=dtype) - # NOTE(zhuohan): HF uses - # `bias = bias[None, :].repeat(seq_len, 1)` - # here. We find that both biases give the same results, but - # the bias below more accurately follows the original ALiBi - # paper. - # Calculate a matrix where each element represents ith element- jth - # element. - bias = bias[None, :] - bias[:, None] - - padded_len = (seq_len + 7) // 8 * 8 - num_heads = alibi_slopes.shape[0] - bias = torch.empty( - 1, # batch size - num_heads, - seq_len, - padded_len, - device=alibi_slopes.device, - dtype=dtype, - )[:, :, :, :seq_len].copy_(bias) - bias.mul_(alibi_slopes[:, None, None]) - if num_heads != num_kv_heads: - bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads)) - attn_biases.append(LowerTriangularMaskWithTensorBias(bias)) - - return attn_biases + seq_len: int, +) -> torch.Tensor: + bias = torch.arange(seq_len, dtype=dtype) + # NOTE(zhuohan): HF uses + # `bias = bias[None, :].repeat(seq_len, 1)` + # here. We find that both biases give the same results, but + # the bias below more accurately follows the original ALiBi + # paper. + # Calculate a matrix where each element represents ith element- jth + # element. 
+ bias = bias[None, :] - bias[:, None] + + padded_len = (seq_len + 7) // 8 * 8 + num_heads = alibi_slopes.shape[0] + bias = torch.empty( + 1, # batch size + num_heads, + seq_len, + padded_len, + device=alibi_slopes.device, + dtype=dtype, + )[:, :, :, :seq_len].copy_(bias) + bias.mul_(alibi_slopes[:, None, None]) + if num_heads != num_kv_heads: + bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads)) + return bias diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index ee7be26c0876c..ec2616c1ab69a 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -29,12 +29,13 @@ def __init__( num_kv_heads: Optional[int] = None, alibi_slopes: Optional[List[float]] = None, sliding_window: Optional[int] = None, + max_seq_len: Optional[int] = 4096, ) -> None: super().__init__() self.backend = get_attn_backend(torch.get_default_dtype()) impl_cls = self.backend.get_impl_cls() self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, - alibi_slopes, sliding_window) + alibi_slopes, sliding_window, max_seq_len) def forward( self, diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index b66f6709977c8..51dcff74fab5b 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -36,7 +36,7 @@ def fetch_from_cache(cache, blocks, permutations): @hpu_utils.with_mark_steps -def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block_tables, context_lens, block_size, alibi_slopes, kv_cache_dtype=None) -> None: +def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block_tables, context_lens, block_size, alibi_slopes=None, kv_cache_dtype=None) -> None: seq_len = block_tables.size(1) batch_size, query_heads, _ = query.shape _, _, kv_heads, _ = key_cache.shape @@ -55,7 +55,10 @@ def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block mask = mask.unsqueeze(2) attn_weights = [torch.matmul(query, k) for k in keys] - attn_weights = (torch.cat(attn_weights, dim=-1) + attn_weights = torch.cat(attn_weights, dim=-1) + if alibi_slopes is not None: + attn_weights.add_(alibi_slopes[:,:,-attn_weights.size(2):, -attn_weights.size(3):]) + attn_weights = (attn_weights .masked_fill(mask, min_inf) .softmax(dim=-1)) diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 6fa5c5bd3014a..c1bebd6e30106 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -107,7 +107,8 @@ def __init__( self.head_dim, scaling, alibi_slopes=alibi_slopes, - num_kv_heads=self.num_kv_heads) + num_kv_heads=self.num_kv_heads, + max_seq_len=config.max_seq_len) def forward( self, diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 1a9206a314d5c..4571eb631e6d7 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -115,31 +115,24 @@ def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): prefill_metadata = attn_metadata.prefill_metadata if prefill_metadata is None: return attn_metadata - #FIXME: Restore alibi support - #if self.alibi_slopes is None: - if True: - seq_lens_t = prefill_metadata.seq_lens_tensor - len_mask = (torch.arange(0, seq_len, device=device, dtype=torch.int32) - .view(1, seq_len) - .ge(seq_lens_t.unsqueeze(-1)) - .view(batch_size, 1, 1, seq_len)) - causal_mask = torch.triu( - torch.ones((batch_size, 1, seq_len, seq_len), device=device, dtype=torch.bool), - diagonal=1 - ) - mask = causal_mask.logical_or(len_mask) - attn_bias = (torch.zeros_like(mask, dtype=dtype) - .masked_fill_(mask, 
-math.inf)) - #FIXME: Restore sliding window support - #if self.sliding_window is not None: - prefill_metadata = prefill_metadata._replace(attn_bias=attn_bias) - attn_metadata = attn_metadata._replace(prefill_metadata=prefill_metadata) - return attn_metadata - else: - # FIXME: This needs updating... - prefill_meta.attn_bias = _make_alibi_bias( - self.alibi_slopes, self.num_kv_heads, batch_size, - seq_len, query.dtype) + + seq_lens_t = prefill_metadata.seq_lens_tensor + len_mask = (torch.arange(0, seq_len, device=device, dtype=torch.int32) + .view(1, seq_len) + .ge(seq_lens_t.unsqueeze(-1)) + .view(batch_size, 1, 1, seq_len)) + causal_mask = torch.triu( + torch.ones((batch_size, 1, seq_len, seq_len), device=device, dtype=torch.bool), + diagonal=1 + ) + mask = causal_mask.logical_or(len_mask) + attn_bias = (torch.zeros_like(mask, dtype=dtype) + .masked_fill_(mask, -math.inf)) + #FIXME: Restore sliding window support + #if self.sliding_window is not None: + prefill_metadata = prefill_metadata._replace(attn_bias=attn_bias) + attn_metadata = attn_metadata._replace(prefill_metadata=prefill_metadata) + return attn_metadata def forward(self, *args, **kwargs): From 0e63941c1f84eec0ad7d398e54838a1a658fe9ef Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 15:28:29 +0300 Subject: [PATCH 060/341] move xops to ops --- vllm/attention/backends/habana_attn.py | 9 +-- vllm/hpu/ops.py | 91 +++++++++----------------- vllm/hpu/xops.py | 41 ------------ 3 files changed, 34 insertions(+), 107 deletions(-) delete mode 100644 vllm/hpu/xops.py diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 7d5fb5146cc2f..5184a4d9d4c44 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -6,10 +6,7 @@ from typing import Any, Dict, List, Optional, Tuple, Type import torch -import math -import vllm.hpu.xops as xops -from vllm.hpu.attn_bias import (AttentionBias, - LowerTriangularMaskWithTensorBias) +import vllm.hpu.ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata) @@ -107,7 +104,7 @@ def __post_init__(self): # when alibi slopes is used. It is because of the limitation # from xformer API. # will not appear in the __repr__ and __init__ - self.attn_bias: Optional[List[AttentionBias]] = None + self.attn_bias: Optional[List[torch.Tensor]] = None class HabanaAttentionImpl(AttentionImpl): @@ -203,7 +200,7 @@ def forward( assert attn_metadata.attn_bias is not None, 'attn_bias must be set before calling model.forward!' 
query_shape = (batch_size, seq_len, self.num_heads, self.head_size) kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, self.head_size) - out = xops.prompt_attention( + out = ops.prompt_attention( query.view(query_shape), key.view(kv_shape), value.view(kv_shape), diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index c91f8c6a86afe..cecdb7cc67d43 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -10,7 +10,7 @@ import torch.nn.functional as F import habana_frameworks.torch as htorch import habana_frameworks.torch.utils.experimental as htexp -from typing import List, Optional, Tuple +from typing import Optional import vllm.hpu.utils as hpu_utils @@ -26,25 +26,6 @@ def silu_and_mul(output, input): output.copy_(silu(x) * y) -def gelu_and_mul(output, input): - raise NotImplementedError("gelu_and_mul is not implemented for HPU backend") - - -def gelu_tanh_and_mul(output, input): - raise NotImplementedError("gelu_tanh_and_mul is not implemented for HPU backend") - - -def gelu_new(output, input): - raise NotImplementedError("gelu_new is not implemented for HPU backend") - - -def gelu_fast(output, input): - raise NotImplementedError("gelu_fast is not implemented for HPU backend") - - -def gelu_quick(output, input): - raise NotImplementedError("gelu_quick is not implemented for HPU backend") - def fetch_from_cache(cache, blocks, permutations): return [cache.index_select(0, blocks[:, i]).permute(permutations) for i in range(blocks.size(1))] @@ -89,46 +70,6 @@ def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block return attn_weights.squeeze(-2) -def rms_norm(out, hidden_states, weight, eps): - htorch.core.mark_step() - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + eps) - out.copy_(weight * hidden_states.to(input_dtype)) - htorch.core.mark_step() - - -def rotate_neox(x: torch.Tensor) -> torch.Tensor: - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] - return torch.cat((-x2, x1), dim=-1) - - -def rotate_gptj(x: torch.Tensor) -> torch.Tensor: - x1 = x[..., ::2] - x2 = x[..., 1::2] - x = torch.stack((-x2, x1), dim=-1) - return x.flatten(-2) - - -def apply_rope( - q: torch.Tensor, - k: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, - is_neox_style: bool, -) -> Tuple[torch.Tensor, torch.Tensor]: - rotate_fn = rotate_neox if is_neox_style else rotate_gptj - q_embed = (q * cos) + (rotate_fn(q) * sin) - k_embed = (k * cos) + (rotate_fn(k) * sin) - return q_embed, k_embed - - -def awq_gemm(*args): - raise NotImplementedError - - def silu_and_mul_wrapper(x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) @@ -163,3 +104,33 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): final_hidden_states += current_hidden_states_static return final_hidden_states.view(-1, D) + + +@hpu_utils.with_mark_steps +def prompt_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_bias: Optional[torch.Tensor] = None, + p: float = 0.0, + scale: Optional[float] = None, +) -> torch.Tensor: + query = query.transpose(1, 2) + key = key.transpose(1, 2) + value = value.transpose(1, 2) + query_heads = query.size(1) + kv_heads = key.size(1) + if query_heads != kv_heads: + query = query.unflatten(1, (kv_heads, -1)) + key = key.unflatten(1, (kv_heads, 1)) + value = value.unflatten(1, (kv_heads, 1)) + attn_bias = attn_bias.unsqueeze(2) + attn_weights = 
torch.matmul(query * scale, key.transpose(-1, -2)) + if attn_bias is not None: + attn_weights.add_(attn_bias) + attn_weights = torch.softmax(attn_weights, dim=-1) + attn_weights = torch.matmul(attn_weights, value) + if query_heads != kv_heads: + attn_weights = attn_weights.flatten(1, 2) + attn_weights = attn_weights.transpose(1, 2) + return attn_weights diff --git a/vllm/hpu/xops.py b/vllm/hpu/xops.py deleted file mode 100644 index d6404a4872c0d..0000000000000 --- a/vllm/hpu/xops.py +++ /dev/null @@ -1,41 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. -############################################################################### - -import torch -from typing import Optional - -import vllm.hpu.utils - - -@vllm.hpu.utils.with_mark_steps -def prompt_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attn_bias: Optional[torch.Tensor] = None, - p: float = 0.0, - scale: Optional[float] = None, -) -> torch.Tensor: - query = query.transpose(1, 2) - key = key.transpose(1, 2) - value = value.transpose(1, 2) - query_heads = query.size(1) - kv_heads = key.size(1) - if query_heads != kv_heads: - query = query.unflatten(1, (kv_heads, -1)) - key = key.unflatten(1, (kv_heads, 1)) - value = value.unflatten(1, (kv_heads, 1)) - attn_bias = attn_bias.unsqueeze(2) - attn_weights = torch.matmul(query * scale, key.transpose(-1, -2)) - if attn_bias is not None: - attn_weights.add_(attn_bias) - attn_weights = torch.softmax(attn_weights, dim=-1) - attn_weights = torch.matmul(attn_weights, value) - if query_heads != kv_heads: - attn_weights = attn_weights.flatten(1, 2) - attn_weights = attn_weights.transpose(1, 2) - return attn_weights From 0141d5751076a9b0e5040a551b10d3150a79ae59 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 15:38:58 +0300 Subject: [PATCH 061/341] remove vllm/hpu/attn_bias.py --- vllm/hpu/attn_bias.py | 764 ------------------------------------------ 1 file changed, 764 deletions(-) delete mode 100644 vllm/hpu/attn_bias.py diff --git a/vllm/hpu/attn_bias.py b/vllm/hpu/attn_bias.py deleted file mode 100644 index ff508a59cc56a..0000000000000 --- a/vllm/hpu/attn_bias.py +++ /dev/null @@ -1,764 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. - - -import math -from dataclasses import dataclass -from typing import Any, Iterable, List, Optional, Sequence, Tuple, Union - -import torch - - -class AttentionBias: - """Base class for a custom bias that can be applied \ - as the attn_bias argument in - :attr:`xformers.ops.memory_efficient_attention`. - - That function has the ability to add a tensor, the - attention bias, to the QK^T matrix before it is used - in the softmax part of the attention calculation. - The attention bias tensor with shape - (B or 1, n_queries, number of keys) - can be given as the attn_bias input. - The most common use case is for an attention bias is - to contain only zeros and negative infinities, which forms - a mask so that some queries only attend to some keys. - - Children of this class define alternative things which can - be used as the attn_bias input to define an attention bias which - forms such a mask, for some common cases. 
- - When using an :attr:`xformers.ops.AttentionBias` - instead of a :attr:`torch.Tensor`, the mask matrix does - not need to be materialized, and can be - hardcoded into some kernels for better performance. - - See: - - - :attr:`xformers.ops.fmha.attn_bias.LowerTriangularMask` - - :attr:`xformers.ops.fmha.attn_bias.LowerTriangularMaskWithTensorBias` - - :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask` - - :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask` - - """ - - def materialize( - self, - shape: Tuple[int, ...], - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - ) -> torch.Tensor: - """ - Materializes the bias as a `torch.Tensor`. This is very slow - and we don't attempt to make it fast. Only use for debugging/testing. - - Shape should be like `[*, q_seqlen, k_seqlen]` - """ - raise NotImplementedError() - - -class LowerTriangularMask(AttentionBias): - """ - A lower-triangular (aka causal) mask - - A query Q cannot attend to a key which is farther from the - initial key than Q is from the initial query. - """ - - def __init__(self, *tensor_args, **tensor_kwargs) -> None: - # NOTE: Unused arguments, we keep them for backward compatibility - super().__init__() - - def materialize( - self, - shape: Tuple[int, ...], - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - ) -> torch.Tensor: - create_as = dtype if dtype is not torch.bfloat16 else torch.float32 - tensor = torch.full( # type: ignore - shape, - dtype=create_as, - fill_value=float("-inf"), - device=device, - ) - return torch.triu(tensor, diagonal=1).to(dtype) # type: ignore - - def add_bias(self, bias: torch.Tensor) -> "LowerTriangularMaskWithTensorBias": - return LowerTriangularMaskWithTensorBias(bias) - - -class LowerTriangularMaskWithTensorBias(LowerTriangularMask): - """A lower-triangular (aka causal) mask with an additive bias""" - - def __init__(self, bias: torch.Tensor) -> None: - self._bias = bias - - def materialize( - self, - shape: Tuple[int, ...], - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - ) -> torch.Tensor: - return super().materialize(shape, dtype=dtype, device=device) + self._bias - - -@dataclass -class _SeqLenInfo: - """ - (Internal) Represents the division of a dimension into blocks. - - For example, to represents a dimension of length 7 divided into - three blocks of lengths 2, 3 and 2, use `from_seqlength([2, 3, 2])`. 
- The members will be: - max_seqlen: 3 - min_seqlen: 2 - seqstart_py: [0, 2, 5, 7] - seqstart: torch.IntTensor([0, 2, 5, 7]) - """ - - seqstart: torch.Tensor - max_seqlen: int - min_seqlen: int - seqstart_py: List[int] - - def to(self, device: torch.device) -> None: - self.seqstart = self.seqstart.to(device, non_blocking=True) - - def intervals(self) -> Iterable[Tuple[int, int]]: - yield from zip(self.seqstart_py, self.seqstart_py[1:]) - - @classmethod - def from_seqlens(cls, seqlens: Iterable[int]) -> "_SeqLenInfo": - """ - Input tensors are assumed to be in shape [B, M, *] - """ - assert not isinstance(seqlens, torch.Tensor) - seqstart_py = [0] - max_seqlen = -1 - min_seqlen = -1 - for seqlen in seqlens: - min_seqlen = min(min_seqlen, seqlen) if min_seqlen != -1 else seqlen - max_seqlen = max(max_seqlen, seqlen) - seqstart_py.append(seqstart_py[len(seqstart_py) - 1] + seqlen) - seqstart = torch.tensor(seqstart_py, dtype=torch.int32) - return cls( - max_seqlen=max_seqlen, - min_seqlen=min_seqlen, - seqstart=seqstart, - seqstart_py=seqstart_py, - ) - - def split( - self, x: torch.Tensor, batch_sizes: Optional[Sequence[int]] = None - ) -> List[torch.Tensor]: - if self.seqstart_py[-1] != x.shape[1] or x.shape[0] != 1: - raise ValueError( - f"Invalid `torch.Tensor` of shape {x.shape}, expected format " - f"(B, M, *) with B=1 and M={self.seqstart_py[-1]}\n" - f" seqstart: {self.seqstart_py}" - ) - if batch_sizes is None: - batch_sizes = [1] * (len(self.seqstart_py) - 1) - split_chunks = [] - it = 0 - for batch_size in batch_sizes: - split_chunks.append( - self.seqstart_py[it + batch_size] - self.seqstart_py[it] - ) - it += batch_size - return [ - tensor.reshape([bs, -1, *tensor.shape[2:]]) - for bs, tensor in zip(batch_sizes, x.split(split_chunks, dim=1)) - ] - - -@dataclass -class _PaddedSeqLenInfo(_SeqLenInfo): - """ - (Internal) Represents the division of a dimension into blocks which are - padded out to the same total length. - - For example, to represent a dimension of length 12 with space for - three blocks of length 4, but where the occupied lengths are - 2, 3 and 2, use `from_seqlens_padded([2, 3, 2], 4)`. 
- - The layout along the dimension is - - 0 ─► block 0 - block 0 - - - 4 ─► block 1 - block 1 - block 1 - - 8 ─► block 2 - block 2 - - - 12 ─► - - The members will be: - max_seqlen: 3 - min_seqlen: 2 - seqstart_py: [0, 4, 8, 12] - seqstart: torch.IntTensor([0, 4, 8, 12]) - seqlen_py: [2, 3, 2] - seqlen: torch.IntTensor([2, 3, 2]) - padding: 4 - """ - - seqlen: torch.Tensor - seqlen_py: Sequence[int] - padding: int - # From parent: seqstart[i] contains the start position - # of the i-th sequence - # seqstart: torch.Tensor - - def __post_init__(self) -> None: - assert len(self.seqstart_py) == len(self.seqlen_py) + 1 - - def to(self, device: torch.device) -> None: - self.seqlen = self.seqlen.to(device, non_blocking=True) - super().to(device) - - def intervals(self) -> Iterable[Tuple[int, int]]: - for (start, _), length in zip(super().intervals(), self.seqlen_py): - yield start, start + length - - @classmethod - def from_seqlens(cls, seqlens: Iterable[int]) -> "_SeqLenInfo": - raise RuntimeError( - "Use either `_SeqLenInfo.from_seqlens` or `_PaddedSeqLenInfo.from_seqlens_padded`" - ) - - @classmethod - def from_seqlens_padded( - cls, seqlens: Sequence[int], padding: int - ) -> "_PaddedSeqLenInfo": - """ - Input tensors are assumed to be in shape [B, M, *] - seqstart = padding * torch.arange(batch_size) - """ - assert not isinstance(seqlens, torch.Tensor) - assert all(seqlen <= padding for seqlen in seqlens) - seqstart_py = list(range(0, len(seqlens) * padding + 1, padding)) - return cls( - seqlen=torch.tensor(seqlens, dtype=torch.int32), - seqlen_py=seqlens, - max_seqlen=max(seqlens), - min_seqlen=min(seqlens), - seqstart=torch.tensor(seqstart_py, dtype=torch.int32), - seqstart_py=seqstart_py, - padding=padding, - ) - - def split( - self, x: torch.Tensor, batch_sizes: Optional[Sequence[int]] = None - ) -> List[torch.Tensor]: - raise NotImplementedError("_PaddedSeqLenInfo.split") - - -@dataclass -class BlockDiagonalMask(AttentionBias): - """ - A block-diagonal mask that can be passed as ``attn_bias`` - argument to :attr:`xformers.ops.memory_efficient_attention`. - - Queries and Keys are each divided into the same number of blocks. - Queries in block i only attend to keys in block i. - - .. figure:: /_static/block_diag_bias.png - - This bias can be used to handle a batch of sequences of - different lengths, via :attr:`BlockDiagonalMask.from_tensor_list` - - :Example: - - .. 
code-block:: python - - import torch - from xformers.ops import fmha - - K = 16 - dtype = torch.float16 - device = "cuda" - list_x = [ - torch.randn([1, 3, 1, K], dtype=dtype, device=device), - torch.randn([1, 6, 1, K], dtype=dtype, device=device), - torch.randn([1, 2, 1, K], dtype=dtype, device=device), - ] - attn_bias, x = fmha.BlockDiagonalMask.from_tensor_list(list_x) - linear = torch.nn.Linear(K, K * 3).to(device=device, dtype=dtype) - - q, k, v = linear(x).reshape([1, -1, 1, 3, K]).unbind(-2) - out = fmha.memory_efficient_attention(q, k, v, attn_bias=attn_bias) - list_out = attn_bias.split(out) - print(list_out[0].shape) # [1, 3, 1, K] - assert tuple(list_out[0].shape) == (1, 3, 1, K) - - """ - - q_seqinfo: _SeqLenInfo - k_seqinfo: _SeqLenInfo - _batch_sizes: Optional[Sequence[int]] = None - - def _create_block_mask( - self, - shape: Tuple[int, ...], - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - ) -> torch.Tensor: - return torch.zeros( - shape, - dtype=dtype, - device=device, - ) - - def materialize( - self, - shape: Optional[Tuple[int, ...]] = None, - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - ) -> torch.Tensor: - """Materialize the attention bias - for debugging & testing""" - if shape is None: - shape = (self.q_seqinfo.seqstart_py[-1], - self.k_seqinfo.seqstart_py[-1]) - assert shape[-1] == self.k_seqinfo.seqstart_py[-1], ( - shape[-1], - self.k_seqinfo.seqstart_py[-1], - ) - assert shape[-2] == self.q_seqinfo.seqstart_py[-1], ( - shape[-2], - self.q_seqinfo.seqstart_py[-1], - ) - mask = torch.empty(shape[-2:], dtype=dtype, device=device) - mask.fill_(-math.inf) - for i, ((q_start, q_end), (k_start, k_end)) in enumerate( - zip( - self.q_seqinfo.intervals(), - self.k_seqinfo.intervals(), - ) - ): - mask[q_start:q_end, k_start:k_end] = self._create_block_mask( - (q_end - q_start, k_end - k_start), - dtype=dtype, - device=device, - ) - for _ in range(len(shape) - 2): - mask = mask.unsqueeze(0) - return mask.expand(shape) - - @classmethod - def from_seqlens( - cls, - q_seqlen: Sequence[int], - kv_seqlen: Optional[Sequence[int]] = None, - ) -> "BlockDiagonalMask": - """Creates a :attr:`BlockDiagonalMask` from a list of tensors lengths for query and key/value. - - Args: - q_seqlen (Union[Sequence[int], torch.Tensor]): List or tensor of sequence lengths for query tensors - kv_seqlen (Union[Sequence[int], torch.Tensor], optional): List or tensor of sequence lengths for key/value. - (Defaults to ``q_seqlen``.) - Returns: - BlockDiagonalMask - """ - assert kv_seqlen is None or len(q_seqlen) == len(kv_seqlen) - q_seqinfo = _SeqLenInfo.from_seqlens(q_seqlen) - if kv_seqlen is None or q_seqlen == kv_seqlen: - k_seqinfo = q_seqinfo - else: - k_seqinfo = _SeqLenInfo.from_seqlens(kv_seqlen) - return cls(q_seqinfo=q_seqinfo, k_seqinfo=k_seqinfo) - - @classmethod - def from_tensor_list( - cls, - tensors: Sequence[torch.Tensor], - ) -> Tuple["BlockDiagonalMask", torch.Tensor]: - """Creates a :attr:`BlockDiagonalMask` from a list of tensors, and returns the tensors - concatenated on the sequence length dimension - - .. figure:: /_static/block_diag_cat_split.png - - See also :attr:`BlockDiagonalMask.split` to split the returned - :attr:`torch.Tensor` back to a list of tensors of varying sequence length - - Args: - tensors (Sequence[torch.Tensor]): A list of tensors of shape ``[B, M_i, *]``. - All tensors should have the same dimension and the same batch size ``B``, but - they can have different sequence length ``M``. 
- - Returns: - Tuple[BlockDiagonalMask, torch.Tensor]: The corresponding bias for the attention - along with `tensors` concatenated on the sequence length dimension, with shape ``[1, sum_i{M_i}, *]`` - """ - batch_sizes = [tensor.shape[0] for tensor in tensors] - seqlens = [] - for x in tensors: - for _ in range(x.shape[0]): - seqlens.append(x.shape[1]) - block_diag = cls.from_seqlens(seqlens) - block_diag._batch_sizes = batch_sizes - tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in tensors) - concat_tensors = torch.cat(tensors_bs1, dim=1) - return block_diag, concat_tensors - - @classmethod - def from_tensor_lists_qkv( - cls, - tensors_q: Sequence[torch.Tensor], - tensors_k: Sequence[torch.Tensor], - tensors_v: Optional[Sequence[torch.Tensor]] = None, - ) -> Tuple["BlockDiagonalMask", torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: - assert len(tensors_q) == len(tensors_k) - assert tensors_v is None or len(tensors_v) == len(tensors_q) - batch_sizes = [tensor.shape[0] for tensor in tensors_q] - q_seqlens, kv_seqlens = [], [] - for i, (q, k) in enumerate(zip(tensors_q, tensors_k)): - assert q.shape[0] == k.shape[0] - q_seqlens += [q.shape[1]] * q.shape[0] - kv_seqlens += [k.shape[1]] * k.shape[0] - assert tensors_v is None or tensors_v[i].shape[:2] == k.shape[:2] - block_diag = cls.from_seqlens(q_seqlens, kv_seqlens) - block_diag._batch_sizes = batch_sizes - return ( - block_diag, - torch.cat([x.reshape([1, -1, *x.shape[2:]]) for x in tensors_q], dim=1), - torch.cat([x.reshape([1, -1, *x.shape[2:]]) for x in tensors_k], dim=1), - torch.cat([x.reshape([1, -1, *x.shape[2:]]) for x in tensors_v], dim=1) - if tensors_v is not None - else None, - ) - - def split_queries(self, tensor: torch.Tensor) -> Sequence[torch.Tensor]: - return self.q_seqinfo.split(tensor, self._batch_sizes) - - def split_kv(self, tensor: torch.Tensor) -> Sequence[torch.Tensor]: - return self.k_seqinfo.split(tensor, self._batch_sizes) - - def split(self, tensor: torch.Tensor) -> Sequence[torch.Tensor]: - """The inverse operation of :attr:`BlockDiagonalCausalMask.from_tensor_list` - - Args: - tensor (torch.Tensor): Tensor of tokens of shape ``[1, sum_i{M_i}, *]`` - - Returns: - Sequence[torch.Tensor]: A list of tokens with possibly different sequence lengths - """ - assert self.q_seqinfo is self.k_seqinfo - return self.q_seqinfo.split(tensor, self._batch_sizes) - - def make_causal(self) -> "BlockDiagonalCausalMask": - """Makes each block causal""" - return BlockDiagonalCausalMask( - q_seqinfo=self.q_seqinfo, - k_seqinfo=self.k_seqinfo, - _batch_sizes=self._batch_sizes, - ) - - def make_causal_from_bottomright(self) -> "BlockDiagonalCausalFromBottomRightMask": - """Makes each block causal with a possible non-causal prefix""" - return BlockDiagonalCausalFromBottomRightMask( - q_seqinfo=self.q_seqinfo, - k_seqinfo=self.k_seqinfo, - _batch_sizes=self._batch_sizes, - ) - - def make_local_attention( - self, window_size: int - ) -> "BlockDiagonalCausalLocalAttentionMask": - """Experimental: Makes each block causal with local attention""" - return BlockDiagonalCausalLocalAttentionMask( - q_seqinfo=self.q_seqinfo, - k_seqinfo=self.k_seqinfo, - _batch_sizes=self._batch_sizes, - _window_size=window_size, - ) - - def make_local_attention_from_bottomright( - self, window_size: int - ) -> "BlockDiagonalCausalLocalAttentionFromBottomRightMask": - """Experimental: Makes each block causal with local attention, start from bottom right""" - return BlockDiagonalCausalLocalAttentionFromBottomRightMask( - 
q_seqinfo=self.q_seqinfo, - k_seqinfo=self.k_seqinfo, - _batch_sizes=self._batch_sizes, - _window_size=window_size, - ) - - -@dataclass -class BlockDiagonalCausalMask(BlockDiagonalMask): - """ - Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask`, except that each block is causal. - - Queries and Keys are each divided into the same number of blocks. - A query Q in block i cannot attend to a key which is not in block i, - nor one which is farther from the initial key in block i than Q - is from the initial query in block i. - """ - - def _create_block_mask( - self, - shape: Tuple[int, ...], - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - ) -> torch.Tensor: - return LowerTriangularMask().materialize( - shape, - dtype=dtype, - device=device, - ) - - -@dataclass -class BlockDiagonalCausalFromBottomRightMask(BlockDiagonalMask): - """ - Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask`, except that each block is causal. - This mask allows for a non-causal prefix - NOTE: Each block should have `num_keys >= num_queries` otherwise the forward pass is not - defined (softmax of vector of `-inf` in the attention) - - Queries and keys are each divided into the same number of blocks. - A query Q in block i cannot attend to a key which is not in block i, - nor one which nearer the final key in block i than Q is to the - final query in block i. - """ - - def __post_init__(self) -> None: - for i, ((q_start, q_end), (k_start, k_end)) in enumerate( - zip( - self.q_seqinfo.intervals(), - self.k_seqinfo.intervals(), - ) - ): - num_queries = q_end - q_start - num_keys = k_end - k_start - if num_keys < num_queries: - raise ValueError( - f"Block #{i} has num_keys={num_keys} and num_queries={num_queries}." - " Expected `num_keys >= num_queries`" - ) - - def _create_block_mask( - self, - shape: Tuple[int, ...], - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - ) -> torch.Tensor: - create_as = dtype if dtype is not torch.bfloat16 else torch.float32 - tensor = torch.full( # type: ignore - shape, - dtype=create_as, - fill_value=float("-inf"), - device=device, - ) - num_queries, num_keys = shape[-2:] - return torch.triu(tensor, diagonal=num_keys - num_queries + 1).to(dtype) # type: ignore - - -@dataclass -class BlockDiagonalCausalWithOffsetPaddedKeysMask(AttentionBias): - """ - Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask`, - except an offset on causality is allowed for each block and we support padding for k/v - - The keys and values are divided into blocks which are padded out to - the same total length. - For example, if there is space for 12 keys, for three blocks of - max length 4, but we only want to use the first 2, 3 and 2 - of each block, use `kv_padding=4` and `kv_seqlens=[2, 3, 2]`. - The queries are divided into blocks, without padding, of lengths given by - q_seqlen. - - A query Q in block i cannot attend to a key which is not in block i, - nor one which is not in use (i.e. in the padded area), - nor one which is nearer to the final key in block i - than Q is to the final query in block i. - """ - - q_seqinfo: _SeqLenInfo - k_seqinfo: _PaddedSeqLenInfo - causal_diagonal: Any = None # unused. Exists for BC only. 
- - def _create_block_mask( - self, - shape: Tuple[int, ...], - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - ) -> torch.Tensor: - create_as = dtype if dtype is not torch.bfloat16 else torch.float32 - tensor = torch.full( # type: ignore - shape, - dtype=create_as, - fill_value=float("-inf"), - device=device, - ) - num_queries, num_keys = shape[-2:] - return torch.triu(tensor, diagonal=1 + num_keys - num_queries).to(dtype) # type: ignore - - def materialize( - self, - shape: Tuple[int, ...], - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - ) -> torch.Tensor: - """Materialize the attention bias - for debugging & testing""" - if shape[-1] != self.k_seqinfo.seqstart_py[-1]: - raise ValueError("k shapes wrong") - if shape[-2] != self.q_seqinfo.seqstart_py[-1]: - raise ValueError("q shapes wrong") - mask = torch.empty(shape[-2:], dtype=dtype, device=device) - mask.fill_(-math.inf) - for i, ((q_start, q_end), (k_start, k_end)) in enumerate( - zip( - self.q_seqinfo.intervals(), - self.k_seqinfo.intervals(), - ) - ): - mask[q_start:q_end, k_start:k_end] = self._create_block_mask( - (q_end - q_start, k_end - k_start), - dtype=dtype, - device=device, - ) - for _ in range(len(shape) - 2): - mask = mask.unsqueeze(0) - return mask.expand(shape) - - @classmethod - def from_seqlens( - cls, - q_seqlen: Sequence[int], - kv_padding: int, - kv_seqlen: Sequence[int], - causal_diagonal: Any = None, - ) -> "BlockDiagonalCausalWithOffsetPaddedKeysMask": - """Creates a :attr:`BlockDiagonalCausalWithOffsetPaddedKeysMask` from a list of tensor - lengths for query and key/value. - - Args: - q_seqlen (Sequence[int]): List or tensor of sequence lengths for query tensors - kv_padding (int): Padding for k/v - also an upperbound on each individual key length - kv_seqlen (Sequence[int]): List or tensor of sequence lengths for key/value. - causal_diagonal: unused, for BC only - Returns: - BlockDiagonalCausalWithOffsetPaddedKeysMask - """ - assert kv_seqlen is None or len(q_seqlen) == len(kv_seqlen), ( - q_seqlen, - kv_seqlen, - ) - q_seqinfo = _SeqLenInfo.from_seqlens(q_seqlen) - k_seqinfo = _PaddedSeqLenInfo.from_seqlens_padded(kv_seqlen, kv_padding) - return cls(q_seqinfo=q_seqinfo, k_seqinfo=k_seqinfo) - - -@dataclass -class BlockDiagonalCausalLocalAttentionMask(BlockDiagonalCausalMask): - """ - (Experimental feature) - Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask`. - This makes the mask "local" and the attention pattern banded. - - Query i only attends to keys in its block and cannot attend keys further than "window_size" - from it. - """ - - _window_size: int = 0 # forced due to inheritance and default arguments - - def __post_init__(self): - if self._window_size <= 0: - raise ValueError( - f"Expected `window_size > 0`, but window_size={self._window_size}" - ) - q_seqlen = [ - y - x - for x, y in zip( - self.q_seqinfo.seqstart_py[:-1], self.q_seqinfo.seqstart_py[1:] - ) - ] - kv_seqlen = [ - y - x - for x, y in zip( - self.k_seqinfo.seqstart_py[:-1], self.k_seqinfo.seqstart_py[1:] - ) - ] - for q, k in zip(q_seqlen, kv_seqlen): - if q - self._window_size >= k: - # Each query only attends to keys no further than window_size back. - # When q > k + window_size, there will be a query for which the window doesn't reach any key. 
- raise RuntimeError( - f"No keys are attended in q_seqlen {q} k_seqlen {k} with sliding window {self._window_size}" - ) - - def _create_block_mask( - self, - shape: Tuple[int, ...], - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - ) -> torch.Tensor: - create_as = dtype if dtype is not torch.bfloat16 else torch.float32 - tensor = torch.full( # type: ignore - shape, - dtype=create_as, - fill_value=1, - device=device, - ) - - num_queries, num_keys = shape[-2:] - mask = torch.tril(tensor, diagonal=0).to(dtype) # type: ignore - if self._window_size is not None and self._window_size > 0: - mask = torch.triu(mask, diagonal=-self._window_size + 1) - mask = torch.log(mask) - return mask.to(dtype) - - -@dataclass -class BlockDiagonalCausalLocalAttentionFromBottomRightMask( - BlockDiagonalCausalFromBottomRightMask -): - """ - (Experimental feature) - Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask`. - This makes the mask "local" and the attention pattern banded. - - Query i only attends to keys in its block and cannot attend keys further than "window_size" - from it. - """ - - _window_size: int = 0 # forced due to inheritance and default arguments - - def __post_init__(self): - super().__post_init__() - if self._window_size <= 0: - raise ValueError( - f"Expected `window_size > 0`, but window_size={self._window_size}" - ) - - def _create_block_mask( - self, - shape: Tuple[int, ...], - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - ) -> torch.Tensor: - create_as = dtype if dtype is not torch.bfloat16 else torch.float32 - tensor = torch.full( # type: ignore - shape, - dtype=create_as, - fill_value=1, - device=device, - ) - num_queries, num_keys = shape[-2:] - mask = torch.tril(tensor, diagonal=num_keys - num_queries).to(dtype) # type: ignore - if self._window_size is not None: - mask = torch.triu( - mask, diagonal=num_keys - num_queries - self._window_size + 1 - ) - mask = torch.log(mask) - return mask.to(dtype) From 90f900cbbd41e7f880d9c6ac4423fd4379ff1c31 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 15:21:50 +0200 Subject: [PATCH 062/341] Remove allgather workaround in logits_processor (#76) --- vllm/model_executor/layers/logits_processor.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 3951619c6e3ec..57b6c7f907ae2 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -4,9 +4,9 @@ import torch import torch.nn as nn -from vllm.distributed import tensor_model_parallel_gather, tensor_model_parallel_all_gather +from vllm.distributed import tensor_model_parallel_gather from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.utils import is_hpu + class LogitsProcessor(nn.Module): """Process logits and apply logits processors from sampling metadata. @@ -50,9 +50,7 @@ def forward( # Get the logits for the next tokens. logits = self._get_logits(hidden_states, embedding, embedding_bias) - # NOTE(kzawora): allgather on HPU will cause logits to be not None, - # and we need to guard against applying logits processors on non-driver worker - if logits is not None and sampling_metadata.seq_groups is not None: + if logits is not None: logits *= self.scale # Apply logits processors (if any). 
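Note (illustrative, not part of the patch): the simplified guard above assumes the usual contract of tensor_model_parallel_gather, i.e. only the destination (driver) rank receives the gathered tensor and every other rank gets None, so no extra seq_groups check is needed on non-driver workers. A minimal sketch of that assumption, using a hypothetical stand-in for the gather call:

import torch

def gather_to_driver(logits: torch.Tensor, is_driver: bool):
    # Hypothetical stand-in for tensor_model_parallel_gather:
    # the driver rank receives the full tensor, all other ranks get None.
    return logits if is_driver else None

logits = gather_to_driver(torch.randn(4, 32000), is_driver=False)
if logits is not None:
    # Scaling and logits processors run only on the driver rank;
    # non-driver ranks fall through without touching logits.
    logits = logits * 1.0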
@@ -66,9 +64,7 @@ def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor, logits = torch.matmul(hidden_states, embedding.t()) if embedding_bias is not None: logits += embedding_bias - # NOTE(kzawora): HPU PT bridge is missing support for single-rank gather. We'll use all-gather on Gaudi for now. - gather_op = tensor_model_parallel_all_gather if is_hpu() else tensor_model_parallel_gather - logits = gather_op(logits) + logits = tensor_model_parallel_gather(logits) # Remove paddings in vocab (if any). if logits is not None: logits = logits[:, :self.org_vocab_size] From a21fe62dcb3cbdf8aba8758687c5cdcd209f6b0e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 16:28:07 +0300 Subject: [PATCH 063/341] whitespace fix --- vllm/model_executor/layers/logits_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 7028936c54b03..39d142b158445 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -53,7 +53,7 @@ def forward( # Get the logits for the next tokens. logits = self._get_logits(hidden_states, embedding, embedding_bias) - + if logits is not None: if self.soft_cap is not None: logits = logits / self.soft_cap From aaf544633b8d7c9d8dc48ee9c70afa8e726d71df Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 16:31:40 +0300 Subject: [PATCH 064/341] revert accidental changes in rmsnorm --- vllm/model_executor/layers/layernorm.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index f992dfc64fa80..d0d1577b26a10 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -116,13 +116,6 @@ def forward_xpu( from vllm._ipex_ops import ipex_ops as ops if residual is not None: - if x.device.type == "hpu" and FusedRMSNorm: - orig_dtype = x.dtype - orig_shape = x.shape - residual += x.view(residual.shape) - # Note: FusedRMSNorm requires 3D tensors as inputs - x = FusedRMSNorm.apply(residual.float(), self.weight.float(), self.variance_epsilon) - return x.to(orig_dtype).view(orig_shape), residual ops.fused_add_rms_norm( x, residual, @@ -130,10 +123,6 @@ def forward_xpu( self.variance_epsilon, ) return x, residual - if x.device.type == "hpu" and FusedRMSNorm: - orig_dtype = x.dtype - x = FusedRMSNorm.apply(x.float(), self.weight.float(), self.variance_epsilon) - return x.to(orig_dtype) out = torch.empty_like(x) ops.rms_norm( out, @@ -143,7 +132,6 @@ def forward_xpu( ) return out - def extra_repr(self) -> str: s = f"hidden_size={self.weight.data.size(0)}" s += f", eps={self.variance_epsilon}" From 1ec95c405cbf7eee1f6dcb63cf0b55cf0afb5486 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 17:20:37 +0300 Subject: [PATCH 065/341] Fix hpugraph hashing --- vllm/attention/backends/habana_attn.py | 6 +++--- vllm/worker/habana_model_runner.py | 11 +++++------ 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 661bed749679d..3c8aebad976b7 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -196,9 +196,9 @@ def forward( value_cache, attn_metadata.slot_mapping, self.kv_cache_dtype, - attn_metadata.num_prefills > 0) + attn_metadata.is_prompt) - if attn_metadata.num_prefills > 0: + if 
attn_metadata.is_prompt: # Prompt run. if kv_cache is None or attn_metadata.block_tables.numel() == 0: # TODO: move this outside of model @@ -233,7 +233,7 @@ def forward( attn_metadata.max_query_len, self.alibi_slopes, ) - if attn_metadata.num_decode_tokens > 0: + else: # Decoding run. output = HabanaPagedAttention.forward_decode( query, diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 9cdf951fed6ee..e53350ecfc1fc 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -14,7 +14,7 @@ import operator import torch import habana_frameworks.torch as htorch - +import contextlib from vllm.attention import (AttentionMetadata, get_attn_backend) from vllm.config import (DeviceConfig, LoadConfig, CacheConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) @@ -522,7 +522,7 @@ def _prepare_prompt( num_prefills=real_num_seqs, num_prefill_tokens=sum_query_len, num_decode_tokens=0, - slot_mapping=slot_mapping + slot_mapping=slot_mapping, ) return PreparePromptMetadata( input_tokens=input_tokens, @@ -625,7 +625,7 @@ def _prepare_decode( num_prefills=0, num_prefill_tokens=0, num_decode_tokens=num_decode_tokens, - slot_mapping=slot_mapping + slot_mapping=slot_mapping, ) return PrepareDecodeMetadata( input_tokens=input_tokens, @@ -808,9 +808,8 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: ['block_tables', 'seq_lens_tensor', 'attn_bias', - 'num_prefills', - 'num_decode_tokens', - 'slot_mapping']) + 'slot_mapping', + 'is_prompt']) return prefill_metadata @torch.inference_mode() From 2394c41b9d03b80fe43534aeca2b66408ea78e02 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 18:57:41 +0300 Subject: [PATCH 066/341] add trim_attn_metadata comment --- vllm/worker/habana_model_runner.py | 36 +++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index e53350ecfc1fc..49f66ae1e0863 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -803,14 +803,34 @@ def _seq_len(self, attn_metadata): return attn_metadata.block_tables.size(1) * self.block_size def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: - prefill_metadata = subtuple(metadata, - 'TrimmedAttentionMetadata', - ['block_tables', - 'seq_lens_tensor', - 'attn_bias', - 'slot_mapping', - 'is_prompt']) - return prefill_metadata + # NOTE(kzawora): To anyone working on this in the future: + # Trimming metadata is required when using HPUGraphs. + # Attention metadata is going to be hashed by PT bridge, and + # appropriate HPUGraphs will be matched based on all inputs' hash. + + # Before you put more keys in here, make sure you know their + # value type and make sure you know how it's going to be hashed. + # You can find that information in input_hash function + # in habana_frameworks/torch/hpu/graphs.py. You can also hash + # it manually with torch.hpu.graphs.input_hash(attention_metadata) + + # If you use primitive types here - they will get hashed based + # on their value. You *will* get lots of excessive graph captures + # (and an OOM eventually) if you decide to put something like + # seq_len int here. + # If you absolutely need a scalar, put it in a tensor. 
Tensors + # get hashed using their metadata, not their values: + # input_hash(torch.tensor(123)) == input_hash(torch.tensor(321)) + # input_hash(123) != input_hash(321) + # input_hash("abc") != input_hash("cba") + attention_metadata = subtuple(metadata, + 'TrimmedAttentionMetadata', + ['block_tables', + 'seq_lens_tensor', + 'attn_bias', + 'slot_mapping', + 'is_prompt']) + return attention_metadata @torch.inference_mode() def execute_model( From 98fb698c6c6140cca5f56283b3313460af270914 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 19:26:34 +0300 Subject: [PATCH 067/341] fix prompt bucketing: --- vllm/worker/habana_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 49f66ae1e0863..21d34a3924c86 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -854,7 +854,7 @@ def execute_model( (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping, multi_modal_input ) = self.prepare_input_tensors(seq_group_metadata_list) - is_prompt = attn_metadata.prefill_metadata is not None + is_prompt = attn_metadata.is_prompt if self.lora_config: self.set_active_loras(lora_requests, lora_mapping) From d99d9862a960668bbcd3d8af3554587db82d42ef Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 16:21:24 +0300 Subject: [PATCH 068/341] guard model loader wa for hpu --- vllm/model_executor/model_loader/loader.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 48060481b2ee2..cade78114be42 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -35,7 +35,7 @@ from vllm.model_executor.models.interfaces import (supports_lora, supports_vision) from vllm.model_executor.utils import set_weight_attrs -from vllm.utils import get_device_capability_stateless, is_tpu +from vllm.utils import get_device_capability_stateless, is_tpu, is_hpu logger = init_logger(__name__) @@ -262,7 +262,8 @@ def load_model(self, *, model_config: ModelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module: with set_default_torch_dtype(model_config.dtype): - with torch.device('cpu'): # FIXME(kzawora): this is a nasty workaround!!! + load_device = torch.device(device_config.device) if not is_hpu() else 'cpu' # FIXME(kzawora): this is a nasty workaround!!! + with torch.device(load_device): model = _initialize_model(model_config, self.load_config, lora_config, vision_language_config, cache_config) @@ -282,7 +283,8 @@ def load_model(self, *, model_config: ModelConfig, # to use quant_method. if hasattr(module, "process_weights_after_loading"): module.process_weights_after_loading() - model = model.to('hpu') # FIXME(kzawora): this is a nasty workaround!!! + if is_hpu(): + model = model.to('hpu') # FIXME(kzawora): this is a nasty workaround!!! 
return model.eval() From 235a7bf749dce23295288fc4a28cf12d5ba6a6a4 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 18:10:04 +0300 Subject: [PATCH 069/341] Make mypy happy --- pyproject.toml | 73 ++++ setup.py | 23 +- vllm/attention/backends/abstract.py | 2 +- vllm/attention/backends/habana_attn.py | 56 ++- vllm/attention/ops/habana_paged_attn.py | 28 +- vllm/attention/selector.py | 2 +- vllm/distributed/communication_op.py | 3 +- vllm/executor/habana_executor.py | 66 +++- vllm/executor/ray_habana_executor.py | 4 - vllm/executor/ray_utils.py | 15 +- vllm/hpu/cache_ops.py | 26 +- vllm/hpu/ops.py | 66 ++-- vllm/hpu/rotary_embed.py | 79 ++-- vllm/hpu/utils.py | 5 +- vllm/model_executor/layers/activation.py | 2 +- vllm/model_executor/layers/layernorm.py | 7 +- .../model_executor/layers/logits_processor.py | 4 +- .../model_executor/layers/rotary_embedding.py | 9 +- vllm/model_executor/model_loader/loader.py | 6 +- vllm/model_executor/models/mixtral.py | 15 +- vllm/model_executor/sampling_metadata.py | 6 +- vllm/utils.py | 67 ++-- vllm/worker/cache_engine.py | 8 +- vllm/worker/habana_model_runner.py | 338 ++++++++++++------ vllm/worker/habana_worker.py | 48 ++- vllm/worker/profiler.py | 6 +- 26 files changed, 633 insertions(+), 331 deletions(-) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000..790e013620286 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,73 @@ +[build-system] +# Should be mirrored in requirements-build.txt +requires = [ + "cmake>=3.21", + "ninja", + "packaging", + "setuptools >= 49.4.0", + "torch == 2.3.0", + "wheel", +] +build-backend = "setuptools.build_meta" + +[tool.ruff] +# Allow lines to be as long as 80. +line-length = 80 +exclude = [ + # External file, leaving license intact + "examples/fp8/quantizer/quantize.py" +] + +[tool.ruff.lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # pyupgrade + # "UP", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + # "I", + "G", +] +ignore = [ + # star imports + "F405", "F403", + # lambda expression assignment + "E731", + # Loop control variable not used within loop body + "B007", +] + +[tool.mypy] +python_version = "3.8" + +ignore_missing_imports = true +check_untyped_defs = true +follow_imports = "skip" + +files = "vllm" +# TODO(woosuk): Include the code from Megatron and HuggingFace. +exclude = [ + "vllm/model_executor/parallel_utils/|vllm/model_executor/models/", + # Ignore triton kernels in ops. + 'vllm/attention/ops/.*\.py$' +] + +[tool.codespell] +ignore-words-list = "dout, te, indicies, subtile" +skip = "./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build" + +[tool.isort] +use_parentheses = true +skip_gitignore = true + +[tool.pytest.ini_options] +markers = [ + "skip_global_cleanup", + "vlm: run tests for vision language models only", +] diff --git a/setup.py b/setup.py index e9a72c24b2391..ddf1cdf034c1b 100644 --- a/setup.py +++ b/setup.py @@ -207,11 +207,12 @@ def build_extensions(self) -> None: def _is_hpu() -> bool: is_hpu_available = True - return is_hpu_available # FIXME(kzawora): HPU autodetection sporadically fails on certain clients. Find the cause and fix it. + return is_hpu_available # FIXME(kzawora): HPU autodetection sporadically fails on certain clients. Find the cause and fix it. 
try: subprocess.run(["hl-smi"], capture_output=True, check=True) except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): - if not os.path.exists('/dev/accel/accel0') and not os.path.exists('/dev/accel/accel_controlD0'): + if not os.path.exists('/dev/accel/accel0') and not os.path.exists( + '/dev/accel/accel_controlD0'): is_hpu_available = False return is_hpu_available @@ -331,17 +332,23 @@ def find_version(filepath: str) -> str: return version_match.group(1) raise RuntimeError("Unable to find version string.") + def get_gaudi_sw_version(): """ Returns the driver version. """ # Enable console printing for `hl-smi` check - output = subprocess.run( - "hl-smi", shell=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env={"ENABLE_CONSOLE": "true"} - ) + output = subprocess.run("hl-smi", + shell=True, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env={"ENABLE_CONSOLE": "true"}) if output.returncode == 0 and output.stdout: - return output.stdout.split("\n")[2].replace(" ", "").split(":")[1][:-1].split("-")[0] - return "0.0.0" # when hl-smi is not available + return output.stdout.split("\n")[2].replace( + " ", "").split(":")[1][:-1].split("-")[0] + return "0.0.0" # when hl-smi is not available + def get_vllm_version() -> str: version = find_version(get_path("vllm", "version.py")) @@ -365,7 +372,7 @@ def get_vllm_version() -> str: version += f"+neuron{neuron_version_str}" elif _is_hpu(): # Get the Intel Gaudi Software Suite version - gaudi_sw_version = str(get_gaudi_sw_version()) + gaudi_sw_version = str(get_gaudi_sw_version()) if gaudi_sw_version != MAIN_CUDA_VERSION: gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3] version += f"+gaudi{gaudi_sw_version}" diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 1f912d5432537..55d9a43b35652 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -116,7 +116,7 @@ def __init__( sliding_window: Optional[int] = None, kv_cache_dtype: str = "auto", blocksparse_params: Optional[Dict[str, Any]] = None, - max_seq_len : Optional[int] = 4096, + max_seq_len: Optional[int] = 4096, ) -> None: raise NotImplementedError diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 3c8aebad976b7..09e717f61ac74 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -43,7 +43,8 @@ def swap_blocks( dst_kv_cache: torch.Tensor, src_to_dst: Dict[int, int], ) -> None: - HabanaPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) + HabanaPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, + src_to_dst) @staticmethod def copy_blocks( @@ -104,7 +105,7 @@ def __post_init__(self): # when alibi slopes is used. It is because of the limitation # from xformer API. 
# will not appear in the __repr__ and __init__ - self.attn_bias: Optional[List[torch.Tensor]] = None + self.attn_bias: Optional[torch.Tensor] = None class HabanaAttentionImpl(AttentionImpl): @@ -134,7 +135,7 @@ def __init__( sliding_window: Optional[int], kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, - max_seq_len : Optional[int] = 4096, + max_seq_len: int = 4096, ) -> None: self.kv_cache_dtype = kv_cache_dtype self.num_heads = num_heads @@ -144,12 +145,13 @@ def __init__( self.sliding_window = sliding_window self.position_bias = None if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.bfloat16) - self.position_bias = _make_alibi_bias(alibi_slopes, + alibi_slopes_tensor = torch.tensor(alibi_slopes, + dtype=torch.bfloat16) + self.position_bias = _make_alibi_bias(alibi_slopes_tensor, num_kv_heads, - alibi_slopes.dtype, + alibi_slopes_tensor.dtype, max_seq_len) - self.alibi_slopes = alibi_slopes + self.alibi_slopes = alibi_slopes_tensor assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads @@ -164,9 +166,9 @@ def forward( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - kv_cache: Optional[torch.Tensor], + kv_cache: torch.Tensor, attn_metadata: HabanaAttentionMetadata, - kv_scale: float, + kv_scale: float = 1.0, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. @@ -192,11 +194,9 @@ def forward( # Reshape the input keys and values and store them in the cache. # If kv_cache is not provided, the new key and value tensors are # not cached. This happens during the initial memory profiling run. - HabanaPagedAttention.write_to_paged_cache(key, value, key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - attn_metadata.is_prompt) + HabanaPagedAttention.write_to_paged_cache( + key, value, key_cache, value_cache, attn_metadata.slot_mapping, + self.kv_cache_dtype, attn_metadata.is_prompt) if attn_metadata.is_prompt: # Prompt run. @@ -204,11 +204,15 @@ def forward( # TODO: move this outside of model assert attn_metadata.attn_bias is not None, 'attn_bias must be set before calling model.forward!' attn_bias = attn_metadata.attn_bias - if self.alibi_slopes is not None: - attn_bias.add_(self.position_bias[:, :, -attn_bias.size(2):, -attn_bias.size(3):]) - - query_shape = (batch_size, seq_len, self.num_heads, self.head_size) - kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, self.head_size) + if self.alibi_slopes is not None and self.position_bias is not None: + attn_bias.add_(self.position_bias[:, :, + -attn_bias.size(2):, + -attn_bias.size(3):]) + + query_shape = (batch_size, seq_len, self.num_heads, + self.head_size) + kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, + self.head_size) out = ops.prompt_attention( query.view(query_shape), key.view(kv_shape), @@ -236,17 +240,9 @@ def forward( else: # Decoding run. output = HabanaPagedAttention.forward_decode( - query, - key_cache, - value_cache, - attn_metadata.block_tables, - attn_metadata.seq_lens_tensor, - self.kv_cache_dtype, - self.num_kv_heads, - self.scale, - self.position_bias, - kv_scale - ) + query, key_cache, value_cache, attn_metadata.block_tables, + attn_metadata.seq_lens_tensor, self.kv_cache_dtype, + self.num_kv_heads, self.scale, self.position_bias, kv_scale) # Reshape the output tensor. 
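In the prompt branch of HabanaAttentionImpl.forward above, the ALiBi bias precomputed for max_seq_len is cropped to the current prompt length before being added to attn_bias. A small sketch of that slicing with purely illustrative shapes (the real tensor comes from _make_alibi_bias):

import torch

num_heads, max_seq_len, seq_len = 4, 8, 5           # illustrative sizes only
position_bias = torch.randn(1, num_heads, max_seq_len, max_seq_len)
attn_bias = torch.zeros(1, num_heads, seq_len, seq_len)

# Same slicing as above: keep the bottom-right seq_len x seq_len corner of the
# precomputed bias so it lines up with the shorter prompt.
attn_bias.add_(position_bias[:, :, -attn_bias.size(2):, -attn_bias.size(3):])
print(attn_bias.shape)  # torch.Size([1, 4, 5, 5])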
return output.view(batch_size, seq_len, hidden_size) diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py index c8ed500f7af1c..ed47b906168e5 100644 --- a/vllm/attention/ops/habana_paged_attn.py +++ b/vllm/attention/ops/habana_paged_attn.py @@ -54,24 +54,13 @@ def split_kv_cache( return key_cache, value_cache @staticmethod - def write_to_paged_cache( - key: torch.Tensor, - value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - slot_mapping: torch.Tensor, - kv_cache_dtype: str, - is_prompt: bool - ) -> None: - cache_ops.reshape_and_cache( - key, - value, - key_cache, - value_cache, - slot_mapping, - kv_cache_dtype, - is_prompt - ) + def write_to_paged_cache(key: torch.Tensor, value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, kv_cache_dtype: str, + is_prompt: bool) -> None: + cache_ops.reshape_and_cache(key, value, key_cache, value_cache, + slot_mapping, kv_cache_dtype, is_prompt) @staticmethod def forward_decode( @@ -115,7 +104,8 @@ def forward_prefix( alibi_slopes: Optional[torch.Tensor], sliding_window: Optional[int], ) -> torch.Tensor: - raise NotImplementedError("forward_prefix is not implemented for HabanaPagedAttention") + raise NotImplementedError( + "forward_prefix is not implemented for HabanaPagedAttention") @staticmethod def swap_blocks( diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index fcc96fae0fb58..6a2ab5c59cf18 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -87,7 +87,7 @@ def get_attn_backend( logger.info("Using HabanaAttention backend.") from vllm.attention.backends.habana_attn import ( # noqa: F401 HabanaAttentionBackend) - return HabanaAttentionBackend + return HabanaAttentionBackend elif backend == _Backend.PALLAS: logger.info("Using Pallas backend.") from vllm.attention.backends.pallas import PallasAttentionBackend diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index 233be75b47f5a..2bb082385c0f3 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -9,11 +9,12 @@ if is_hpu(): import habana_frameworks.torch as htorch + def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: """All-reduce the input tensor across model parallel group.""" if is_hpu(): # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge - # occuring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used + # occurring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used # (which is required for tensor parallel HPUGraph inference) htorch.core.mark_step() return get_tp_group().all_reduce(input_) diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index cbb30e39e11a4..a040e187eb0da 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -17,6 +17,7 @@ class HabanaExecutor(ExecutorBase): + def _init_executor(self) -> None: """Initialize the worker and load the model.""" self._init_worker() @@ -56,6 +57,7 @@ def _create_worker(self, wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank, distributed_init_method)) return wrapper.worker + def _init_worker(self): assert self.parallel_config.world_size == 1, ( "GPUExecutor only supports single GPU.") @@ -63,13 +65,14 @@ def _init_worker(self): self.driver_worker = self._create_worker() self.driver_worker.init_device() self.driver_worker.load_model() + def determine_num_available_blocks(self) -> 
Tuple[int, int]: """Determine the number of available KV blocks by invoking the underlying worker. """ return self.driver_worker.determine_num_available_blocks() - def initialize_cache(self, num_gpu_blocks : int, num_cpu_blocks) -> None: + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: """Initialize the KV cache by invoking the underlying worker. """ # NOTE: This is logged in the executor because there can be >1 worker @@ -80,7 +83,8 @@ def initialize_cache(self, num_gpu_blocks : int, num_cpu_blocks) -> None: with HabanaMemoryProfiler() as cache_init_m: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - logger.info(f"init_cache_engine took {cache_init_m.get_summary_string()}") + logger.info( + f"init_cache_engine took {cache_init_m.get_summary_string()}") def execute_model( self, @@ -89,41 +93,65 @@ def execute_model( # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL - will log cpu fallbacks per engine step, always, even if there were none - log_graph_compilation_all = os.environ.get('VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0' - log_graph_compilation = os.environ.get('VLLM_HPU_LOG_STEP_GRAPH_COMPILATION', '0') != '0' or log_graph_compilation_all - log_cpu_fallbacks_all = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0' - log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS', '0') != '0' or log_cpu_fallbacks_all + log_graph_compilation_all = os.environ.get( + 'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0' + log_graph_compilation = os.environ.get( + 'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION', + '0') != '0' or log_graph_compilation_all + log_cpu_fallbacks_all = os.environ.get( + 'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0' + log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS', + '0') != '0' or log_cpu_fallbacks_all if log_graph_compilation or log_cpu_fallbacks: from habana_frameworks.torch.hpu.metrics import metric_localcontext seq_group_metadata_list = execute_model_req.seq_group_metadata_list - is_prompt = any([seq_group_metadata.is_prompt for seq_group_metadata in seq_group_metadata_list]) - max_context_len = max([max([len(v.prompt_token_ids) + len(v.output_token_ids) for v in seq_group_metadata.seq_data.values()]) for seq_group_metadata in seq_group_metadata_list]) # whoa, that's some spicy stuff right here - max_num_blocks = ((max_context_len - 1) // self.cache_config.block_size) + 1 + is_prompt = any([ + seq_group_metadata.is_prompt + for seq_group_metadata in seq_group_metadata_list + ]) + max_context_len = max([ + max([ + len(v.prompt_token_ids) + len(v.output_token_ids) + for v in seq_group_metadata.seq_data.values() + ]) for seq_group_metadata in seq_group_metadata_list + ]) # whoa, that's some spicy stuff right here + max_num_blocks = ( + (max_context_len - 1) // self.cache_config.block_size) + 1 input_stats = f'is_prompt: {is_prompt}, num_seqs: {len(seq_group_metadata_list)} max_context_len: {max_context_len}, max_num_blocks {max_num_blocks}' - gc_ctx = metric_localcontext("graph_compilation") if log_graph_compilation else contextlib.nullcontext() - cpu_fallback_ctx = metric_localcontext("cpu_fallback") if log_cpu_fallbacks else contextlib.nullcontext() + gc_ctx = metric_localcontext( + "graph_compilation" + ) if log_graph_compilation else contextlib.nullcontext() 
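All of the VLLM_HPU_LOG_STEP_* switches above follow the same pattern, os.environ.get(NAME, '0') != '0', so any value other than '0' enables the log. The same check factored into a helper, purely for illustration (env_flag is ours, not a vLLM function):

import os

def env_flag(name: str) -> bool:
    # Any value other than '0' counts as enabled, matching the checks in
    # HabanaExecutor.execute_model.
    return os.environ.get(name, '0') != '0'

log_graph_compilation_all = env_flag('VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL')
log_graph_compilation = (env_flag('VLLM_HPU_LOG_STEP_GRAPH_COMPILATION')
                         or log_graph_compilation_all)
print(log_graph_compilation)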
+ cpu_fallback_ctx = metric_localcontext( + "cpu_fallback" + ) if log_cpu_fallbacks else contextlib.nullcontext() with gc_ctx as gc_local_metric, cpu_fallback_ctx as cpu_fallback_local_metric: output = self.driver_worker.execute_model(execute_model_req) - if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0) or log_graph_compilation_all: - logger.warning(f"VLLM_HPU_STEP_GRAPH_COMPILATION: {gc_local_metric.stats()}, {input_stats}") - if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] > 0) or log_cpu_fallbacks_all: - logger.warning(f"VLLM_HPU_STEP_CPU_FALLBACK: {cpu_fallback_local_metric.stats()}, {input_stats}") - + if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0 + ) or log_graph_compilation_all: + logger.warning( + f"VLLM_HPU_STEP_GRAPH_COMPILATION: {gc_local_metric.stats()}, {input_stats}" + ) + if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] > + 0) or log_cpu_fallbacks_all: + logger.warning( + f"VLLM_HPU_STEP_CPU_FALLBACK: {cpu_fallback_local_metric.stats()}, {input_stats}" + ) + return output output = self.driver_worker.execute_model(execute_model_req) return output - + def add_lora(self, lora_request: LoRARequest) -> bool: raise NotImplementedError("LoRA is not implemented for HPU backend.") def remove_lora(self, lora_id: int) -> bool: raise NotImplementedError("LoRA is not implemented for HPU backend.") - def list_loras(self) -> List[int]: + def list_loras(self) -> Set[int]: raise NotImplementedError("LoRA is not implemented for HPU backend.") - def pin_lora(self) -> List[int]: + def pin_lora(self, lora_id: int) -> bool: raise NotImplementedError("LoRA is not implemented for HPU backend.") def check_health(self) -> None: diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index b9c800e85728b..b57536436bd49 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -283,10 +283,6 @@ def _check_if_any_actor_is_dead(self): raise RuntimeError("At least one Worker is dead. " f"Dead Workers: {dead_actors}. ") - def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: - """Wait for futures returned from _run_workers() with - async_run_remote_workers_only to complete.""" - ray.get(parallel_worker_tasks) class RayHabanaExecutorAsync(RayHabanaExecutor, DistributedGPUExecutorAsync): diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 4d048ae634457..176b95b720615 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -76,10 +76,12 @@ def initialize_ray_cluster( ignore_reinit_error=True, num_gpus=parallel_config.world_size) else: - ray.init(address=ray_address, ignore_reinit_error=True, - log_to_driver=not os.environ.get('VLLM_RAY_DISABLE_LOG_TO_DRIVER', '0') != '0') + ray.init(address=ray_address, + ignore_reinit_error=True, + log_to_driver=not os.environ.get( + 'VLLM_RAY_DISABLE_LOG_TO_DRIVER', '0') != '0') ray_accel_name = "HPU" if is_hpu() else "GPU" - + if parallel_config.placement_group: # Placement group is already set. return @@ -95,7 +97,8 @@ def initialize_ray_cluster( bundle_gpus = bundle.get(ray_accel_name, 0) if bundle_gpus > 1: raise ValueError( - f"Placement group bundle cannot have more than 1 {ray_accel_name}.") + f"Placement group bundle cannot have more than 1 {ray_accel_name}." 
+ ) if bundle_gpus: gpu_bundles += 1 if parallel_config.world_size > gpu_bundles: @@ -109,7 +112,9 @@ def initialize_ray_cluster( f"The number of required {ray_accel_name}s exceeds the total number of " f"available {ray_accel_name}s in the cluster.") # Create a new placement group - placement_group_specs = ([{ray_accel_name: 1}] * parallel_config.world_size) + placement_group_specs = ([{ + ray_accel_name: 1 + }] * parallel_config.world_size) current_placement_group = ray.util.placement_group( placement_group_specs) # Wait until PG is ready - this will block until all diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index 56aafd2a4d0a9..6457ad3c460f3 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -10,7 +10,13 @@ import habana_frameworks.torch as htorch -def reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, dtype, is_prompt=False): +def reshape_and_cache(key, + value, + key_cache, + value_cache, + slot_mapping, + dtype, + is_prompt=False): block_size = key_cache.size(1) slot_mapping = slot_mapping.flatten() indices = torch.div(slot_mapping, block_size, rounding_mode="floor") @@ -20,8 +26,8 @@ def reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, dtype, i def swap_blocks(src, dst, block_mapping): - index_src = torch.zeros((1,), dtype=torch.int32, device=src.device) - index_dst = torch.zeros((1,), dtype=torch.int32, device=dst.device) + index_src = torch.zeros((1, ), dtype=torch.int32, device=src.device) + index_dst = torch.zeros((1, ), dtype=torch.int32, device=dst.device) for src_idx, dst_idx in block_mapping.items(): index_src[0] = src_idx index_dst[0] = dst_idx @@ -32,15 +38,21 @@ def swap_blocks(src, dst, block_mapping): def copy_blocks(key_caches, value_caches, block_mapping): - index_src = torch.zeros((1,), dtype=torch.int32, device=key_caches[0].device) - index_dst = torch.zeros((1,), dtype=torch.int32, device=key_caches[0].device) + index_src = torch.zeros((1, ), + dtype=torch.int32, + device=key_caches[0].device) + index_dst = torch.zeros((1, ), + dtype=torch.int32, + device=key_caches[0].device) for src, dsts in block_mapping.items(): index_src[0] = src for dst in dsts: index_dst[0] = dst for key_cache in key_caches: - key_cache.index_copy_(0, index_dst, key_cache.index_select(0, index_src)) + key_cache.index_copy_(0, index_dst, + key_cache.index_select(0, index_src)) for value_cache in value_caches: - value_cache.index_copy_(0, index_dst, value_cache.index_select(0, index_src)) + value_cache.index_copy_(0, index_dst, + value_cache.index_select(0, index_src)) if key_caches[0].device.type == 'hpu': htorch.core.mark_step() diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index d38b3731350be..798bee09fda4f 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -23,22 +23,34 @@ def silu_and_mul(output, input): output.copy_(silu(x) * y) - def fetch_from_cache(cache, blocks, permutations): - return [cache.index_select(0, blocks[:, i]).permute(permutations) for i in range(blocks.size(1))] + return [ + cache.index_select(0, blocks[:, i]).permute(permutations) + for i in range(blocks.size(1)) + ] @hpu_utils.with_mark_steps -def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block_tables, context_lens, block_size, alibi_slopes=None, kv_cache_dtype=None) -> None: +def paged_attention_v1(query, + key_cache, + value_cache, + head_mapping, + scale, + block_tables, + context_lens, + block_size, + alibi_slopes=None, + kv_cache_dtype=None) -> None: seq_len = block_tables.size(1) batch_size, query_heads, _ = 
query.shape _, _, kv_heads, _ = key_cache.shape min_inf = torch.finfo(query.dtype).min - mask = (torch.arange(0, seq_len * block_size, dtype=torch.int32, device=key_cache.device) - .view(1, -1) - .expand(batch_size, -1) - .ge(context_lens.view(-1, 1)) - .view(batch_size, 1, 1, -1)) + mask = (torch.arange(0, + seq_len * block_size, + dtype=torch.int32, + device=key_cache.device).view(1, -1).expand( + batch_size, -1).ge(context_lens.view(-1, 1)).view( + batch_size, 1, 1, -1)) query.mul_(scale) query = query.unsqueeze(-2) keys = fetch_from_cache(key_cache, block_tables, (0, 2, 3, 1)) @@ -50,10 +62,9 @@ def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block attn_weights = [torch.matmul(query, k) for k in keys] attn_weights = torch.cat(attn_weights, dim=-1) if alibi_slopes is not None: - attn_weights.add_(alibi_slopes[:,:,-attn_weights.size(2):, -attn_weights.size(3):]) - attn_weights = (attn_weights - .masked_fill(mask, min_inf) - .softmax(dim=-1)) + attn_weights.add_(alibi_slopes[:, :, -attn_weights.size(2):, + -attn_weights.size(3):]) + attn_weights = (attn_weights.masked_fill(mask, min_inf).softmax(dim=-1)) values = fetch_from_cache(value_cache, block_tables, (0, 2, 1, 3)) if PA_SPLIT_VALUE: @@ -82,15 +93,17 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): B, D = hidden_states.shape num_experts = w1.shape[0] routing_weights = F.softmax(score, dim=1, dtype=torch.float32) - routing_weights, selected_experts = torch.topk(routing_weights, topk, dim=-1) + routing_weights, selected_experts = torch.topk(routing_weights, + topk, + dim=-1) routing_weights /= routing_weights.sum(dim=-1, keepdim=True) routing_weights = routing_weights.to(hidden_states.dtype) - final_hidden_states = torch.zeros( - (1, B, D), dtype=hidden_states.dtype, device=hidden_states.device - ) - padded_weights = torch.zeros( - (B, num_experts), dtype=hidden_states.dtype, device=hidden_states.device - ) + final_hidden_states = torch.zeros((1, B, D), + dtype=hidden_states.dtype, + device=hidden_states.device) + padded_weights = torch.zeros((B, num_experts), + dtype=hidden_states.dtype, + device=hidden_states.device) padded_weights.scatter_(-1, selected_experts, routing_weights) padded_weights = padded_weights.reshape(-1, B, w1.shape[0]) padded_weights = padded_weights.permute(2, 0, 1).unsqueeze(-1) @@ -100,7 +113,8 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): for expert_idx in range(num_experts): padded_weight = padded_weights[expert_idx] current_state_static = hidden_states.reshape(-1, D) - w_output = silu_and_mul_wrapper(torch.matmul(current_state_static, w1[expert_idx].transpose(0, 1))) + w_output = silu_and_mul_wrapper( + torch.matmul(current_state_static, w1[expert_idx].transpose(0, 1))) w_output = torch.matmul(w_output, w2[expert_idx].transpose(0, 1)) current_hidden_states_static = w_output * padded_weight final_hidden_states += current_hidden_states_static @@ -111,12 +125,12 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): @hpu_utils.with_mark_steps def prompt_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attn_bias: Optional[torch.Tensor] = None, - p: float = 0.0, - scale: Optional[float] = None, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_bias: Optional[torch.Tensor] = None, + p: float = 0.0, + scale: Optional[float] = None, ) -> torch.Tensor: query = query.transpose(1, 2) key = key.transpose(1, 2) diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py index 30f96153cd4a2..16c956acdf817 
100644 --- a/vllm/hpu/rotary_embed.py +++ b/vllm/hpu/rotary_embed.py @@ -9,18 +9,23 @@ import torch.nn as nn import habana_frameworks.torch.utils.experimental as htexp + def get_device_type(): return htexp._get_device_type() + def is_gaudi1(): return get_device_type() == htexp.synDeviceType.synDeviceGaudi + def is_gaudi2(): return get_device_type() == htexp.synDeviceType.synDeviceGaudi2 + def is_gaudi3(): return get_device_type() == htexp.synDeviceType.synDeviceGaudi3 + # TODO: remove this workaround when FusedRoPE properly works on Gaudi if not is_gaudi1() and (is_gaudi2() or is_gaudi3()): try: @@ -34,10 +39,11 @@ def is_gaudi3(): def rotate_half(x): """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] + x1 = x[..., :x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] return torch.cat((-x2, x1), dim=-1) + def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. @@ -59,40 +65,55 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): Returns: `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. """ - cos = cos[position_ids]#.unsqueeze(unsqueeze_dim) - sin = sin[position_ids]#.unsqueeze(unsqueeze_dim) + cos = cos[position_ids] #.unsqueeze(unsqueeze_dim) + sin = sin[position_ids] #.unsqueeze(unsqueeze_dim) q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed class HpuRotaryEmbedding(nn.Module): - def __init__(self, head_size, rotary_dim, max_position_embeddings=2048, base=10000, is_neox_style=None, device='hpu'): + + def __init__(self, + head_size, + rotary_dim, + max_position_embeddings=2048, + base=10000, + is_neox_style=None, + device='hpu'): super().__init__() self.head_size = head_size self.dim = rotary_dim self.max_position_embeddings = max_position_embeddings self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + inv_freq = 1.0 / (self.base**( + torch.arange(0, self.dim, 2).float().to(device) / self.dim)) self.register_buffer("inv_freq", inv_freq, persistent=False) # Build here to make `torch.jit.trace` work. 
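The mask built at the top of paged_attention_v1 in vllm/hpu/ops.py above marks every key position at or beyond a sequence's context length; those positions are later filled with -inf before the softmax. A toy-sized sketch of the same construction:

import torch

batch_size, num_blocks, block_size = 2, 3, 2         # toy sizes
context_lens = torch.tensor([3, 5], dtype=torch.int32)

mask = (torch.arange(0, num_blocks * block_size, dtype=torch.int32)
        .view(1, -1)
        .expand(batch_size, -1)
        .ge(context_lens.view(-1, 1))
        .view(batch_size, 1, 1, -1))
print(mask.int().squeeze())
# tensor([[0, 0, 0, 1, 1, 1],
#         [0, 0, 0, 0, 0, 1]], dtype=torch.int32)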
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) + self._set_cos_sin_cache(seq_len=max_position_embeddings, + device=self.inv_freq.device, + dtype=torch.get_default_dtype()) def _set_cos_sin_cache(self, seq_len, device, dtype): self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + t = torch.arange(self.max_seq_len_cached, + device=device, + dtype=self.inv_freq.dtype) freqs = torch.einsum("i,j->ij", t, self.inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor): + self.register_buffer("cos_cached", + emb.cos().to(dtype), + persistent=False) + self.register_buffer("sin_cached", + emb.sin().to(dtype), + persistent=False) + + def forward(self, positions: torch.Tensor, query: torch.Tensor, + key: torch.Tensor): if query.dim() == 2: query = query.unsqueeze(0) if key.dim() == 2: @@ -101,19 +122,31 @@ def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tenso positions = positions.unsqueeze(0) seq_len = key.shape[-2] if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=query.device, dtype=query.dtype) - - cos, sin = self.cos_cached[:seq_len].to(dtype=query.dtype), self.sin_cached[:seq_len].to(dtype=query.dtype) - query = query.reshape((query.shape[0], query.shape[1], query.shape[2] // self.head_size, self.head_size)) - key = key.reshape((key.shape[0], key.shape[1], key.shape[2] // self.head_size, self.head_size)) + self._set_cos_sin_cache(seq_len=seq_len, + device=query.device, + dtype=query.dtype) + + cos, sin = self.cos_cached[:seq_len].to( + dtype=query.dtype), self.sin_cached[:seq_len].to(dtype=query.dtype) + query = query.reshape( + (query.shape[0], query.shape[1], query.shape[2] // self.head_size, + self.head_size)) + key = key.reshape((key.shape[0], key.shape[1], + key.shape[2] // self.head_size, self.head_size)) if query.device.type == "hpu" and FusedRoPE: if len(positions[0]) == 1: - cos = self.cos_cached[positions].unsqueeze(2).to(dtype=query.dtype) - sin = self.sin_cached[positions].unsqueeze(2).to(dtype=query.dtype) + cos = self.cos_cached[positions].unsqueeze(2).to( + dtype=query.dtype) + sin = self.sin_cached[positions].unsqueeze(2).to( + dtype=query.dtype) else: cos = cos[positions].unsqueeze(2) sin = sin[positions].unsqueeze(2) - query, key = FusedRoPE.apply(query, cos, sin, 0), FusedRoPE.apply(key, cos, sin, 0) + query, key = FusedRoPE.apply(query, cos, sin, + 0), FusedRoPE.apply(key, cos, sin, 0) else: query, key = apply_rotary_pos_emb(query, key, cos, sin, positions) - return query.reshape((query.shape[0], query.shape[1], query.shape[2] * query.shape[3])), key.reshape((key.shape[0], key.shape[1], key.shape[2] * key.shape[3])) + return query.reshape( + (query.shape[0], query.shape[1], + query.shape[2] * query.shape[3])), key.reshape( + (key.shape[0], key.shape[1], key.shape[2] * key.shape[3])) diff --git a/vllm/hpu/utils.py b/vllm/hpu/utils.py index 4ce9e2591c6b9..06f3690aded8b 100644 --- a/vllm/hpu/utils.py +++ b/vllm/hpu/utils.py @@ -7,7 +7,9 @@ import habana_frameworks.torch as htorch + def with_mark_steps(fn): + def wrapped(*args, **kwargs): 
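rotate_half plus the cos/sin blend in apply_rotary_pos_emb (vllm/hpu/rotary_embed.py above) is the standard rotary-embedding step. A minimal numeric sketch with batching and heads stripped away:

import torch

def rotate_half(x):
    # As in vllm/hpu/rotary_embed.py: swap the two halves, negate the second.
    x1 = x[..., :x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2:]
    return torch.cat((-x2, x1), dim=-1)

dim = 4
positions = torch.tensor([0, 1, 2])                                 # toy positions
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
freqs = torch.einsum("i,j->ij", positions.float(), inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)                             # (3, dim)
cos, sin = emb.cos(), emb.sin()

q = torch.randn(3, dim)                              # three positions, one head
q_rot = (q * cos) + (rotate_half(q) * sin)           # same combine as apply_rotary_pos_emb
print(q_rot.shape)                                   # torch.Size([3, 4])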
htorch.core.mark_step() result = fn(*args, **kwargs) @@ -15,4 +17,5 @@ def wrapped(*args, **kwargs): del kwargs htorch.core.mark_step() return result - return wrapped \ No newline at end of file + + return wrapped diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 69f889ed1a1b8..b2641cf89bdc5 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -45,7 +45,7 @@ def forward_hpu(self, x: torch.Tensor) -> torch.Tensor: out = torch.empty(output_shape, dtype=x.dtype, device=x.device) ops.silu_and_mul(out, x) return out - + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: from vllm._ipex_ops import ipex_ops as ops diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index d0d1577b26a10..975019bc9c24d 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -13,6 +13,7 @@ print("Not using HPU fused kernel for RMSNorm") FusedRMSNorm = None + class RMSNorm(CustomOp): """Root mean square normalization. @@ -86,7 +87,8 @@ def forward_hpu( orig_shape = x.shape residual += x.view(residual.shape) # Note: FusedRMSNorm requires 3D tensors as inputs - x = FusedRMSNorm.apply(residual.float(), self.weight.float(), self.variance_epsilon) + x = FusedRMSNorm.apply(residual.float(), self.weight.float(), + self.variance_epsilon) return x.to(orig_dtype).view(orig_shape), residual ops.fused_add_rms_norm( x, @@ -97,7 +99,8 @@ def forward_hpu( return x, residual if x.device.type == "hpu" and FusedRMSNorm: orig_dtype = x.dtype - x = FusedRMSNorm.apply(x.float(), self.weight.float(), self.variance_epsilon) + x = FusedRMSNorm.apply(x.float(), self.weight.float(), + self.variance_epsilon) return x.to(orig_dtype) out = torch.empty_like(x) ops.rms_norm( diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 39d142b158445..321de3491921d 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -92,8 +92,8 @@ def _prune_hidden_states( sampling_metadata: SamplingMetadata, ) -> torch.Tensor: if sampling_metadata.selected_token_indices is not None: - return hidden_states.index_select(0, - sampling_metadata.selected_token_indices) + return hidden_states.index_select( + 0, sampling_metadata.selected_token_indices) else: return hidden_states diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index c8c0225245f7d..ceaa2ddd3d553 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -32,6 +32,7 @@ if is_hpu(): from vllm.hpu.rotary_embed import HpuRotaryEmbedding + def _rotate_neox(x: torch.Tensor) -> torch.Tensor: x1 = x[..., :x.shape[-1] // 2] x2 = x[..., x.shape[-1] // 2:] @@ -763,11 +764,11 @@ def get_rope( return _ROPE_DICT[key] if rope_scaling is None: if is_hpu(): - rotary_emb = HpuRotaryEmbedding(head_size, rotary_dim, max_position, base, - is_neox_style) + rotary_emb = HpuRotaryEmbedding(head_size, rotary_dim, + max_position, base, is_neox_style) else: - rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base, - is_neox_style, dtype) + rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, + base, is_neox_style, dtype) else: scaling_type = rope_scaling["type"] # The correct one should be "longrope" but keep "su" here diff --git a/vllm/model_executor/model_loader/loader.py 
b/vllm/model_executor/model_loader/loader.py index cade78114be42..ad146da72fb26 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -262,7 +262,8 @@ def load_model(self, *, model_config: ModelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module: with set_default_torch_dtype(model_config.dtype): - load_device = torch.device(device_config.device) if not is_hpu() else 'cpu' # FIXME(kzawora): this is a nasty workaround!!! + load_device = torch.device(device_config.device) if not is_hpu( + ) else 'cpu' # FIXME(kzawora): this is a nasty workaround!!! with torch.device(load_device): model = _initialize_model(model_config, self.load_config, lora_config, vision_language_config, @@ -284,7 +285,8 @@ def load_model(self, *, model_config: ModelConfig, if hasattr(module, "process_weights_after_loading"): module.process_weights_after_loading() if is_hpu(): - model = model.to('hpu') # FIXME(kzawora): this is a nasty workaround!!! + model = model.to( + 'hpu') # FIXME(kzawora): this is a nasty workaround!!! return model.eval() diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 33acc63c3fc0c..0344f1c7c7a03 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -279,10 +279,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: if is_hpu(): final_hidden_states = static_fused_moe(hidden_states, - self.w13_weight, - self.w2_weight, - router_logits, - self.top_k) + self.w13_weight, + self.w2_weight, + router_logits, self.top_k) else: final_hidden_states = fused_moe(hidden_states, self.w13_weight, @@ -301,8 +300,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: final_hidden_states = tensor_model_parallel_all_reduce( final_hidden_states) - return (final_hidden_states.view(batch_size, sequence_length, hidden_size) if is_hpu() - else final_hidden_states.view(num_tokens, hidden_size)) + return (final_hidden_states.view(batch_size, sequence_length, + hidden_size) if is_hpu() else + final_hidden_states.view(num_tokens, hidden_size)) class MixtralAttention(nn.Module): @@ -651,10 +651,11 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) - + if is_hpu(): torch.hpu.synchronize() + def all_close_1d(x: torch.Tensor) -> bool: assert len(x.shape) == 1 return all(torch.allclose(x[0], x[i]) for i in range(x.shape[0])) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 7ff826cf4e18f..ea82a3a4041b7 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -510,7 +510,8 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], dtype=torch.int, pin_memory=pin_memory, ) - idx_dtype = torch.long if not is_hpu() else torch.int # Gaudi doesn't have full native int64 support + idx_dtype = torch.long if not is_hpu( + ) else torch.int # Gaudi doesn't have full native int64 support sample_indices_t = torch.tensor( sample_indices, device="cpu", @@ -598,7 +599,8 @@ def _get_sequence_seeds( else: generator = random.Random(str((seed, ) + extra_entropy)) randint_fn = generator.randint - idx_dtype = torch.long if not is_hpu() else torch.int # Gaudi doesn't have full native int64 support + idx_dtype = torch.long if not is_hpu( + ) else torch.int # Gaudi doesn't have full native int64 support lo, hi = 
torch.iinfo(idx_dtype).min, torch.iinfo(idx_dtype).max # If the user/random sets seed = 0 but request should # have sampling, we need to change it to something diff --git a/vllm/utils.py b/vllm/utils.py index 2fb77a0fc431c..520332110fd1f 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -194,9 +194,12 @@ def is_neuron() -> bool: transformers_neuronx = None return transformers_neuronx is not None + @lru_cache(maxsize=None) def is_hpu() -> bool: - return importlib.util.find_spec('habana_frameworks') is not None + from importlib import util + return util.find_spec('habana_frameworks') is not None + @lru_cache(maxsize=None) def is_tpu() -> bool: @@ -506,18 +509,14 @@ def create_kv_caches_with_random( torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) scale = head_size**-0.5 - if is_hpu(): - key_cache_shape = (num_blocks, block_size, num_heads, head_size) - else: - x = 16 // torch.tensor([], dtype=torch_dtype).element_size() - key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) + x = 16 // torch.tensor([], dtype=torch_dtype).element_size() + key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) key_caches: List[torch.Tensor] = [] for _ in range(num_layers): key_cache = torch.empty(size=key_cache_shape, dtype=torch_dtype, device=device) - cache_dtype = str(cache_dtype) - if cache_dtype in ["auto", "half", "torch.float16", "torch.bfloat16", "torch.float32"]: + if cache_dtype in ["auto", "half", "bfloat16", "float"]: key_cache.uniform_(-scale, scale) elif cache_dtype == 'fp8': _generate_random_fp8(key_cache, -scale, scale) @@ -526,16 +525,13 @@ def create_kv_caches_with_random( f"Does not support key cache of type {cache_dtype}") key_caches.append(key_cache) - if is_hpu(): - value_cache_shape = (num_blocks, block_size, num_heads, head_size) - else: - value_cache_shape = (num_blocks, num_heads, head_size, block_size) + value_cache_shape = (num_blocks, num_heads, head_size, block_size) value_caches: List[torch.Tensor] = [] for _ in range(num_layers): value_cache = torch.empty(size=value_cache_shape, dtype=torch_dtype, device=device) - if cache_dtype in ["auto", "half", "torch.float16", "torch.bfloat16", "torch.float32"]: + if cache_dtype in ["auto", "half", "bfloat16", "float"]: value_cache.uniform_(-scale, scale) elif cache_dtype == 'fp8': _generate_random_fp8(value_cache, -scale, scale) @@ -607,55 +603,70 @@ class HabanaMemoryProfiler: def __init__(self, device=None): self.device = device + @staticmethod def current_device_memory_usage() -> float: # Return the device memory usage in bytes. free_hpu_memory, total_hpu_memory = torch.hpu.mem_get_info() return total_hpu_memory - free_hpu_memory - + + @staticmethod def current_free_device_memory() -> float: # Return the device memory usage in bytes. free_hpu_memory, _ = torch.hpu.mem_get_info() return free_hpu_memory - + + @staticmethod def total_device_memory() -> float: # Return the device memory usage in bytes. _, total_hpu_memory = torch.hpu.mem_get_info() return total_hpu_memory + @staticmethod def current_host_memory_usage() -> float: # Return the host memory usage in bytes. - return HabanaMemoryProfiler.total_host_memory() - HabanaMemoryProfiler.current_free_host_memory() - + return HabanaMemoryProfiler.total_host_memory( + ) - HabanaMemoryProfiler.current_free_host_memory() + + @staticmethod def current_free_host_memory() -> float: # Return the host memory usage in bytes. 
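As a reference for the RMSNorm.forward_hpu path earlier in this patch (vllm/model_executor/layers/layernorm.py): this is the unfused computation that FusedRMSNorm and the fallback op are expected to match, up to epsilon and fusion details. A plain-PyTorch sketch, not the kernel itself:

import torch

def rms_norm_reference(x: torch.Tensor, weight: torch.Tensor,
                       eps: float = 1e-6) -> torch.Tensor:
    # Normalize by the root-mean-square over the last dim, then scale by the
    # learned weight. Done in float32 and cast back, mirroring the .float()
    # casts around FusedRMSNorm.apply in forward_hpu.
    orig_dtype = x.dtype
    x = x.float()
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    x = x * torch.rsqrt(variance + eps)
    return (x * weight.float()).to(orig_dtype)

hidden = torch.randn(2, 8, dtype=torch.bfloat16)
print(rms_norm_reference(hidden, torch.ones(8)).dtype)  # torch.bfloat16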
return psutil.virtual_memory().available - + + @staticmethod def total_host_memory() -> float: # Return the host memory usage in bytes. return psutil.virtual_memory().total def get_summary_string(self): - if getattr(self, 'final_device_memory', None) is None or getattr(self, 'final_host_memory', None) is None: - raise RuntimeError("HabanaMemoryProfiler.get_summary_string() can only be called after closing context manager") - return (f"{format_bytes(self.consumed_device_memory)} of device memory ({format_bytes(self.final_device_memory)}/{format_bytes(HabanaMemoryProfiler.total_device_memory())} used) and " - f"{format_bytes(self.consumed_host_memory)} of host memory ({format_bytes(self.final_host_memory)}/{format_bytes(HabanaMemoryProfiler.total_host_memory())} used)") + if getattr(self, 'final_device_memory', None) is None or getattr( + self, 'final_host_memory', None) is None: + raise RuntimeError( + "HabanaMemoryProfiler.get_summary_string() can only be called after closing context manager" + ) + return ( + f"{format_bytes(self.consumed_device_memory)} of device memory ({format_bytes(self.final_device_memory)}/{format_bytes(HabanaMemoryProfiler.total_device_memory())} used) and " + f"{format_bytes(self.consumed_host_memory)} of host memory ({format_bytes(self.final_host_memory)}/{format_bytes(HabanaMemoryProfiler.total_host_memory())} used)" + ) def __enter__(self): # Force garbage collection gc.collect() - self.initial_device_memory = HabanaMemoryProfiler.current_device_memory_usage() - self.initial_host_memory = HabanaMemoryProfiler.current_host_memory_usage() + self.initial_device_memory = HabanaMemoryProfiler.current_device_memory_usage( + ) + self.initial_host_memory = HabanaMemoryProfiler.current_host_memory_usage( + ) # This allows us to call methods of the context manager if needed return self def __exit__(self, exc_type, exc_val, exc_tb): # Force garbage collection gc.collect() - self.final_device_memory = HabanaMemoryProfiler.current_device_memory_usage() - self.final_host_memory = HabanaMemoryProfiler.current_host_memory_usage() + self.final_device_memory = HabanaMemoryProfiler.current_device_memory_usage( + ) + self.final_host_memory = HabanaMemoryProfiler.current_host_memory_usage( + ) self.consumed_device_memory = self.final_device_memory - self.initial_device_memory self.consumed_host_memory = self.final_host_memory - self.initial_host_memory - # Adapted from https://stackoverflow.com/a/49361727 @@ -663,7 +674,7 @@ def format_bytes(size): # 2**10 = 1024 power = 2**10 n = 0 - power_labels = {0 : '', 1: 'Ki', 2: 'Mi', 3: 'Gi', 4: 'Ti'} + power_labels = {0: '', 1: 'Ki', 2: 'Mi', 3: 'Gi', 4: 'Ti'} while abs(size) > power: size /= power n += 1 diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index ea53c3e99d9fa..961b5689e43a4 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -76,11 +76,11 @@ def _allocate_kv_cache( for _ in range(self.num_layers): if device == 'hpu': key_cache = torch.zeros(kv_cache_shape, - dtype=self.dtype, - device=device) + dtype=self.dtype, + device=device) value_cache = torch.zeros(kv_cache_shape, - dtype=self.dtype, - device=device) + dtype=self.dtype, + device=device) kv_layer = (key_cache, value_cache) kv_cache.append(kv_layer) else: diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 21d34a3924c86..e8e7c35579b0f 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1,3 +1,5 @@ +# mypy: ignore-errors + 
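Elsewhere in this patch HabanaMemoryProfiler wraps cache initialization, model loading and graph warmup; a minimal usage sketch, assuming a Gaudi device and the habana_frameworks stack are available:

import habana_frameworks.torch as htorch  # noqa: F401  (registers the hpu device)
import torch

from vllm.utils import HabanaMemoryProfiler, format_bytes

with HabanaMemoryProfiler() as m:
    scratch = torch.zeros(1024, 1024, device="hpu")   # stand-in workload

print(m.get_summary_string())
print("device delta:", format_bytes(m.consumed_device_memory))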
############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### @@ -16,8 +18,9 @@ import habana_frameworks.torch as htorch import contextlib from vllm.attention import (AttentionMetadata, get_attn_backend) -from vllm.config import (DeviceConfig, LoadConfig, CacheConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, VisionLanguageConfig) +from vllm.config import (DeviceConfig, LoadConfig, CacheConfig, LoRAConfig, + ModelConfig, ParallelConfig, SchedulerConfig, + VisionLanguageConfig) from vllm.distributed import broadcast_tensor_dict from vllm.distributed.parallel_state import get_world_group from vllm.logger import init_logger @@ -47,7 +50,11 @@ # example env variable: VLLM_DECODE_BS_BUCKET_STEP=128 def read_bucket_settings(phase: str, dim: str, **defaults: Dict): params = ['min', 'step', 'max'] - values = [int(os.environ.get(f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper(), defaults[p])) for p in params] + values = [ + int( + os.environ.get(f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper(), + defaults[p])) for p in params + ] return values @@ -61,7 +68,8 @@ def warmup_range(config: Tuple[int, int, int]): def warmup_buckets(bs_bucket_config, seq_bucket_config): - buckets = itertools.product(warmup_range(bs_bucket_config), warmup_range(seq_bucket_config)) + buckets = itertools.product(warmup_range(bs_bucket_config), + warmup_range(seq_bucket_config)) return list(sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))) @@ -86,13 +94,17 @@ def find_bucket(value: int, config: Tuple[int, int, int]): return result -def subtuple(obj: object, typename: str, to_copy: List[str], to_override: Dict[str, object] = {}): +def subtuple(obj: object, + typename: str, + to_copy: List[str], + to_override: Dict[str, object] = {}): if obj is None: return None fields = set(to_copy) | set(to_override.keys()) values = {f: to_override.get(f, getattr(obj, f)) for f in fields} if typename not in _TYPE_CACHE: - _TYPE_CACHE[typename] = collections.namedtuple(typename, ' '.join(fields)) + _TYPE_CACHE[typename] = collections.namedtuple(typename, + ' '.join(fields)) return _TYPE_CACHE[typename](**values) @@ -107,39 +119,44 @@ def align_workers(value, op): class HpuModelAdapter(): + def __init__(self, model): self.model = model - def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): + def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, + dtype): prefill_metadata = attn_metadata if prefill_metadata is None: return attn_metadata seq_lens_t = prefill_metadata.seq_lens_tensor - len_mask = (torch.arange(0, seq_len, device=device, dtype=torch.int32) - .view(1, seq_len) - .ge(seq_lens_t.unsqueeze(-1)) - .view(batch_size, 1, 1, seq_len)) - causal_mask = torch.triu( - torch.ones((batch_size, 1, seq_len, seq_len), device=device, dtype=torch.bool), - diagonal=1 - ) + len_mask = (torch.arange(0, seq_len, device=device, + dtype=torch.int32).view(1, seq_len).ge( + seq_lens_t.unsqueeze(-1)).view( + batch_size, 1, 1, seq_len)) + causal_mask = torch.triu(torch.ones((batch_size, 1, seq_len, seq_len), + device=device, + dtype=torch.bool), + diagonal=1) mask = causal_mask.logical_or(len_mask) - attn_bias = (torch.zeros_like(mask, dtype=dtype) - .masked_fill_(mask, -math.inf)) + attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_( + mask, -math.inf)) #FIXME: Restore sliding window support #if self.sliding_window is not None: attn_metadata = 
prefill_metadata._replace(attn_bias=attn_bias) return attn_metadata - def forward(self, *args, **kwargs): kwargs = kwargs.copy() selected_token_indices = kwargs.pop('selected_token_indices') if 'bypass_hpu_graphs' in kwargs: - kwargs.pop('bypass_hpu_graphs') # required for PT eager + kwargs.pop('bypass_hpu_graphs') # required for PT eager input_ids = kwargs['input_ids'] - kwargs['attn_metadata'] = self._set_attn_bias(kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), input_ids.device, torch.bfloat16) + kwargs['attn_metadata'] = self._set_attn_bias(kwargs['attn_metadata'], + input_ids.size(0), + input_ids.size(1), + input_ids.device, + torch.bfloat16) hidden_states = self.model(*args, **kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) hidden_states = hidden_states.index_select(0, selected_token_indices) @@ -282,17 +299,20 @@ def load_model(self) -> None: vision_language_config=self.vision_language_config, parallel_config=self.parallel_config, scheduler_config=self.scheduler_config, - cache_config=self.cache_config - ) - logger.info(f"Pre-loading model weights on {next(self.model.parameters()).device} took {m_getmodel.get_summary_string()}") + cache_config=self.cache_config) + logger.info( + f"Pre-loading model weights on {next(self.model.parameters()).device} took {m_getmodel.get_summary_string()}" + ) # FIXME: Running with disable_tensor_cache=True causes RuntimeErrors. This needs to be debugged with HabanaMemoryProfiler() as m_wrap: self.model = _maybe_wrap_in_hpu_graph(self.model) - logger.info(f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}") - + logger.info( + f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}") + self.model_memory_usage = m.consumed_device_memory - logger.info(f"Loading model weights took in total {m.get_summary_string()}") + logger.info( + f"Loading model weights took in total {m.get_summary_string()}") if self.lora_config: assert hasattr(self.model, "supported_lora_modules" @@ -316,19 +336,47 @@ def _use_graphs(self, batch_size, seq_len, is_prompt): return (batch_size, seq_len, is_prompt) in self.graphed_buckets def _setup_buckets(self) -> None: - self.prompt_bs_bucket_cfg = read_bucket_settings('prompt', 'bs', min=1, step=32, max=min(self.max_num_seqs, 64)) - self.decode_bs_bucket_cfg = read_bucket_settings('decode', 'bs', min=1, step=128, max=self.max_num_seqs) - self.prompt_seq_bucket_cfg = read_bucket_settings('prompt', 'seq', min=self.block_size, step=self.block_size, max=1024) - self.decode_seq_bucket_cfg = read_bucket_settings('decode', 'seq', min=self.block_size, step=self.block_size, max=2048) + self.prompt_bs_bucket_cfg = read_bucket_settings('prompt', + 'bs', + min=1, + step=32, + max=min( + self.max_num_seqs, + 64)) + self.decode_bs_bucket_cfg = read_bucket_settings('decode', + 'bs', + min=1, + step=128, + max=self.max_num_seqs) + self.prompt_seq_bucket_cfg = read_bucket_settings('prompt', + 'seq', + min=self.block_size, + step=self.block_size, + max=1024) + self.decode_seq_bucket_cfg = read_bucket_settings('decode', + 'seq', + min=self.block_size, + step=self.block_size, + max=2048) self.graphed_buckets = set() - logger.info(f"Prompt bucket config (min, step, max_warmup) bs:{self.prompt_bs_bucket_cfg}, seq:{self.prompt_seq_bucket_cfg}") - self.prompt_buckets = warmup_buckets(self.prompt_bs_bucket_cfg, self.prompt_seq_bucket_cfg) - logger.info(f"Generated {len(self.prompt_buckets)} prompt buckets: {list(sorted(self.prompt_buckets))}") + logger.info( + f"Prompt bucket config (min, step, max_warmup) 
bs:{self.prompt_bs_bucket_cfg}, seq:{self.prompt_seq_bucket_cfg}" + ) + self.prompt_buckets = warmup_buckets(self.prompt_bs_bucket_cfg, + self.prompt_seq_bucket_cfg) + logger.info( + f"Generated {len(self.prompt_buckets)} prompt buckets: {list(sorted(self.prompt_buckets))}" + ) - logger.info(f"Decode bucket config (min, step, max_warmup) bs:{self.decode_bs_bucket_cfg}, seq:{self.decode_seq_bucket_cfg}") - self.decode_buckets = warmup_buckets(self.decode_bs_bucket_cfg, self.decode_seq_bucket_cfg) - logger.info(f"Generated {len(self.decode_buckets)} decode buckets: {list(sorted(self.decode_buckets))}") + logger.info( + f"Decode bucket config (min, step, max_warmup) bs:{self.decode_bs_bucket_cfg}, seq:{self.decode_seq_bucket_cfg}" + ) + self.decode_buckets = warmup_buckets(self.decode_bs_bucket_cfg, + self.decode_seq_bucket_cfg) + logger.info( + f"Generated {len(self.decode_buckets)} decode buckets: {list(sorted(self.decode_buckets))}" + ) def _prepare_prompt( self, @@ -451,7 +499,7 @@ def _prepare_prompt( max_query_len = max(query_lens) sum_query_len = sum(query_lens) - real_num_seqs = len(query_lens) + real_num_seqs = len(query_lens) assert max_query_len > 0 context_lens_tensor = torch.tensor(context_lens, @@ -468,7 +516,9 @@ def _prepare_prompt( multi_modal_input = None max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) - max_prompt_len = max(find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), self.block_size) + max_prompt_len = max( + find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), + self.block_size) input_tokens = make_tensor_with_pad(input_tokens, max_prompt_len, @@ -521,7 +571,7 @@ def _prepare_prompt( use_cuda_graph=False, num_prefills=real_num_seqs, num_prefill_tokens=sum_query_len, - num_decode_tokens=0, + num_decode_tokens=0, slot_mapping=slot_mapping, ) return PreparePromptMetadata( @@ -685,7 +735,9 @@ def prepare_input_tensors( num_decode_tokens = len(decode_input_tokens) # NOTE(kzawora): Here we diverge from GPU code - we don't support mixed batches, so we either use decode or prefill inputs, without coalescing. - assert (num_prefills == 0 and num_decode_tokens > 0) or (num_prefills > 0 and num_decode_tokens == 0), "HPU does not support mixed batches!" + assert (num_prefills == 0 and num_decode_tokens > 0) or ( + num_prefills > 0 and num_decode_tokens + == 0), "HPU does not support mixed batches!" 
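read_bucket_settings derives the environment-variable names from the phase, dimension and parameter; the sketch below lists what it consults for the decode batch-size bucket, using the defaults from _setup_buckets (a max_num_seqs of 256 is assumed here, and the printed values hold only when the variables are unset):

import os

phase, dim = 'decode', 'bs'
defaults = {'min': 1, 'step': 128, 'max': 256}        # max = max_num_seqs (assumed)

for p in ('min', 'step', 'max'):
    name = f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper()
    print(name, '->', int(os.environ.get(name, defaults[p])))
# VLLM_DECODE_BS_BUCKET_MIN -> 1
# VLLM_DECODE_BS_BUCKET_STEP -> 128
# VLLM_DECODE_BS_BUCKET_MAX -> 256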
if num_decode_tokens > 0: input_tokens = decode_input_tokens input_positions = decode_input_positions @@ -694,12 +746,15 @@ def prepare_input_tensors( lora_prompt_mapping = decode_lora_prompt_mapping lora_requests = decode_lora_requests - # FIXME: We need to adjust selected_token_indices to accomodate for padding + # FIXME: We need to adjust selected_token_indices to accommodate for padding max_len = input_tokens.size(1) paddings = [max_len - s for s in seq_lens] paddings = [0] + paddings[:-1] paddings = list(itertools.accumulate(paddings)) - paddings = torch.tensor(paddings, dtype=sampling_metadata.selected_token_indices.dtype, device=sampling_metadata.selected_token_indices.device) + paddings = torch.tensor( + paddings, + dtype=sampling_metadata.selected_token_indices.dtype, + device=sampling_metadata.selected_token_indices.device) sampling_metadata.selected_token_indices.add_(paddings) if self.lora_config: @@ -713,7 +768,8 @@ def prepare_input_tensors( if (prefill_attn_metadata is not None and decode_attn_metadata is not None): batch_type = BatchType.MIXED - raise NotImplementedError("Mixed batch is not supported on HPU") + raise NotImplementedError( + "Mixed batch is not supported on HPU") elif prefill_attn_metadata is not None: batch_type = BatchType.PREFILL else: @@ -782,15 +838,15 @@ def prepare_input_tensors( **metadata_dict) attn_metadata = prefill_attn_metadata if prefill_attn_metadata is not None else decode_attn_metadata -# attn_metadata = AttentionMetadata( -# num_prefills=num_prefills, -# slot_mapping=slot_mapping, -# num_prefill_tokens=num_prefill_tokens, -# num_decode_tokens=num_decode_tokens, -# prefill_metadata=prefill_attn_metadata, -# decode_metadata=decode_attn_metadata, -# kv_cache_dtype=self.kv_cache_dtype, -# ) + # attn_metadata = AttentionMetadata( + # num_prefills=num_prefills, + # slot_mapping=slot_mapping, + # num_prefill_tokens=num_prefill_tokens, + # num_decode_tokens=num_decode_tokens, + # prefill_metadata=prefill_attn_metadata, + # decode_metadata=decode_attn_metadata, + # kv_cache_dtype=self.kv_cache_dtype, + # ) return (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping, @@ -807,29 +863,26 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: # Trimming metadata is required when using HPUGraphs. # Attention metadata is going to be hashed by PT bridge, and # appropriate HPUGraphs will be matched based on all inputs' hash. - - # Before you put more keys in here, make sure you know their - # value type and make sure you know how it's going to be hashed. - # You can find that information in input_hash function + + # Before you put more keys in here, make sure you know their + # value type and make sure you know how it's going to be hashed. + # You can find that information in input_hash function # in habana_frameworks/torch/hpu/graphs.py. You can also hash # it manually with torch.hpu.graphs.input_hash(attention_metadata) - + # If you use primitive types here - they will get hashed based # on their value. You *will* get lots of excessive graph captures # (and an OOM eventually) if you decide to put something like - # seq_len int here. - # If you absolutely need a scalar, put it in a tensor. Tensors + # seq_len int here. + # If you absolutely need a scalar, put it in a tensor. 
Tensors # get hashed using their metadata, not their values: # input_hash(torch.tensor(123)) == input_hash(torch.tensor(321)) # input_hash(123) != input_hash(321) # input_hash("abc") != input_hash("cba") - attention_metadata = subtuple(metadata, - 'TrimmedAttentionMetadata', - ['block_tables', - 'seq_lens_tensor', - 'attn_bias', - 'slot_mapping', - 'is_prompt']) + attention_metadata = subtuple(metadata, 'TrimmedAttentionMetadata', [ + 'block_tables', 'seq_lens_tensor', 'attn_bias', 'slot_mapping', + 'is_prompt' + ]) return attention_metadata @torch.inference_mode() @@ -849,11 +902,12 @@ def execute_model( batch_size_padded = find_bucket(real_batch_size, bucket_cfg) batch_size_padding = batch_size_padded - real_batch_size seq_group_metadata_list = seq_group_metadata_list.copy() - seq_group_metadata_list.extend(seq_group_metadata_list[0] for _ in range(batch_size_padding)) + seq_group_metadata_list.extend(seq_group_metadata_list[0] + for _ in range(batch_size_padding)) with self.profiler.record_event('internal', 'prepare_input_tensors'): (input_tokens, input_positions, attn_metadata, sampling_metadata, - lora_requests, lora_mapping, multi_modal_input - ) = self.prepare_input_tensors(seq_group_metadata_list) + lora_requests, lora_mapping, multi_modal_input + ) = self.prepare_input_tensors(seq_group_metadata_list) is_prompt = attn_metadata.is_prompt if self.lora_config: @@ -877,12 +931,20 @@ def execute_model( else: model_event_name = 'model_executable' with self.profiler.record_event('internal', model_event_name): - hidden_states = self.model.forward(**execute_model_kwargs, selected_token_indices=sampling_metadata.selected_token_indices, bypass_hpu_graphs=not use_graphs) + hidden_states = self.model.forward( + **execute_model_kwargs, + selected_token_indices=sampling_metadata. + selected_token_indices, + bypass_hpu_graphs=not use_graphs) # Compute the logits. - with self.profiler.record_event('internal', f'compute_logits_{"prompt" if is_prompt else "decode"}_bs{batch_size}_seq{seq_len}'): + with self.profiler.record_event( + 'internal', + f'compute_logits_{"prompt" if is_prompt else "decode"}_bs{batch_size}_seq{seq_len}' + ): sampling_metadata.selected_token_indices = None - logits = self.model.compute_logits(hidden_states, sampling_metadata) + logits = self.model.compute_logits(hidden_states, + sampling_metadata) htorch.core.mark_step() # Only perform sampling in the driver worker. @@ -890,7 +952,10 @@ def execute_model( return None # Sample the next token. 
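The cumulative-padding adjustment of selected_token_indices in prepare_input_tensors above is clearer with numbers plugged in. A toy example with three prompts, assuming the selected indices point at each prompt's last token in the unpadded, flattened layout:

import itertools
import torch

seq_lens = [3, 5, 2]        # toy prompt lengths
max_len = 6                 # padded length per sequence

# Same arithmetic as prepare_input_tensors: sequence i is shifted by the total
# padding inserted by the sequences before it.
paddings = [max_len - s for s in seq_lens]        # [3, 1, 4]
paddings = [0] + paddings[:-1]                    # [0, 3, 1]
paddings = list(itertools.accumulate(paddings))   # [0, 3, 4]

selected_token_indices = torch.tensor([2, 7, 9])  # last tokens, unpadded layout
print((selected_token_indices + torch.tensor(paddings)).tolist())
# [2, 10, 13] -> last tokens in the padded (3 x 6) layout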
- with self.profiler.record_event('internal', f'sample_{"prompt" if is_prompt else "decode"}_bs{batch_size}_seq{seq_len}'): + with self.profiler.record_event( + 'internal', + f'sample_{"prompt" if is_prompt else "decode"}_bs{batch_size}_seq{seq_len}' + ): output = self.model.sample( logits=logits, sampling_metadata=sampling_metadata, @@ -903,12 +968,12 @@ def execute_model( self.profiler.end() event_end = self.profiler.get_timestamp_us() counters = self.profiler_counter_helper.get_counter_dict( - cache_config=self.cache_config, - duration=event_end-event_start, - seq_len=seq_len, - batch_size_padded=batch_size_padded, - real_batch_size=real_batch_size, - seq_group_metadata_list=seq_group_metadata_list, + cache_config=self.cache_config, + duration=event_end - event_start, + seq_len=seq_len, + batch_size_padded=batch_size_padded, + real_batch_size=real_batch_size, + seq_group_metadata_list=seq_group_metadata_list, is_prompt=is_prompt) self.profiler.record_counter(event_start, counters) @@ -945,12 +1010,16 @@ def profile_run(self) -> None: self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches) - def warmup_scenario(self, batch_size, seq_len, is_prompt, kv_caches) -> None: + def warmup_scenario(self, batch_size, seq_len, is_prompt, + kv_caches) -> None: use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) scenario_name = f"warmup_{'prompt' if is_prompt else 'decode'}_bs{batch_size}_seq{seq_len}_graphs{'T' if use_graphs else 'F'}" self.profiler.start('internal', scenario_name) times = 3 if use_graphs else 1 - seqs = [self.create_dummy_seq_group_metadata(i, seq_len, is_prompt) for i in range(batch_size)] + seqs = [ + self.create_dummy_seq_group_metadata(i, seq_len, is_prompt) + for i in range(batch_size) + ] torch.hpu.synchronize() for _ in range(times): self.execute_model(seqs, kv_caches) @@ -959,16 +1028,22 @@ def warmup_scenario(self, batch_size, seq_len, is_prompt, kv_caches) -> None: gc.collect() def log_warmup(self, phase, i, max_i, batch_size, seq_len): - free_mem = format_bytes(HabanaMemoryProfiler.current_free_device_memory()) - logger.info(f"[Warmup][{phase}][{i+1}/{max_i}] batch_size:{batch_size} seq_len:{seq_len} free_mem:{free_mem}") + free_mem = format_bytes( + HabanaMemoryProfiler.current_free_device_memory()) + logger.info( + f"[Warmup][{phase}][{i+1}/{max_i}] batch_size:{batch_size} seq_len:{seq_len} free_mem:{free_mem}" + ) def warmup_all_buckets(self, buckets, is_prompt, kv_caches): for i, (batch_size, seq_len) in enumerate(reversed(buckets)): - mem_usage = 100.0 * HabanaMemoryProfiler.current_device_memory_usage() / HabanaMemoryProfiler.total_device_memory() - self.log_warmup('Prompt' if is_prompt else 'Decode', i, len(buckets), batch_size, seq_len) + mem_usage = 100.0 * HabanaMemoryProfiler.current_device_memory_usage( + ) / HabanaMemoryProfiler.total_device_memory() + self.log_warmup('Prompt' if is_prompt else 'Decode', i, + len(buckets), batch_size, seq_len) self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) - def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, available_mem): + def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, + available_mem): total_batch_seq = 0.001 total_mem = 0 idx = 0 @@ -980,7 +1055,8 @@ def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, available_mem): elif strategy == 'max_bs': ordering = lambda b: (-b[0], b[1]) else: - raise NotImplementedError(f'Unsupported graph allocation strategy: {strategy}') + raise NotImplementedError( + f'Unsupported graph allocation strategy: 
{strategy}') buckets = list(sorted(buckets, key=ordering)) for idx, (batch_size, seq_len) in enumerate(buckets): @@ -993,12 +1069,16 @@ def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, available_mem): self.log_warmup(phase, idx, num_candidates, batch_size, seq_len) with HabanaMemoryProfiler() as mem_prof: self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) - used_mem = align_workers(mem_prof.consumed_device_memory, torch.distributed.ReduceOp.MAX) + used_mem = align_workers(mem_prof.consumed_device_memory, + torch.distributed.ReduceOp.MAX) available_mem -= used_mem total_mem += used_mem total_batch_seq += batch_seq - graphed = list(c[:2] for c in self.graphed_buckets if c[2] == is_prompt) - logger.info(f'{phase} captured:{len(graphed)} ({100 * len(graphed) / num_candidates:.1f}%) used_mem:{format_bytes(total_mem)} buckets:{sorted(list(graphed))}') + graphed = list(c[:2] for c in self.graphed_buckets + if c[2] == is_prompt) + logger.info( + f'{phase} captured:{len(graphed)} ({100 * len(graphed) / num_candidates:.1f}%) used_mem:{format_bytes(total_mem)} buckets:{sorted(list(graphed))}' + ) @torch.inference_mode() def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: @@ -1012,49 +1092,68 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: self.warmup_all_buckets(self.decode_buckets, False, kv_caches) if not self.enforce_eager: - mem_margin = 1.0 - float(os.environ.get('VLLM_GRAPH_MEM_MARGIN', '0.02')) - free_mem = mem_margin * HabanaMemoryProfiler.current_free_device_memory() + mem_margin = 1.0 - float( + os.environ.get('VLLM_GRAPH_MEM_MARGIN', '0.02')) + free_mem = mem_margin * HabanaMemoryProfiler.current_free_device_memory( + ) free_mem = align_workers(free_mem, torch.distributed.ReduceOp.MIN) - prompt_graph_mem_ratio = float(os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.5')) + prompt_graph_mem_ratio = float( + os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.5')) prompt_available_memory = prompt_graph_mem_ratio * free_mem decode_available_memory = free_mem - prompt_available_memory prompt_strategy = 'min_tokens' - decode_strategy = os.environ.get('VLLM_GRAPH_DECODE_STRATEGY', 'max_bs') - self.warmup_graphs(prompt_strategy, self.prompt_buckets, True, kv_caches, prompt_available_memory) - self.warmup_graphs(decode_strategy, self.decode_buckets, False, kv_caches, decode_available_memory) + decode_strategy = os.environ.get('VLLM_GRAPH_DECODE_STRATEGY', + 'max_bs') + self.warmup_graphs(prompt_strategy, self.prompt_buckets, True, + kv_caches, prompt_available_memory) + self.warmup_graphs(decode_strategy, self.decode_buckets, False, + kv_caches, decode_available_memory) end_time = time.perf_counter() end_mem = HabanaMemoryProfiler.current_device_memory_usage() elapsed_time = end_time - start_time - logger.info(f"Warmup finished in {elapsed_time:.0f} secs, allocated {format_bytes(end_mem - start_mem)} of device memory") + logger.info( + f"Warmup finished in {elapsed_time:.0f} secs, allocated {format_bytes(end_mem - start_mem)} of device memory" + ) self.profiler.end() @property def vocab_size(self) -> int: return self.model_config.get_vocab_size() + def _maybe_wrap_in_hpu_graph(model): - return htorch.hpu.wrap_in_hpu_graph(HpuModelAdapter(model)) if htorch.utils.internal.is_lazy() else HpuModelAdapter(model) + return htorch.hpu.wrap_in_hpu_graph(HpuModelAdapter( + model)) if htorch.utils.internal.is_lazy() else HpuModelAdapter(model) class HabanaProfilerCounterHelper(): + def __init__(self): self.niter = 0 self.average_real_throughput = None 
self.logged_once = False - - def get_counter_dict(self, cache_config, duration, seq_len, batch_size_padded, real_batch_size, seq_group_metadata_list, is_prompt): + + def get_counter_dict(self, cache_config, duration, seq_len, + batch_size_padded, real_batch_size, + seq_group_metadata_list, is_prompt): throughput = batch_size_padded / (duration / 1e6) throughput_effective = real_batch_size / (duration / 1e6) - real_seq_lens = [len(seq_data.prompt_token_ids) + len(seq_data.output_token_ids) for seq_group_metadata in seq_group_metadata_list for seq_data in seq_group_metadata.seq_data.values()] + real_seq_lens = [ + len(seq_data.prompt_token_ids) + len(seq_data.output_token_ids) + for seq_group_metadata in seq_group_metadata_list + for seq_data in seq_group_metadata.seq_data.values() + ] real_max_seq_len = max(real_seq_lens) real_num_tokens = sum(real_seq_lens) padded_num_tokens = batch_size_padded * seq_len batch_token_utilization = real_num_tokens / padded_num_tokens if self.average_real_throughput is None: self.average_real_throughput = throughput_effective - else: # https://www.heikohoffmann.de/htmlthesis/node134.html - self.average_real_throughput = self.average_real_throughput + 1/(self.niter+1) * (throughput_effective-self.average_real_throughput) + else: # https://www.heikohoffmann.de/htmlthesis/node134.html + self.average_real_throughput = self.average_real_throughput + 1 / ( + self.niter + 1) * (throughput_effective - + self.average_real_throughput) phase = "prompt" if is_prompt else "decode" counters = { f'{phase}_bucket_batch_size': batch_size_padded, @@ -1067,30 +1166,41 @@ def get_counter_dict(self, cache_config, duration, seq_len, batch_size_padded, r 'average_real_throughput': self.average_real_throughput, 'engine_iteration': self.niter, } - self.niter += 1 + self.niter += 1 if is_prompt: - prompt_seq_lens = [len(seq_data.prompt_token_ids) for seq_group_metadata in seq_group_metadata_list for seq_data in seq_group_metadata.seq_data.values()] - prompt_bucket_in_throughput = (seq_len*batch_size_padded) / (duration / 1e6) - prompt_real_in_throughput = sum(prompt_seq_lens) / (duration / 1e6) - counters[f'{phase}_bucket_in_throughput'] = prompt_bucket_in_throughput + prompt_seq_lens = [ + len(seq_data.prompt_token_ids) + for seq_group_metadata in seq_group_metadata_list + for seq_data in seq_group_metadata.seq_data.values() + ] + prompt_bucket_in_throughput = (seq_len * batch_size_padded) / ( + duration / 1e6) + prompt_real_in_throughput = sum(prompt_seq_lens) / (duration / 1e6) + counters[ + f'{phase}_bucket_in_throughput'] = prompt_bucket_in_throughput counters[f'{phase}_real_in_throughput'] = prompt_real_in_throughput # KV cache might not be created yet (e.g. 
for profiling run) if cache_config.num_gpu_blocks is not None and cache_config.num_gpu_blocks != 0: - cache_num_blocks_used = [math.ceil(sl/cache_config.block_size) for sl in real_seq_lens] + cache_num_blocks_used = [ + math.ceil(sl / cache_config.block_size) for sl in real_seq_lens + ] cache_total_num_blocks_used = sum(cache_num_blocks_used) - num_cache_blocks = cache_config.num_gpu_blocks + num_cache_blocks = cache_config.num_gpu_blocks cache_total_num_free_blocks = num_cache_blocks - cache_total_num_blocks_used cache_computed_utilization = cache_total_num_blocks_used / num_cache_blocks - max_blocks_per_seq = math.ceil(seq_len/cache_config.block_size) - batch_block_utilization = cache_total_num_blocks_used / (batch_size_padded * max_blocks_per_seq) + max_blocks_per_seq = math.ceil(seq_len / cache_config.block_size) + batch_block_utilization = cache_total_num_blocks_used / ( + batch_size_padded * max_blocks_per_seq) counters['cache_num_blocks_used'] = cache_total_num_blocks_used counters['cache_num_free_blocks'] = cache_total_num_free_blocks counters['cache_computed_utilization'] = cache_computed_utilization - counters[f'{phase}_batch_block_utilization'] = batch_block_utilization + counters[ + f'{phase}_batch_block_utilization'] = batch_block_utilization if not self.logged_once: counters['const_cache_num_blocks'] = cache_config.num_gpu_blocks - counters['const_gpu_memory_utilization'] = cache_config.gpu_memory_utilization + counters[ + 'const_gpu_memory_utilization'] = cache_config.gpu_memory_utilization counters['const_block_size'] = cache_config.block_size self.logged_once = True return counters diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index e1c374124633f..8b53615805291 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -72,15 +72,16 @@ def __init__( "To be tested: vision language model with LoRA settings.") assert False, "To be tested: vision language model on HPU" - self.model_runner = HabanaModelRunner(model_config, - parallel_config, - scheduler_config, - device_config, - load_config=load_config, - cache_config=cache_config, - lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=is_driver_worker) + self.model_runner = HabanaModelRunner( + model_config, + parallel_config, + scheduler_config, + device_config, + load_config=load_config, + cache_config=cache_config, + lora_config=self.lora_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by # initialize_cache. 
self.cache_engine: CacheEngine @@ -129,14 +130,21 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: free_hpu_memory = torch.hpu.mem_get_info()[0] cache_block_size = self.get_cache_block_size_bytes() - graph_headroom = 1 - (float(os.environ.get('VLLM_GRAPH_RESERVED_MEM', '0.4')) if not self.model_config.enforce_eager else 0) - num_hpu_blocks = int(free_hpu_memory * graph_headroom * self.cache_config.gpu_memory_utilization // cache_block_size) + graph_headroom = 1 - (float( + os.environ.get('VLLM_GRAPH_RESERVED_MEM', '0.4')) + if not self.model_config.enforce_eager else 0) + num_hpu_blocks = int(free_hpu_memory * graph_headroom * + self.cache_config.gpu_memory_utilization // + cache_block_size) num_cpu_blocks = int(self.cache_config.swap_space_bytes // cache_block_size) num_hpu_blocks = max(num_hpu_blocks, 0) num_cpu_blocks = max(num_cpu_blocks, 0) - if self.model_runner.lora_manager: - self.model_runner.remove_all_loras() + + # NOTE(kzawora): Restore this once LoRA support is added + # if self.model_runner.lora_manager: + # self.model_runner.remove_all_loras() + gc.collect() return num_hpu_blocks, num_cpu_blocks @@ -159,9 +167,11 @@ def initialize_cache(self, num_gpu_blocks: int, def _init_cache_engine(self) -> None: assert self.cache_config.num_gpu_blocks is not None self.cache_engine = CacheEngine(self.cache_config, self.model_config, - self.parallel_config, self.device_config) + self.parallel_config, + self.device_config) self.hpu_cache = self.cache_engine.gpu_cache - htorch.hpu.synchronize() # we want to materialize cache tensors before we proceed with graph capture/execution + htorch.hpu.synchronize( + ) # we want to materialize cache tensors before we proceed with graph capture/execution def _warm_up_model(self) -> None: self.model_runner.warmup_model(self.hpu_cache) @@ -260,7 +270,6 @@ def _execute_model_non_driver(self) -> bool: self.model_runner.execute_model(None, self.hpu_cache) return True - def add_lora(self, lora_request: LoRARequest) -> bool: raise NotImplementedError("LoRA is not implemented for HPU backend.") @@ -296,8 +305,11 @@ def init_worker_distributed_environment( local_rank: int = -1, ) -> None: """Initialize the distributed environment.""" - init_distributed_environment(parallel_config.world_size, rank, - distributed_init_method, local_rank, backend='hccl') + init_distributed_environment(parallel_config.world_size, + rank, + distributed_init_method, + local_rank, + backend='hccl') ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) diff --git a/vllm/worker/profiler.py b/vllm/worker/profiler.py index 34221d2553909..9e181e27bde34 100644 --- a/vllm/worker/profiler.py +++ b/vllm/worker/profiler.py @@ -11,11 +11,13 @@ from vllm.logger import init_logger from vllm.utils import get_vllm_instance_id +from typing import List, Any logger = init_logger(__name__) class FileWriter(threading.Thread): + def __init__(self, filename, event_queue): super().__init__() self.filename = filename @@ -46,11 +48,11 @@ def run(self): class Profiler: - profiling_trace_events = queue.Queue() + profiling_trace_events: queue.Queue = queue.Queue() event_tid = {'counter': 1, 'external': 2, 'internal': 3} vllm_instance_id = get_vllm_instance_id() filename = f'server_events_{vllm_instance_id}.json' - event_cache = [] + event_cache: List[Any] = [] def __init__(self): self.enabled = os.getenv('VLLM_PROFILER_ENABLED', From 23a9797240304770d66528b74814cbdd780b1512 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 
18:15:15 +0300 Subject: [PATCH 070/341] ruff --fix --- vllm/attention/backends/habana_attn.py | 2 +- vllm/executor/habana_executor.py | 2 +- vllm/executor/ray_utils.py | 3 +-- vllm/hpu/cache_ops.py | 1 - vllm/hpu/ops.py | 3 +-- vllm/utils.py | 1 - vllm/worker/cache_engine.py | 2 +- vllm/worker/habana_model_runner.py | 1 - 8 files changed, 5 insertions(+), 10 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 09e717f61ac74..2fdb3d4f9c382 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -202,7 +202,7 @@ def forward( # Prompt run. if kv_cache is None or attn_metadata.block_tables.numel() == 0: # TODO: move this outside of model - assert attn_metadata.attn_bias is not None, 'attn_bias must be set before calling model.forward!' + assert attn_metadata.attn_bias is not None, 'attn_bias must be set before calling model.forward!' attn_bias = attn_metadata.attn_bias if self.alibi_slopes is not None and self.position_bias is not None: attn_bias.add_(self.position_bias[:, :, diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index a040e187eb0da..16245f70ec4d5 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -8,7 +8,7 @@ from vllm.lora.request import LoRARequest from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, - make_async, HabanaMemoryProfiler, format_bytes) + make_async, HabanaMemoryProfiler) import os import contextlib from vllm.worker.worker_base import WorkerWrapperBase diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 176b95b720615..2284012ecc7a0 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -78,8 +78,7 @@ def initialize_ray_cluster( else: ray.init(address=ray_address, ignore_reinit_error=True, - log_to_driver=not os.environ.get( - 'VLLM_RAY_DISABLE_LOG_TO_DRIVER', '0') != '0') + log_to_driver=os.environ.get("VLLM_RAY_DISABLE_LOG_TO_DRIVER", "0") == "0") ray_accel_name = "HPU" if is_hpu() else "GPU" if parallel_config.placement_group: diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index 6457ad3c460f3..6f060b8280ea6 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -5,7 +5,6 @@ # LICENSE file in the root directory of this source tree. 
############################################################################### -from typing import Tuple import torch import habana_frameworks.torch as htorch diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 798bee09fda4f..875a54338913a 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -6,10 +6,9 @@ ############################################################################### import os import torch -import torch.nn as nn import torch.nn.functional as F import habana_frameworks.torch as htorch -from typing import List, Optional, Tuple +from typing import Optional import vllm.hpu.utils as hpu_utils diff --git a/vllm/utils.py b/vllm/utils.py index 520332110fd1f..72260a329f617 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -11,7 +11,6 @@ import threading import uuid import warnings -import importlib from collections import defaultdict from functools import lru_cache, partial, wraps from platform import uname diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 961b5689e43a4..2e6c374b1d51f 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -9,7 +9,7 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_pin_memory_available, is_hpu) if is_hpu(): - import habana_frameworks.torch as htorch + pass logger = init_logger(__name__) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index e8e7c35579b0f..c8232c0cba407 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -16,7 +16,6 @@ import operator import torch import habana_frameworks.torch as htorch -import contextlib from vllm.attention import (AttentionMetadata, get_attn_backend) from vllm.config import (DeviceConfig, LoadConfig, CacheConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, From 22ee71540b3c7beb9d243f38b8ce1de1f861655a Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 18:58:45 +0300 Subject: [PATCH 071/341] line lengths fixes --- setup.py | 4 +++- vllm/attention/backends/habana_attn.py | 6 +++-- vllm/engine/async_llm_engine.py | 3 ++- vllm/executor/habana_executor.py | 32 ++++++++++++++----------- vllm/executor/ray_utils.py | 23 +++++++++--------- vllm/hpu/rotary_embed.py | 30 ++++++++++++++--------- vllm/model_executor/layers/layernorm.py | 3 ++- vllm/utils.py | 30 +++++++++++++---------- vllm/worker/habana_worker.py | 11 +++++---- vllm/worker/profiler.py | 7 +++--- 10 files changed, 87 insertions(+), 62 deletions(-) diff --git a/setup.py b/setup.py index ddf1cdf034c1b..897958d875284 100644 --- a/setup.py +++ b/setup.py @@ -207,7 +207,9 @@ def build_extensions(self) -> None: def _is_hpu() -> bool: is_hpu_available = True - return is_hpu_available # FIXME(kzawora): HPU autodetection sporadically fails on certain clients. Find the cause and fix it. + # FIXME(kzawora): HPU autodetection sporadically fails on certain clients. + # Need to find the cause and fix it. + return is_hpu_available try: subprocess.run(["hl-smi"], capture_output=True, check=True) except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 2fdb3d4f9c382..4f34aed1c90f2 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -202,9 +202,11 @@ def forward( # Prompt run. 
if kv_cache is None or attn_metadata.block_tables.numel() == 0: # TODO: move this outside of model - assert attn_metadata.attn_bias is not None, 'attn_bias must be set before calling model.forward!' + assert attn_metadata.attn_bias is not None, \ + 'attn_bias must be set before calling model.forward!' attn_bias = attn_metadata.attn_bias - if self.alibi_slopes is not None and self.position_bias is not None: + if self.alibi_slopes is not None and \ + self.position_bias is not None: attn_bias.add_(self.position_bias[:, :, -attn_bias.size(2):, -attn_bias.size(3):]) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 55e0d337b4235..b85087739c7ee 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -396,7 +396,8 @@ def from_engine_args( elif engine_config.device_config.device_type == "hpu": if distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) - from vllm.executor.ray_habana_executor import RayHabanaExecutorAsync + from vllm.executor.ray_habana_executor import ( + RayHabanaExecutorAsync) executor_class = RayHabanaExecutorAsync else: from vllm.executor.habana_executor import HabanaExecutorAsync diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index 16245f70ec4d5..06a214918f931 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -83,16 +83,16 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: with HabanaMemoryProfiler() as cache_init_m: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - logger.info( - f"init_cache_engine took {cache_init_m.get_summary_string()}") + msg = f"init_cache_engine took {cache_init_m.get_summary_string()}" + logger.info(msg) def execute_model( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! - # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none - # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any - # VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL - will log cpu fallbacks per engine step, always, even if there were none + # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! 
# noqa:E501 + # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501 + # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # noqa:E501 + # VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL - will log cpu fallbacks per engine step, always, even if there were none # noqa:E501 log_graph_compilation_all = os.environ.get( 'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0' log_graph_compilation = os.environ.get( @@ -117,25 +117,29 @@ def execute_model( ]) # whoa, that's some spicy stuff right here max_num_blocks = ( (max_context_len - 1) // self.cache_config.block_size) + 1 - input_stats = f'is_prompt: {is_prompt}, num_seqs: {len(seq_group_metadata_list)} max_context_len: {max_context_len}, max_num_blocks {max_num_blocks}' + input_stats = (f'is_prompt: {is_prompt}, ' + f'num_seqs: {len(seq_group_metadata_list)}, ' + f'max_context_len: {max_context_len}, ' + f'max_num_blocks {max_num_blocks}') gc_ctx = metric_localcontext( "graph_compilation" ) if log_graph_compilation else contextlib.nullcontext() cpu_fallback_ctx = metric_localcontext( "cpu_fallback" ) if log_cpu_fallbacks else contextlib.nullcontext() - with gc_ctx as gc_local_metric, cpu_fallback_ctx as cpu_fallback_local_metric: + with gc_ctx as gc_local_metric, \ + cpu_fallback_ctx as cpu_fallback_local_metric: output = self.driver_worker.execute_model(execute_model_req) if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0 ) or log_graph_compilation_all: - logger.warning( - f"VLLM_HPU_STEP_GRAPH_COMPILATION: {gc_local_metric.stats()}, {input_stats}" - ) + msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: " + f"{gc_local_metric.stats()}, {input_stats}") + logger.warning(msg) if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] > 0) or log_cpu_fallbacks_all: - logger.warning( - f"VLLM_HPU_STEP_CPU_FALLBACK: {cpu_fallback_local_metric.stats()}, {input_stats}" - ) + msg = ("VLLM_HPU_STEP_CPU_FALLBACK: " + f"{cpu_fallback_local_metric.stats()}, {input_stats}") + logger.warning(msg) return output diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 2284012ecc7a0..7961efdf4b516 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -78,8 +78,9 @@ def initialize_ray_cluster( else: ray.init(address=ray_address, ignore_reinit_error=True, - log_to_driver=os.environ.get("VLLM_RAY_DISABLE_LOG_TO_DRIVER", "0") == "0") - ray_accel_name = "HPU" if is_hpu() else "GPU" + log_to_driver=os.environ.get("VLLM_RAY_DISABLE_LOG_TO_DRIVER", + "0") == "0") + device = "HPU" if is_hpu() else "GPU" if parallel_config.placement_group: # Placement group is already set. @@ -93,27 +94,25 @@ def initialize_ray_cluster( # Verify that we can use the placement group. gpu_bundles = 0 for bundle in bundles: - bundle_gpus = bundle.get(ray_accel_name, 0) + bundle_gpus = bundle.get(device, 0) if bundle_gpus > 1: raise ValueError( - f"Placement group bundle cannot have more than 1 {ray_accel_name}." + f"Placement group bundle cannot have more than 1 {device}." 
) if bundle_gpus: gpu_bundles += 1 if parallel_config.world_size > gpu_bundles: raise ValueError( - f"The number of required {ray_accel_name}s exceeds the total number of " - f"available {ray_accel_name}s in the placement group.") + f"The number of required {device}s exceeds the total number of " + f"available {device}s in the placement group.") else: - num_gpus_in_cluster = ray.cluster_resources().get(ray_accel_name, 0) + num_gpus_in_cluster = ray.cluster_resources().get(device, 0) if parallel_config.world_size > num_gpus_in_cluster: raise ValueError( - f"The number of required {ray_accel_name}s exceeds the total number of " - f"available {ray_accel_name}s in the cluster.") + f"The number of required {device}s exceeds the total number of " + f"available {device}s in the cluster.") # Create a new placement group - placement_group_specs = ([{ - ray_accel_name: 1 - }] * parallel_config.world_size) + placement_group_specs = ([{device: 1}] * parallel_config.world_size) current_placement_group = ray.util.placement_group( placement_group_specs) # Wait until PG is ready - this will block until all diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py index 16c956acdf817..3c701df439535 100644 --- a/vllm/hpu/rotary_embed.py +++ b/vllm/hpu/rotary_embed.py @@ -29,7 +29,8 @@ def is_gaudi3(): # TODO: remove this workaround when FusedRoPE properly works on Gaudi if not is_gaudi1() and (is_gaudi2() or is_gaudi3()): try: - from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV1 as FusedRoPE + from habana_frameworks.torch.hpex.kernels import ( + RotaryPosEmbeddingHelperV1 as FusedRoPE) except ImportError: print("Not using HPU fused kernel for apply_rotary_pos_emb") FusedRoPE = None @@ -53,17 +54,23 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): cos (`torch.Tensor`): The cosine part of the rotary embedding. sin (`torch.Tensor`): The sine part of the rotary embedding. position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. + The position indices of the tokens corresponding to the query and + key tensors. For example, this can be used to pass offsetted + position ids when working with a KV-cache. unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + The 'unsqueeze_dim' argument specifies the dimension along which to + unsqueeze cos[position_ids] and sin[position_ids] so that they can + be properly broadcasted to the dimensions of q and k. For example, + note that cos[position_ids] and sin[position_ids] have the shape + [batch_size, seq_len, head_dim]. Then, if q and k have the shape + [batch_size, heads, seq_len, head_dim], then setting + unsqueeze_dim=1 makes cos[position_ids] and sin[position_ids] + broadcastable to the shapes of q and k. 
Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set + unsqueeze_dim=2. Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + `tuple(torch.Tensor)` comprising of the query and key tensors rotated + using the Rotary Position Embedding. """ cos = cos[position_ids] #.unsqueeze(unsqueeze_dim) sin = sin[position_ids] #.unsqueeze(unsqueeze_dim) @@ -103,7 +110,8 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): dtype=self.inv_freq.dtype) freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation + # Different from paper, but it uses a different permutation in order + # to obtain the same calculation emb = torch.cat((freqs, freqs), dim=-1) self.register_buffer("cos_cached", emb.cos().to(dtype), diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 975019bc9c24d..334c3d6c95c78 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -8,7 +8,8 @@ from vllm.utils import is_hpu if is_hpu(): try: - from habana_frameworks.torch.hpex.normalization import FusedRMSNorm as FusedRMSNorm + from habana_frameworks.torch.hpex.normalization import (FusedRMSNorm as + FusedRMSNorm) except ImportError: print("Not using HPU fused kernel for RMSNorm") FusedRMSNorm = None diff --git a/vllm/utils.py b/vllm/utils.py index 72260a329f617..53ced00dda766 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -640,32 +640,38 @@ def get_summary_string(self): if getattr(self, 'final_device_memory', None) is None or getattr( self, 'final_host_memory', None) is None: raise RuntimeError( - "HabanaMemoryProfiler.get_summary_string() can only be called after closing context manager" - ) + "HabanaMemoryProfiler.get_summary_string() can only be called " + "after closing context manager") return ( - f"{format_bytes(self.consumed_device_memory)} of device memory ({format_bytes(self.final_device_memory)}/{format_bytes(HabanaMemoryProfiler.total_device_memory())} used) and " - f"{format_bytes(self.consumed_host_memory)} of host memory ({format_bytes(self.final_host_memory)}/{format_bytes(HabanaMemoryProfiler.total_host_memory())} used)" - ) + f"{format_bytes(self.consumed_device_memory)} of device memory " + f"({format_bytes(self.final_device_memory)}/" + f"({format_bytes(HabanaMemoryProfiler.total_device_memory())} used)" + f" and {format_bytes(self.consumed_host_memory)} of host memory " + f"({format_bytes(self.final_host_memory)}/" + f"{format_bytes(HabanaMemoryProfiler.total_host_memory())} used)") def __enter__(self): # Force garbage collection gc.collect() - self.initial_device_memory = HabanaMemoryProfiler.current_device_memory_usage( - ) - self.initial_host_memory = HabanaMemoryProfiler.current_host_memory_usage( - ) + self.initial_device_memory = \ + HabanaMemoryProfiler.current_device_memory_usage() + self.initial_host_memory = \ + HabanaMemoryProfiler.current_host_memory_usage() # This allows us to call methods of the context manager if needed return self def __exit__(self, exc_type, exc_val, exc_tb): # Force garbage collection gc.collect() - self.final_device_memory = HabanaMemoryProfiler.current_device_memory_usage( + self.final_device_memory = \ + HabanaMemoryProfiler.current_device_memory_usage( ) self.final_host_memory = HabanaMemoryProfiler.current_host_memory_usage( ) - self.consumed_device_memory = self.final_device_memory - 
self.initial_device_memory - self.consumed_host_memory = self.final_host_memory - self.initial_host_memory + self.consumed_device_memory = \ + self.final_device_memory - self.initial_device_memory + self.consumed_host_memory = \ + self.final_host_memory - self.initial_host_memory # Adapted from https://stackoverflow.com/a/49361727 diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 8b53615805291..382962ce9ea71 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -70,7 +70,7 @@ def __init__( if self.vision_language_config: assert not self.lora_config, ( "To be tested: vision language model with LoRA settings.") - assert False, "To be tested: vision language model on HPU" + raise AssertionError("To be tested: vision language model on HPU") self.model_runner = HabanaModelRunner( model_config, @@ -125,8 +125,8 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: self.model_runner.profile_run() torch.hpu.synchronize() - # At this point we should've allocated the maximum workspace for all recipes - # we will use the extra memory for graphs/blocks + # At this point we should've allocated the maximum workspace for all + # recipes we will use the extra memory for graphs/blocks free_hpu_memory = torch.hpu.mem_get_info()[0] cache_block_size = self.get_cache_block_size_bytes() @@ -170,8 +170,9 @@ def _init_cache_engine(self) -> None: self.parallel_config, self.device_config) self.hpu_cache = self.cache_engine.gpu_cache - htorch.hpu.synchronize( - ) # we want to materialize cache tensors before we proceed with graph capture/execution + # we want to materialize cache tensors before we proceed with + # graph capture/execution + htorch.hpu.synchronize() def _warm_up_model(self) -> None: self.model_runner.warmup_model(self.hpu_cache) diff --git a/vllm/worker/profiler.py b/vllm/worker/profiler.py index 9e181e27bde34..d5125019a4b8c 100644 --- a/vllm/worker/profiler.py +++ b/vllm/worker/profiler.py @@ -58,7 +58,8 @@ def __init__(self): self.enabled = os.getenv('VLLM_PROFILER_ENABLED', 'false').lower() == 'true' and int( os.getenv('RANK', '0')) == 0 - logger.info(f'Profiler enabled for: {self.vllm_instance_id}') + msg = f'Profiler enabled for: {self.vllm_instance_id}' + logger.info(msg) if self.enabled: # initialize the trace file (JSON Array Format) with open(self.filename, 'w') as outfile: @@ -107,8 +108,8 @@ def end(self): ts = self.get_timestamp_us() if not self.event_cache: logger.warning( - 'Profiler: end() call does not have matching start() call. Disabling profiler.' - ) + 'Profiler: end() call does not have matching start() call. 
' + 'Disabling profiler.') self.enabled = False return event = self.event_cache.pop() From bf3187138253823fdb0f1cb23bd0cab04e423c76 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 19:12:36 +0300 Subject: [PATCH 072/341] more line fixes + isort --- vllm/attention/backends/habana_attn.py | 2 +- vllm/config.py | 2 +- vllm/distributed/communication_op.py | 3 +- vllm/executor/habana_executor.py | 9 +- vllm/executor/ray_utils.py | 3 +- vllm/hpu/cache_ops.py | 2 +- vllm/hpu/ops.py | 5 +- vllm/hpu/rotary_embed.py | 2 +- vllm/model_executor/layers/layernorm.py | 5 +- .../model_executor/layers/rotary_embedding.py | 3 +- vllm/model_executor/model_loader/loader.py | 2 +- vllm/model_executor/models/mixtral.py | 2 +- vllm/model_executor/sampling_metadata.py | 4 +- vllm/worker/cache_engine.py | 5 +- vllm/worker/habana_model_runner.py | 159 +++++++++++------- vllm/worker/habana_worker.py | 4 +- vllm/worker/profiler.py | 2 +- 17 files changed, 125 insertions(+), 89 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 4f34aed1c90f2..dd33d3526e907 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -6,8 +6,8 @@ from typing import Any, Dict, List, Optional, Tuple, Type import torch -import vllm.hpu.ops as ops +import vllm.hpu.ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata) from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention, diff --git a/vllm/config.py b/vllm/config.py index 38f8d4f2884d0..75a64732f8591 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -14,7 +14,7 @@ from vllm.tracing import is_otel_installed from vllm.transformers_utils.config import get_config, get_hf_text_config from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu, - is_hip, is_neuron, is_openvino, is_tpu, is_xpu, is_hpu, + is_hip, is_hpu, is_neuron, is_openvino, is_tpu, is_xpu, print_warning_once, update_environment_variables) if TYPE_CHECKING: diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index 2bb082385c0f3..63c159fce3d71 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -3,9 +3,10 @@ import torch import torch.distributed +from vllm.utils import is_hpu + from .parallel_state import get_tp_group -from vllm.utils import is_hpu if is_hpu(): import habana_frameworks.torch as htorch diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index 06a214918f931..b771b9e026970 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -2,15 +2,16 @@ # Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company ############################################################################### +import contextlib +import os from typing import Any, Dict, List, Optional, Set, Tuple + from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import ExecuteModelRequest, SamplerOutput -from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, - make_async, HabanaMemoryProfiler) -import os -import contextlib +from vllm.utils import (HabanaMemoryProfiler, get_distributed_init_method, + get_ip, get_open_port, make_async) from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 7961efdf4b516..ea57bc842c8e7 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -1,6 +1,7 @@ +import os import pickle from typing import List, Optional, Tuple -import os + from vllm.config import ParallelConfig from vllm.logger import init_logger from vllm.utils import get_ip, is_hip, is_hpu, is_xpu diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index 6f060b8280ea6..d28a47271c6ac 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -5,8 +5,8 @@ # LICENSE file in the root directory of this source tree. ############################################################################### -import torch import habana_frameworks.torch as htorch +import torch def reshape_and_cache(key, diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 875a54338913a..bd737917cb919 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -5,10 +5,11 @@ # LICENSE file in the root directory of this source tree. ############################################################################### import os +from typing import Optional + +import habana_frameworks.torch as htorch import torch import torch.nn.functional as F -import habana_frameworks.torch as htorch -from typing import Optional import vllm.hpu.utils as hpu_utils diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py index 3c701df439535..26b19e8258285 100644 --- a/vllm/hpu/rotary_embed.py +++ b/vllm/hpu/rotary_embed.py @@ -5,9 +5,9 @@ # LICENSE file in the root directory of this source tree. 
############################################################################### +import habana_frameworks.torch.utils.experimental as htexp import torch import torch.nn as nn -import habana_frameworks.torch.utils.experimental as htexp def get_device_type(): diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 334c3d6c95c78..57ada2ba8e3c4 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -6,10 +6,11 @@ from vllm.model_executor.custom_op import CustomOp from vllm.utils import is_hpu + if is_hpu(): try: - from habana_frameworks.torch.hpex.normalization import (FusedRMSNorm as - FusedRMSNorm) + from habana_frameworks.torch.hpex.normalization import ( + FusedRMSNorm as FusedRMSNorm) except ImportError: print("Not using HPU fused kernel for RMSNorm") FusedRMSNorm = None diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index ceaa2ddd3d553..d706c70c82374 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -28,7 +28,8 @@ import torch.nn as nn from vllm.model_executor.custom_op import CustomOp -from vllm.utils import is_tpu, is_hpu +from vllm.utils import is_hpu, is_tpu + if is_hpu(): from vllm.hpu.rotary_embed import HpuRotaryEmbedding diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index ad146da72fb26..dc16897196601 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -35,7 +35,7 @@ from vllm.model_executor.models.interfaces import (supports_lora, supports_vision) from vllm.model_executor.utils import set_weight_attrs -from vllm.utils import get_device_capability_stateless, is_tpu, is_hpu +from vllm.utils import get_device_capability_stateless, is_hpu, is_tpu logger = init_logger(__name__) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 0344f1c7c7a03..07edd4711e04f 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -52,7 +52,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.sequence import SamplerOutput -from vllm.utils import print_warning_once, is_hpu +from vllm.utils import is_hpu, print_warning_once if is_hpu(): from vllm.hpu.ops import static_fused_moe diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index ea82a3a4041b7..a916d8596d2f0 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -7,8 +7,8 @@ from vllm.model_executor.layers.ops.sample import get_num_triton_sampler_splits from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import SequenceData, SequenceGroupMetadata -from vllm.utils import (async_tensor_h2d, is_pin_memory_available, - maybe_expand_dim, is_hpu) +from vllm.utils import (async_tensor_h2d, is_hpu, is_pin_memory_available, + maybe_expand_dim) _SAMPLING_EPS = 1e-5 _SEED_0_REPLACEMENT = 3403598558 diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 2e6c374b1d51f..1c185e9b3a405 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -6,8 +6,9 @@ from vllm.attention import get_attn_backend from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger 
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, - is_pin_memory_available, is_hpu) +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_hpu, + is_pin_memory_available) + if is_hpu(): pass diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index c8232c0cba407..22d6368bacacc 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -4,20 +4,21 @@ # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### -import time -from enum import IntEnum -from typing import List, NamedTuple, Optional, Set, Tuple, Dict - import collections import gc -import os -import math import itertools +import math import operator -import torch +import os +import time +from enum import IntEnum +from typing import Dict, List, NamedTuple, Optional, Set, Tuple + import habana_frameworks.torch as htorch -from vllm.attention import (AttentionMetadata, get_attn_backend) -from vllm.config import (DeviceConfig, LoadConfig, CacheConfig, LoRAConfig, +import torch + +from vllm.attention import AttentionMetadata, get_attn_backend +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) from vllm.distributed import broadcast_tensor_dict @@ -30,8 +31,8 @@ from vllm.model_executor.model_loader import get_model from vllm.sampling_params import SamplingParams from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata -from vllm.utils import (HabanaMemoryProfiler, is_pin_memory_available, - make_tensor_with_pad, format_bytes) +from vllm.utils import (HabanaMemoryProfiler, format_bytes, + is_pin_memory_available, make_tensor_with_pad) from .profiler import Profiler @@ -93,10 +94,10 @@ def find_bucket(value: int, config: Tuple[int, int, int]): return result -def subtuple(obj: object, - typename: str, - to_copy: List[str], - to_override: Dict[str, object] = {}): +def subtuple(obj: object, typename: str, to_copy: List[str], + to_override: Optional[Dict[str, object]]): + if to_override is None: + to_override = {} if obj is None: return None fields = set(to_copy) | set(to_override.keys()) @@ -261,7 +262,8 @@ def __init__( self.enforce_eager = self.model_config.enforce_eager self.max_num_seqs = self.scheduler_config.max_num_seqs self.max_model_len = self.scheduler_config.max_model_len - self.max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens + self.max_num_batched_tokens = \ + self.scheduler_config.max_num_batched_tokens self.block_size = cache_config.block_size self.pin_memory = is_pin_memory_available() @@ -299,19 +301,21 @@ def load_model(self) -> None: parallel_config=self.parallel_config, scheduler_config=self.scheduler_config, cache_config=self.cache_config) - logger.info( - f"Pre-loading model weights on {next(self.model.parameters()).device} took {m_getmodel.get_summary_string()}" - ) + msg = ("Pre-loading model weights on " + f"{next(self.model.parameters()).device} " + f"took {m_getmodel.get_summary_string()}") + logger.info(msg) - # FIXME: Running with disable_tensor_cache=True causes RuntimeErrors. This needs to be debugged + # FIXME: Running with disable_tensor_cache=True causes + # RuntimeErrors. 
This needs to be debugged with HabanaMemoryProfiler() as m_wrap: self.model = _maybe_wrap_in_hpu_graph(self.model) - logger.info( - f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}") + msg = f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}" + logger.info(msg) self.model_memory_usage = m.consumed_device_memory - logger.info( - f"Loading model weights took in total {m.get_summary_string()}") + msg = f"Loading model weights took in total {m.get_summary_string()}" + logger.info(msg) if self.lora_config: assert hasattr(self.model, "supported_lora_modules" @@ -359,23 +363,26 @@ def _setup_buckets(self) -> None: max=2048) self.graphed_buckets = set() - logger.info( - f"Prompt bucket config (min, step, max_warmup) bs:{self.prompt_bs_bucket_cfg}, seq:{self.prompt_seq_bucket_cfg}" - ) + msg = ("Prompt bucket config (min, step, max_warmup) " + f"bs:{self.prompt_bs_bucket_cfg}, " + f"seq:{self.prompt_seq_bucket_cfg}") + logger.info(msg) self.prompt_buckets = warmup_buckets(self.prompt_bs_bucket_cfg, self.prompt_seq_bucket_cfg) - logger.info( - f"Generated {len(self.prompt_buckets)} prompt buckets: {list(sorted(self.prompt_buckets))}" - ) - logger.info( - f"Decode bucket config (min, step, max_warmup) bs:{self.decode_bs_bucket_cfg}, seq:{self.decode_seq_bucket_cfg}" - ) + msg = (f"Generated {len(self.prompt_buckets)} " + f"prompt buckets: {list(sorted(self.prompt_buckets))}") + logger.info(msg) + + msg = ("Decode bucket config (min, step, max_warmup) " + f"bs:{self.decode_bs_bucket_cfg}, " + f"seq:{self.decode_seq_bucket_cfg}") + logger.info(msg) self.decode_buckets = warmup_buckets(self.decode_bs_bucket_cfg, self.decode_seq_bucket_cfg) - logger.info( - f"Generated {len(self.decode_buckets)} decode buckets: {list(sorted(self.decode_buckets))}" - ) + msg = ("Generated {len(self.decode_buckets)} decode buckets: " + f"{list(sorted(self.decode_buckets))}") + logger.info(msg) def _prepare_prompt( self, @@ -733,7 +740,9 @@ def prepare_input_tensors( num_prefill_tokens = len(input_tokens) num_decode_tokens = len(decode_input_tokens) - # NOTE(kzawora): Here we diverge from GPU code - we don't support mixed batches, so we either use decode or prefill inputs, without coalescing. + # NOTE(kzawora): Here we diverge from GPU code - we don't + # support mixed batches, so we either use decode or prefill + # inputs, without coalescing. assert (num_prefills == 0 and num_decode_tokens > 0) or ( num_prefills > 0 and num_decode_tokens == 0), "HPU does not support mixed batches!" 
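For orientation, the habana_model_runner.py changes above and below revolve around HPU shape bucketing: execute_model() pads real_batch_size up to find_bucket(real_batch_size, bucket_cfg) and fills the padding by repeating the first sequence group, so only warmed-up (batch_size, seq_len) shapes ever reach the compiled graphs. The rounding rule inside find_bucket() is not shown in these diffs, so the standalone sketch below is only an illustration under the assumption that buckets ramp up in powers of two until the step and then grow in step multiples; find_bucket_sketch and the (1, 32, 256) config are hypothetical, not code from this series.

    # Illustrative sketch only; the rounding rule is an assumption and the real
    # find_bucket() in habana_model_runner.py may differ.
    import math
    from typing import Tuple

    def find_bucket_sketch(value: int, config: Tuple[int, int, int]) -> int:
        bucket_min, bucket_step, bucket_max = config  # (min, step, max_warmup)
        if value <= bucket_step:
            # assumed ramp-up region: next power of two, never below the minimum
            bucket = max(bucket_min, 2 ** math.ceil(math.log2(value)))
        else:
            # assumed steady region: next multiple of the step
            bucket = math.ceil(value / bucket_step) * bucket_step
        return min(bucket, bucket_max)

    decode_bs_bucket_cfg = (1, 32, 256)  # hypothetical (min, step, max_warmup)
    for real_batch_size in (1, 3, 33, 200):
        padded = find_bucket_sketch(real_batch_size, decode_bs_bucket_cfg)
        # execute_model() would then append copies of the first sequence group
        # until the batch reaches the padded size.
        print(f"real bs {real_batch_size} -> bucket bs {padded}")

Under that assumed rule, a real decode batch of 33 would run in the 64-entry bucket: some compute is spent on padding, but the set of graph shapes that ever needs to be captured stays small and predictable.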
@@ -745,7 +754,8 @@ def prepare_input_tensors( lora_prompt_mapping = decode_lora_prompt_mapping lora_requests = decode_lora_requests - # FIXME: We need to adjust selected_token_indices to accommodate for padding + # FIXME: We need to adjust selected_token_indices to accommodate f + # or padding max_len = input_tokens.size(1) paddings = [max_len - s for s in seq_lens] paddings = [0] + paddings[:-1] @@ -836,7 +846,8 @@ def prepare_input_tensors( decode_attn_metadata = self.attn_backend.make_metadata( **metadata_dict) - attn_metadata = prefill_attn_metadata if prefill_attn_metadata is not None else decode_attn_metadata + attn_metadata = prefill_attn_metadata if \ + prefill_attn_metadata is not None else decode_attn_metadata # attn_metadata = AttentionMetadata( # num_prefills=num_prefills, # slot_mapping=slot_mapping, @@ -897,7 +908,8 @@ def execute_model( self.profiler.start('internal', base_event_name) real_batch_size = len(seq_group_metadata_list) - bucket_cfg = self.prompt_bs_bucket_cfg if is_prompt else self.decode_bs_bucket_cfg + bucket_cfg = self.prompt_bs_bucket_cfg if is_prompt else \ + self.decode_bs_bucket_cfg batch_size_padded = find_bucket(real_batch_size, bucket_cfg) batch_size_padding = batch_size_padded - real_batch_size seq_group_metadata_list = seq_group_metadata_list.copy() @@ -926,7 +938,11 @@ def execute_model( htorch.core.mark_step() if self.is_driver_worker: - model_event_name = f"model_{'prompt' if is_prompt else 'decode'}_bs{batch_size}_seq{seq_len}_graphs{'T' if use_graphs else 'F'}" + model_event_name = ("model_" + f"{'prompt' if is_prompt else 'decode'}_" + f"bs{batch_size}_" + f"seq{seq_len}_" + f"graphs{'T' if use_graphs else 'F'}") else: model_event_name = 'model_executable' with self.profiler.record_event('internal', model_event_name): @@ -938,9 +954,10 @@ def execute_model( # Compute the logits. with self.profiler.record_event( - 'internal', - f'compute_logits_{"prompt" if is_prompt else "decode"}_bs{batch_size}_seq{seq_len}' - ): + 'internal', ('compute_logits_' + f'{"prompt" if is_prompt else "decode"}_bs' + f'{batch_size}_' + f'seq{seq_len}')): sampling_metadata.selected_token_indices = None logits = self.model.compute_logits(hidden_states, sampling_metadata) @@ -952,9 +969,10 @@ def execute_model( # Sample the next token. 
with self.profiler.record_event( - 'internal', - f'sample_{"prompt" if is_prompt else "decode"}_bs{batch_size}_seq{seq_len}' - ): + 'internal', ('sample_' + f'{"prompt" if is_prompt else "decode"}_' + f'bs{batch_size}_' + f'seq{seq_len}')): output = self.model.sample( logits=logits, sampling_metadata=sampling_metadata, @@ -1012,7 +1030,11 @@ def profile_run(self) -> None: def warmup_scenario(self, batch_size, seq_len, is_prompt, kv_caches) -> None: use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) - scenario_name = f"warmup_{'prompt' if is_prompt else 'decode'}_bs{batch_size}_seq{seq_len}_graphs{'T' if use_graphs else 'F'}" + scenario_name = ("warmup_" + f"{'prompt' if is_prompt else 'decode'}_" + f"bs{batch_size}_" + f"seq{seq_len}_" + f"graphs{'T' if use_graphs else 'F'}") self.profiler.start('internal', scenario_name) times = 3 if use_graphs else 1 seqs = [ @@ -1029,14 +1051,14 @@ def warmup_scenario(self, batch_size, seq_len, is_prompt, def log_warmup(self, phase, i, max_i, batch_size, seq_len): free_mem = format_bytes( HabanaMemoryProfiler.current_free_device_memory()) - logger.info( - f"[Warmup][{phase}][{i+1}/{max_i}] batch_size:{batch_size} seq_len:{seq_len} free_mem:{free_mem}" - ) + msg = (f"[Warmup][{phase}][{i+1}/{max_i}] " + f"batch_size:{batch_size} " + f"seq_len:{seq_len} " + f"free_mem:{free_mem}") + logger.info(msg) def warmup_all_buckets(self, buckets, is_prompt, kv_caches): for i, (batch_size, seq_len) in enumerate(reversed(buckets)): - mem_usage = 100.0 * HabanaMemoryProfiler.current_device_memory_usage( - ) / HabanaMemoryProfiler.total_device_memory() self.log_warmup('Prompt' if is_prompt else 'Decode', i, len(buckets), batch_size, seq_len) self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) @@ -1075,9 +1097,11 @@ def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, total_batch_seq += batch_seq graphed = list(c[:2] for c in self.graphed_buckets if c[2] == is_prompt) - logger.info( - f'{phase} captured:{len(graphed)} ({100 * len(graphed) / num_candidates:.1f}%) used_mem:{format_bytes(total_mem)} buckets:{sorted(list(graphed))}' - ) + msg = (f'{phase} captured:{len(graphed)} ' + f'({100 * len(graphed) / num_candidates:.1f}%) ' + f'used_mem:{format_bytes(total_mem)} ' + f'buckets:{sorted(list(graphed))}') + logger.info(msg) @torch.inference_mode() def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: @@ -1093,8 +1117,8 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: if not self.enforce_eager: mem_margin = 1.0 - float( os.environ.get('VLLM_GRAPH_MEM_MARGIN', '0.02')) - free_mem = mem_margin * HabanaMemoryProfiler.current_free_device_memory( - ) + free_mem = \ + mem_margin * HabanaMemoryProfiler.current_free_device_memory() free_mem = align_workers(free_mem, torch.distributed.ReduceOp.MIN) prompt_graph_mem_ratio = float( os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.5')) @@ -1111,9 +1135,10 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: end_time = time.perf_counter() end_mem = HabanaMemoryProfiler.current_device_memory_usage() elapsed_time = end_time - start_time - logger.info( - f"Warmup finished in {elapsed_time:.0f} secs, allocated {format_bytes(end_mem - start_mem)} of device memory" - ) + msg = ( + f"Warmup finished in {elapsed_time:.0f} secs, " + f"allocated {format_bytes(end_mem - start_mem)} of device memory") + logger.info(msg) self.profiler.end() @property @@ -1180,14 +1205,17 @@ def get_counter_dict(self, cache_config, duration, seq_len, counters[f'{phase}_real_in_throughput'] = 
prompt_real_in_throughput # KV cache might not be created yet (e.g. for profiling run) - if cache_config.num_gpu_blocks is not None and cache_config.num_gpu_blocks != 0: + if cache_config.num_gpu_blocks is not None and \ + cache_config.num_gpu_blocks != 0: cache_num_blocks_used = [ math.ceil(sl / cache_config.block_size) for sl in real_seq_lens ] cache_total_num_blocks_used = sum(cache_num_blocks_used) num_cache_blocks = cache_config.num_gpu_blocks - cache_total_num_free_blocks = num_cache_blocks - cache_total_num_blocks_used - cache_computed_utilization = cache_total_num_blocks_used / num_cache_blocks + cache_total_num_free_blocks = \ + num_cache_blocks - cache_total_num_blocks_used + cache_computed_utilization = \ + cache_total_num_blocks_used / num_cache_blocks max_blocks_per_seq = math.ceil(seq_len / cache_config.block_size) batch_block_utilization = cache_total_num_blocks_used / ( batch_size_padded * max_blocks_per_seq) @@ -1199,7 +1227,8 @@ def get_counter_dict(self, cache_config, duration, seq_len, if not self.logged_once: counters['const_cache_num_blocks'] = cache_config.num_gpu_blocks counters[ - 'const_gpu_memory_utilization'] = cache_config.gpu_memory_utilization + 'const_gpu_memory_utilization'] = \ + cache_config.gpu_memory_utilization counters['const_block_size'] = cache_config.block_size self.logged_once = True return counters diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 382962ce9ea71..63055bf4f2055 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -6,13 +6,13 @@ import os from typing import Any, Dict, List, Optional, Set, Tuple -import torch import habana_frameworks.torch as htorch +import torch import torch.distributed from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, - VisionLanguageConfig, SpeculativeConfig) + SpeculativeConfig, VisionLanguageConfig) from vllm.distributed import (broadcast_tensor_dict, ensure_model_parallel_initialized, init_distributed_environment) diff --git a/vllm/worker/profiler.py b/vllm/worker/profiler.py index d5125019a4b8c..48348de41f520 100644 --- a/vllm/worker/profiler.py +++ b/vllm/worker/profiler.py @@ -8,10 +8,10 @@ import threading import time from contextlib import contextmanager +from typing import Any, List from vllm.logger import init_logger from vllm.utils import get_vllm_instance_id -from typing import List, Any logger = init_logger(__name__) From b7d34afe72edcfb26d9556a459f1f22ddf5a0601 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 19:19:28 +0300 Subject: [PATCH 073/341] tiny fixes --- vllm/attention/backends/habana_attn.py | 3 ++- vllm/model_executor/layers/layernorm.py | 4 ++-- vllm/worker/habana_model_runner.py | 9 +++++---- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index dd33d3526e907..7db5ab2eeeeff 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -144,6 +144,7 @@ def __init__( self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads self.sliding_window = sliding_window self.position_bias = None + self.alibi_slopes = alibi_slopes if alibi_slopes is not None: alibi_slopes_tensor = torch.tensor(alibi_slopes, dtype=torch.bfloat16) @@ -151,7 +152,7 @@ def __init__( num_kv_heads, alibi_slopes_tensor.dtype, max_seq_len) - self.alibi_slopes = alibi_slopes_tensor + self.alibi_slopes = alibi_slopes_tensor assert 
self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 57ada2ba8e3c4..8c45abf38da2d 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -9,8 +9,8 @@ if is_hpu(): try: - from habana_frameworks.torch.hpex.normalization import ( - FusedRMSNorm as FusedRMSNorm) + from habana_frameworks.torch.hpex.normalization import (FusedRMSNorm as + FusedRMSNorm) except ImportError: print("Not using HPU fused kernel for RMSNorm") FusedRMSNorm = None diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 22d6368bacacc..58ec7302c1bc4 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -94,8 +94,10 @@ def find_bucket(value: int, config: Tuple[int, int, int]): return result -def subtuple(obj: object, typename: str, to_copy: List[str], - to_override: Optional[Dict[str, object]]): +def subtuple(obj: object, + typename: str, + to_copy: List[str], + to_override: Optional[Dict[str, object]] = None): if to_override is None: to_override = {} if obj is None: @@ -1230,5 +1232,4 @@ def get_counter_dict(self, cache_config, duration, seq_len, 'const_gpu_memory_utilization'] = \ cache_config.gpu_memory_utilization counters['const_block_size'] = cache_config.block_size - self.logged_once = True - return counters + self.logged_once = Tru \ No newline at end of file From f1eee8d4005df3a56382835ca9168c8313be1cd7 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 19:19:44 +0300 Subject: [PATCH 074/341] more tiny fixes --- vllm/model_executor/layers/layernorm.py | 4 ++-- vllm/worker/habana_model_runner.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 8c45abf38da2d..57ada2ba8e3c4 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -9,8 +9,8 @@ if is_hpu(): try: - from habana_frameworks.torch.hpex.normalization import (FusedRMSNorm as - FusedRMSNorm) + from habana_frameworks.torch.hpex.normalization import ( + FusedRMSNorm as FusedRMSNorm) except ImportError: print("Not using HPU fused kernel for RMSNorm") FusedRMSNorm = None diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 58ec7302c1bc4..08dad66df1e76 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1232,4 +1232,5 @@ def get_counter_dict(self, cache_config, duration, seq_len, 'const_gpu_memory_utilization'] = \ cache_config.gpu_memory_utilization counters['const_block_size'] = cache_config.block_size - self.logged_once = Tru \ No newline at end of file + self.logged_once = True + return counters From 6f2f964c2501f9d0d23f46a702239c1bbb3e9aca Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 19:21:03 +0300 Subject: [PATCH 075/341] ?? 
--- vllm/model_executor/layers/layernorm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 57ada2ba8e3c4..8c45abf38da2d 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -9,8 +9,8 @@ if is_hpu(): try: - from habana_frameworks.torch.hpex.normalization import ( - FusedRMSNorm as FusedRMSNorm) + from habana_frameworks.torch.hpex.normalization import (FusedRMSNorm as + FusedRMSNorm) except ImportError: print("Not using HPU fused kernel for RMSNorm") FusedRMSNorm = None From d932e8dcbd6e10750afb1f344262b56c382797c4 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 19:24:07 +0300 Subject: [PATCH 076/341] can this finally work --- vllm/model_executor/layers/layernorm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 8c45abf38da2d..57ada2ba8e3c4 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -9,8 +9,8 @@ if is_hpu(): try: - from habana_frameworks.torch.hpex.normalization import (FusedRMSNorm as - FusedRMSNorm) + from habana_frameworks.torch.hpex.normalization import ( + FusedRMSNorm as FusedRMSNorm) except ImportError: print("Not using HPU fused kernel for RMSNorm") FusedRMSNorm = None From 4431ed63bf77780df6b7a47c540fe6f8dea0f67e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 19:26:37 +0300 Subject: [PATCH 077/341] no, it did in fact not work --- vllm/model_executor/layers/layernorm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 57ada2ba8e3c4..67cef1b47f3bf 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -9,8 +9,7 @@ if is_hpu(): try: - from habana_frameworks.torch.hpex.normalization import ( - FusedRMSNorm as FusedRMSNorm) + from habana_frameworks.torch.hpex.normalization import FusedRMSNorm except ImportError: print("Not using HPU fused kernel for RMSNorm") FusedRMSNorm = None From 262356b21cb40d30b868ad7dcd400b02ab5b40bb Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 19:27:57 +0300 Subject: [PATCH 078/341] fix typos --- README_GAUDI.md | 2 +- docs/source/getting_started/gaudi-installation.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index 3b72ad71069c4..1a1b2d9cc6e36 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -132,7 +132,7 @@ If you experience device out-of-memory issues or want to attempt inference at hi cache blocks you have available, and therefore reduces the effective maximum number of tokens you can handle at a given time. -- If this methon is not efficient, you can disable `HPUGraph` completely. With +- If this method is not efficient, you can disable `HPUGraph` completely. With HPU Graphs disabled, you are trading latency and throughput at lower batches for potentially higher throughput on higher batches. 
You can do that by adding `--enforce-eager` flag to server (for diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index 90f97155e1d75..73b63b3f8d755 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -136,7 +136,7 @@ the below: cache blocks you have available, and therefore reduces the effective maximum number of tokens you can handle at a given time. -- If this methon is not efficient, you can disable ``HPUGraph`` +- If this method is not efficient, you can disable ``HPUGraph`` completely. With HPU Graphs disabled, you are trading latency and throughput at lower batches for potentially higher throughput on higher batches. You can do that by adding ``--enforce-eager`` flag to From 962c91dff4a88323f9b91f3b212e7461f9a38725 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 19:46:17 +0300 Subject: [PATCH 079/341] fix some mypy issues in habana model runner: --- vllm/worker/habana_model_runner.py | 40 +++++++++++++++--------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 08dad66df1e76..7effebd18f363 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1,5 +1,3 @@ -# mypy: ignore-errors - ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### @@ -12,7 +10,7 @@ import os import time from enum import IntEnum -from typing import Dict, List, NamedTuple, Optional, Set, Tuple +from typing import Dict, List, NamedTuple, Optional, Set, Tuple, Any import habana_frameworks.torch as htorch import torch @@ -48,7 +46,7 @@ # dim is either 'bs' or 'seq' # param is either 'min', 'step' or 'max' # example env variable: VLLM_DECODE_BS_BUCKET_STEP=128 -def read_bucket_settings(phase: str, dim: str, **defaults: Dict): +def read_bucket_settings(phase: str, dim: str, **defaults): params = ['min', 'step', 'max'] values = [ int( @@ -61,10 +59,11 @@ def read_bucket_settings(phase: str, dim: str, **defaults: Dict): def warmup_range(config: Tuple[int, int, int]): bmin, bstep, bmax = config base = itertools.repeat(2) - ramp_up = itertools.accumulate(base, func=operator.mul, initial=bmin) - ramp_up = itertools.takewhile(lambda x: x < bstep and x <= bmax, ramp_up) + ramp_up_acc = itertools.accumulate(base, func=operator.mul, initial=bmin) + ramp_up_tw = itertools.takewhile(lambda x: x < bstep and x <= bmax, \ + ramp_up_acc) stable = range(bstep, bmax + 1, bstep) - return list(ramp_up) + list(stable) + return list(ramp_up_tw) + list(stable) def warmup_buckets(bs_bucket_config, seq_bucket_config): @@ -172,16 +171,16 @@ def sample(self, *args, **kwargs): class PreparePromptMetadata(NamedTuple): - input_tokens: List[int] - input_positions: List[int] + input_tokens: List[List[int]] + input_positions: List[List[int]] attn_metadata: Optional[AttentionMetadata] seq_lens: List[int] query_lens: List[int] - lora_index_mapping: List[int] - lora_prompt_mapping: List[int] + lora_index_mapping: List[List[int]] + lora_prompt_mapping: List[List[int]] lora_requests: Set[LoRARequest] multi_modal_input: Optional[torch.Tensor] - slot_mapping: List[int] + slot_mapping: List[List[int]] @classmethod def empty(cls): @@ -200,13 +199,13 @@ def empty(cls): class PrepareDecodeMetadata(NamedTuple): - 
input_tokens: List[int] - input_positions: List[int] + input_tokens: List[List[int]] + input_positions: List[List[int]] attn_metadata: Optional[AttentionMetadata] lora_index_mapping: List[int] lora_prompt_mapping: List[int] lora_requests: Set[LoRARequest] - slot_mapping: List[int] + slot_mapping: List[List[int]] @classmethod def empty(cls): @@ -363,7 +362,7 @@ def _setup_buckets(self) -> None: min=self.block_size, step=self.block_size, max=2048) - self.graphed_buckets = set() + self.graphed_buckets: Set[Any] = set() msg = ("Prompt bucket config (min, step, max_warmup) " f"bs:{self.prompt_bs_bucket_cfg}, " @@ -756,8 +755,8 @@ def prepare_input_tensors( lora_prompt_mapping = decode_lora_prompt_mapping lora_requests = decode_lora_requests - # FIXME: We need to adjust selected_token_indices to accommodate f - # or padding + # FIXME: We need to adjust selected_token_indices to accommodate + # for padding max_len = input_tokens.size(1) paddings = [max_len - s for s in seq_lens] paddings = [0] + paddings[:-1] @@ -923,8 +922,9 @@ def execute_model( ) = self.prepare_input_tensors(seq_group_metadata_list) is_prompt = attn_metadata.is_prompt - if self.lora_config: - self.set_active_loras(lora_requests, lora_mapping) + # NOTE(kzawora): Need to restore this after adding LoRA + # if self.lora_config: + # self.set_active_loras(lora_requests, lora_mapping) batch_size = input_tokens.size(0) seq_len = self._seq_len(attn_metadata) From eb1ee27915ba6c4f887449cc10962d9652cc9efd Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 20:00:29 +0300 Subject: [PATCH 080/341] re-enable mypy for habana model runner --- vllm/worker/habana_model_runner.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 7effebd18f363..fe574b4bdefa0 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -10,7 +10,8 @@ import os import time from enum import IntEnum -from typing import Dict, List, NamedTuple, Optional, Set, Tuple, Any +from typing import (Any, Callable, Dict, List, NamedTuple, Optional, Set, + Tuple, Union) import habana_frameworks.torch as htorch import torch @@ -171,7 +172,7 @@ def sample(self, *args, **kwargs): class PreparePromptMetadata(NamedTuple): - input_tokens: List[List[int]] + input_tokens: torch.Tensor input_positions: List[List[int]] attn_metadata: Optional[AttentionMetadata] seq_lens: List[int] @@ -199,11 +200,11 @@ def empty(cls): class PrepareDecodeMetadata(NamedTuple): - input_tokens: List[List[int]] + input_tokens: torch.Tensor input_positions: List[List[int]] attn_metadata: Optional[AttentionMetadata] - lora_index_mapping: List[int] - lora_prompt_mapping: List[int] + lora_index_mapping: List[List[int]] + lora_prompt_mapping: List[List[int]] lora_requests: Set[LoRARequest] slot_mapping: List[List[int]] @@ -603,8 +604,8 @@ def _prepare_decode( slot_mapping: List[List[int]] = [] seq_lens: List[int] = [] block_tables: List[List[int]] = [] - lora_index_mapping: List[int] = [] - lora_prompt_mapping: List[int] = [] + lora_index_mapping: List[List[int]] = [] + lora_prompt_mapping: List[List[int]] = [] lora_requests: Set[LoRARequest] = set() if len(seq_group_metadata_list) == 0: @@ -903,6 +904,7 @@ def execute_model( kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: if self.is_driver_worker: + assert seq_group_metadata_list is not None event_start = self.profiler.get_timestamp_us() is_prompt = seq_group_metadata_list[0].is_prompt 
base_event_name = 'prompt' if is_prompt else 'decode' @@ -917,6 +919,7 @@ def execute_model( seq_group_metadata_list.extend(seq_group_metadata_list[0] for _ in range(batch_size_padding)) with self.profiler.record_event('internal', 'prepare_input_tensors'): + assert seq_group_metadata_list is not None (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping, multi_modal_input ) = self.prepare_input_tensors(seq_group_metadata_list) @@ -1072,7 +1075,8 @@ def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, idx = 0 phase = f'Graph/{"Prompt" if is_prompt else "Decode"}' num_candidates = len(buckets) - + ordering : Union[Callable[[Any], Tuple[Any, Any]], \ + Callable[[Any], Tuple[Any, Any, Any]]] if strategy == 'min_tokens': ordering = lambda b: (b[0] * b[1], b[1], b[0]) elif strategy == 'max_bs': From 6a5effbeee92725aad09ab57afd30a5511e4a5e0 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 3 Jul 2024 18:59:15 +0300 Subject: [PATCH 081/341] habana components refactor --- vllm/attention/backends/habana_attn.py | 4 +- vllm/executor/ray_habana_executor.py | 119 ++- vllm/model_executor/layers/fused_moe/layer.py | 8 +- vllm/model_executor/model_loader/loader.py | 2 +- vllm/model_executor/models/mixtral.py | 6 +- vllm/worker/habana_model_runner.py | 704 +++++++++++------- vllm/worker/habana_worker.py | 176 ++--- vllm/worker/model_runner_base.py | 11 +- 8 files changed, 592 insertions(+), 438 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 7db5ab2eeeeff..98c16fdca4c3f 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -24,8 +24,8 @@ def get_impl_cls() -> Type["HabanaAttentionImpl"]: return HabanaAttentionImpl @staticmethod - def make_metadata(*args, **kwargs) -> "HabanaAttentionMetadata": - return HabanaAttentionMetadata(*args, **kwargs) + def get_metadata_cls() -> Type["AttentionMetadata"]: + return HabanaAttentionMetadata @staticmethod def get_kv_cache_shape( diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index b57536436bd49..9f57de8d2e060 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -11,7 +11,8 @@ from vllm.executor.ray_utils import RayWorkerWrapper, ray from vllm.logger import init_logger from vllm.sequence import ExecuteModelRequest, SamplerOutput -from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, +from vllm.utils import (error_on_invalid_device_count_status, + get_distributed_init_method, get_ip, get_open_port, get_vllm_instance_id, make_async) if ray is not None: @@ -28,9 +29,6 @@ class RayHabanaExecutor(DistributedGPUExecutor): def _init_executor(self) -> None: - assert (not self.speculative_config - ), "Speculative decoding not yet supported for RayGPU backend." - assert self.parallel_config.distributed_executor_backend == "ray" placement_group = self.parallel_config.placement_group @@ -48,7 +46,8 @@ def _init_executor(self) -> None: def _init_workers_ray(self, placement_group: "PlacementGroup", **ray_remote_kwargs): - if self.parallel_config.tensor_parallel_size == 1: + if (self.parallel_config.tensor_parallel_size == 1 + and self.parallel_config.pipeline_parallel_size == 1): # For single GPU case, we use a ray worker with constrained memory. 
num_gpus = self.cache_config.gpu_memory_utilization else: @@ -112,6 +111,12 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): node_workers[node_id].append(i) + # `gpu_ids` can be a list of strings or integers. + # convert them to integers for consistency. + # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs), + # string sorting is not sufficient. + # see https://github.com/vllm-project/vllm/issues/5590 + gpu_ids = [int(x) for x in gpu_ids] node_gpus[node_id].extend(gpu_ids) for node_id, gpu_ids in node_gpus.items(): node_gpus[node_id] = sorted(gpu_ids) @@ -128,9 +133,21 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", self._run_workers("update_environment_variables", all_args=all_args_to_update_environment_variables) + if len(node_gpus) == 1: + # in single node case, we don't need to get the IP address. + # the loopback address is sufficient + # NOTE: a node may have several IP addresses, one for each + # network interface. `get_ip()` might return any of them, + # while they might not work for communication inside the node + # if the network setup is complicated. Using the loopback address + # solves this issue, as it always works for communication inside + # the node. + driver_ip = "127.0.0.1" distributed_init_method = get_distributed_init_method( driver_ip, get_open_port()) + error_on_invalid_device_count_status() + # Initialize the actual workers inside worker wrapper. init_worker_all_kwargs = [ self._get_worker_kwargs( @@ -146,10 +163,29 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", max_concurrent_workers=self.parallel_config. max_parallel_loading_workers) + # This is the list of workers that are rank 0 of each TP group EXCEPT + # global rank 0. These are the workers that will broadcast to the + # rest of the workers. + self.tp_driver_workers: List[RayWorkerWrapper] = [] + # This is the list of workers that are not drivers and not the first + # worker in a TP group. These are the workers that will be + # broadcasted to. + self.non_driver_workers: List[RayWorkerWrapper] = [] + + for pp_rank in range(self.parallel_config.pipeline_parallel_size): + for tp_rank in range(self.parallel_config.tensor_parallel_size): + rank = (pp_rank * + self.parallel_config.tensor_parallel_size) + tp_rank + if rank == 0: + pass + elif rank % self.parallel_config.tensor_parallel_size == 0: + self.tp_driver_workers.append(self.workers[rank - 1]) + else: + self.non_driver_workers.append(self.workers[rank - 1]) + def _driver_execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: + self, execute_model_req: Optional[ExecuteModelRequest] + ) -> Optional[List[SamplerOutput]]: """Run execute_model in the driver worker. Passing None will cause the driver to stop the model execution @@ -162,7 +198,7 @@ def _run_workers( self, method: str, *args, - async_run_remote_workers_only: bool = False, + async_run_tensor_parallel_workers_only: bool = False, all_args: Optional[List[Tuple[Any, ...]]] = None, all_kwargs: Optional[List[Dict[str, Any]]] = None, use_dummy_driver: bool = False, @@ -173,10 +209,11 @@ def _run_workers( """Runs the given method on all workers. Can be used in the following ways: - - async_run_remote_workers_only: If True the method will be run only - in the remote workers, not the driver worker. It will also be - run asynchronously and return a list of futures rather than blocking - on the results. 
+ Args: + - async_run_tensor_parallel_workers_only: If True the method will be + run only in the remote TP workers, not the driver worker. + It will also be run asynchronously and return a list of futures + rather than blocking on the results. - args/kwargs: All workers share the same args/kwargs - all_args/all_kwargs: args/kwargs for each worker are specified individually @@ -186,7 +223,9 @@ def _run_workers( raise NotImplementedError( "max_concurrent_workers is not supported yet.") - count = len(self.workers) + count = len(self.workers) if not \ + async_run_tensor_parallel_workers_only \ + else len(self.non_driver_workers) all_worker_args = repeat(args, count) if all_args is None \ else islice(all_args, 1, None) all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ @@ -200,14 +239,17 @@ def _run_workers( ray_worker_outputs = [] else: # Start the ray workers first. + ray_workers = self.workers + if async_run_tensor_parallel_workers_only: + ray_workers = self.non_driver_workers ray_worker_outputs = [ worker.execute_method.remote(method, *worker_args, **worker_kwargs) for (worker, worker_args, worker_kwargs - ) in zip(self.workers, all_worker_args, all_worker_kwargs) + ) in zip(ray_workers, all_worker_args, all_worker_kwargs) ] - if async_run_remote_workers_only: + if async_run_tensor_parallel_workers_only: # Just return futures return ray_worker_outputs @@ -254,7 +296,7 @@ def _compiled_ray_dag(self): f"required, but found {current_version}") from ray.dag import InputNode, MultiOutputNode - assert self.parallel_config.worker_use_ray + assert self.parallel_config.distributed_executor_backend == "ray" # Right now, compiled DAG requires at least 1 arg. We send # a dummy value for now. It will be fixed soon. @@ -266,23 +308,6 @@ def _compiled_ray_dag(self): ]) return forward_dag.experimental_compile() - def check_health(self) -> None: - """Raises an error if engine is unhealthy.""" - self._check_if_any_actor_is_dead() - - def _check_if_any_actor_is_dead(self): - if not self.workers: - return - - dead_actors = [] - for actor in self.workers: - actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access - if actor_state["State"] == "DEAD": - dead_actors.append(actor) - if dead_actors: - raise RuntimeError("At least one Worker is dead. " - f"Dead Workers: {dead_actors}. ") - class RayHabanaExecutorAsync(RayHabanaExecutor, DistributedGPUExecutorAsync): @@ -294,12 +319,32 @@ async def _driver_execute_model_async( self, execute_model_req: Optional[ExecuteModelRequest] = None ) -> List[SamplerOutput]: - return await self.driver_exec_method("execute_model", - execute_model_req) + + async def _run_task_with_lock(task, lock, *args, **kwargs): + async with lock: + return await task(*args, **kwargs) + + tasks = [] + tasks.append( + asyncio.create_task( + _run_task_with_lock(self.driver_exec_method, self.pp_locks[0], + "execute_model", execute_model_req))) + for pp_rank, driver_worker in enumerate(self.tp_driver_workers, + start=1): + tasks.append( + asyncio.create_task( + _run_task_with_lock(driver_worker.execute_method.remote, + self.pp_locks[pp_rank], + "execute_model", execute_model_req))) + + results = await asyncio.gather(*tasks) + + # Only the last PP stage has the final results. 
+ return results[-1] async def _start_worker_execution_loop(self): coros = [ worker.execute_method.remote("start_worker_execution_loop") - for worker in self.workers + for worker in self.non_driver_workers ] return await asyncio.gather(*coros) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 73cfcd7fc85f2..4641b7958f671 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -11,6 +11,10 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.utils import set_weight_attrs +from vllm.utils import is_hpu + +if is_hpu(): + from vllm.hpu.ops import static_fused_moe logger = init_logger(__name__) @@ -64,7 +68,9 @@ def apply(self, router_logits: torch.Tensor, top_k: int, renormalize: bool = True) -> torch.Tensor: - + if is_hpu(): + return static_fused_moe(x, layer.w13_weight, layer.w2_weight, + router_logits, top_k) return fused_moe(x, layer.w13_weight, layer.w2_weight, diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 65af5e1919de0..c808f5c9f75b9 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -36,7 +36,7 @@ supports_vision) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils import is_tpu, is_hpu +from vllm.utils import is_hpu, is_tpu logger = init_logger(__name__) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 0c35258ea202b..94cdf54f1d56d 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -45,10 +45,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors, SamplerOutput -from vllm.utils import print_warning_once, is_hpu +from vllm.utils import is_hpu, print_warning_once -if is_hpu(): - from vllm.hpu.ops import static_fused_moe from .interfaces import SupportsLoRA @@ -99,7 +97,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: router_logits, _ = self.gate(hidden_states) final_hidden_states = self.experts(hidden_states, router_logits) if is_hpu(): - return final_hidden_states.view(batch_size, sequence_length, + return final_hidden_states.view(batch_size, sequence_length, hidden_size) return final_hidden_states.view(num_tokens, hidden_size) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index fe574b4bdefa0..f3e2e976c1c5d 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -3,6 +3,7 @@ ############################################################################### import collections +import dataclasses import gc import itertools import math @@ -10,8 +11,8 @@ import os import time from enum import IntEnum -from typing import (Any, Callable, Dict, List, NamedTuple, Optional, Set, - Tuple, Union) +from typing import (TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, + Optional, Set, Tuple, Type, TypeVar, Union) import habana_frameworks.torch as htorch import torch @@ -20,7 +21,6 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) -from vllm.distributed import broadcast_tensor_dict from 
vllm.distributed.parallel_state import get_world_group from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping @@ -29,12 +29,22 @@ from vllm.model_executor import SamplingMetadata from vllm.model_executor.model_loader import get_model from vllm.sampling_params import SamplingParams -from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata +from vllm.sequence import (IntermediateTensors, SamplerOutput, SequenceData, + SequenceGroupMetadata) from vllm.utils import (HabanaMemoryProfiler, format_bytes, is_pin_memory_available, make_tensor_with_pad) +from vllm.worker.model_runner_base import ( + ModelRunnerBase, ModelRunnerInputBase, + _add_attn_metadata_broadcastable_dict, + _add_sampling_metadata_broadcastable_dict, + _init_attn_metadata_from_tensor_dict, + _init_sampling_metadata_from_tensor_dict) from .profiler import Profiler +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend + logger = init_logger(__name__) _PAD_SLOT_ID = 0 @@ -231,7 +241,97 @@ class BatchType(IntEnum): MIXED = 2 -class HabanaModelRunner: +TModelInputForHPU = TypeVar('TModelInputForHPU', bound="ModelInputForHPU") + + +@dataclasses.dataclass(frozen=True) +class ModelInputForHPU(ModelRunnerInputBase): + """ + This base class contains metadata needed for the base model forward pass + but not metadata for possible additional steps, e.g., sampling. Model + runners that run additional steps should subclass this method to add + additional fields. + """ + input_tokens: Optional[torch.Tensor] = None + input_positions: Optional[torch.Tensor] = None + seq_lens: Optional[List[int]] = None + query_lens: Optional[List[int]] = None + lora_mapping: Optional["LoRAMapping"] = None + lora_requests: Optional[Set[LoRARequest]] = None + attn_metadata: Optional["AttentionMetadata"] = None + multi_modal_kwargs: Optional[Dict[str, torch.Tensor]] = None + real_batch_size: Optional[int] = None + batch_size_padded: Optional[int] = None + virtual_engine: int = 0 + + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + tensor_dict = { + "input_tokens": self.input_tokens, + "input_positions": self.input_positions, + "lora_requests": self.lora_requests, + "lora_mapping": self.lora_mapping, + "multi_modal_kwargs": self.multi_modal_kwargs, + "real_batch_size": self.real_batch_size, + "batch_size_padded": self.batch_size_padded, + "virtual_engine": self.virtual_engine + } + _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) + return tensor_dict + + @classmethod + def from_broadcasted_tensor_dict( + cls: Type[TModelInputForHPU], + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, + ) -> TModelInputForHPU: + if attn_backend is not None: + tensor_dict = _init_attn_metadata_from_tensor_dict( + attn_backend, tensor_dict) + return cls(**tensor_dict) + + +@dataclasses.dataclass(frozen=True) +class ModelInputForHPUWithSamplingMetadata(ModelInputForHPU): + """ + Used by the ModelRunner. + """ + sampling_metadata: Optional["SamplingMetadata"] = None + # Used for speculative decoding. We do not broadcast it because it is only + # used by the driver worker. 
+ is_prompt: Optional[bool] = None + + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + tensor_dict = { + "input_tokens": self.input_tokens, + "input_positions": self.input_positions, + "lora_requests": self.lora_requests, + "lora_mapping": self.lora_mapping, + "multi_modal_kwargs": self.multi_modal_kwargs, + } + _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) + _add_sampling_metadata_broadcastable_dict(tensor_dict, + self.sampling_metadata) + return tensor_dict + + @classmethod + def from_broadcasted_tensor_dict( + cls, + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, + ) -> "ModelInputForHPUWithSamplingMetadata": + tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) + # FIXME(kzawora): this fails for whatever reason - why? + if attn_backend is not None: + tensor_dict = _init_attn_metadata_from_tensor_dict( + attn_backend, tensor_dict) + return cls(**tensor_dict) + + +class HabanaModelRunnerBase(ModelRunnerBase[TModelInputForHPU]): + """ + Helper class for shared methods between GPU model runners. + """ + _model_input_cls: Type[TModelInputForHPU] def __init__( self, @@ -382,7 +482,7 @@ def _setup_buckets(self) -> None: logger.info(msg) self.decode_buckets = warmup_buckets(self.decode_bs_bucket_cfg, self.decode_seq_bucket_cfg) - msg = ("Generated {len(self.decode_buckets)} decode buckets: " + msg = (f"Generated {len(self.decode_buckets)} decode buckets: " f"{list(sorted(self.decode_buckets))}") logger.info(msg) @@ -698,171 +798,154 @@ def _prepare_decode( def prepare_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, - Set[LoRARequest], LoRAMapping, torch.Tensor]: - if self.is_driver_worker: - prefill_reqs = [] - decode_reqs = [] - for seq_group_meta in seq_group_metadata_list: - if seq_group_meta.is_prompt: - prefill_reqs.append(seq_group_meta) - else: - decode_reqs.append(seq_group_meta) - - # Prepare input tensors. - ( - input_tokens, - input_positions, - prefill_attn_metadata, - seq_lens, - query_lens, + ) -> TModelInputForHPU: + if len(seq_group_metadata_list) == 0: + return self._model_input_cls() + + input_tokens = None + input_positions = None + lora_mapping = None + lora_requests = None + multi_modal_input = None + batch_type = None + seq_lens = None + query_lens = None + real_batch_size = None + batch_size_padded = None + + self.event_start = self.profiler.get_timestamp_us() + is_prompt = seq_group_metadata_list[0].is_prompt + base_event_name = 'prompt' if is_prompt else 'decode' + self.profiler.start('internal', base_event_name) + + real_batch_size = len(seq_group_metadata_list) + bucket_cfg = self.prompt_bs_bucket_cfg if is_prompt else \ + self.decode_bs_bucket_cfg + batch_size_padded = find_bucket(real_batch_size, bucket_cfg) + batch_size_padding = batch_size_padded - real_batch_size + seq_group_metadata_list = seq_group_metadata_list.copy() + seq_group_metadata_list.extend(seq_group_metadata_list[0] + for _ in range(batch_size_padding)) + + prefill_reqs = [] + decode_reqs = [] + for seq_group_meta in seq_group_metadata_list: + if seq_group_meta.is_prompt: + prefill_reqs.append(seq_group_meta) + else: + decode_reqs.append(seq_group_meta) + + # Prepare input tensors. 
+ ( + input_tokens, + input_positions, + prefill_attn_metadata, + seq_lens, + query_lens, + lora_index_mapping, + lora_prompt_mapping, + lora_requests, + multi_modal_input, + slot_mapping, + ) = self._prepare_prompt(prefill_reqs) + ( + decode_input_tokens, + decode_input_positions, + decode_attn_metadata, + decode_lora_index_mapping, + decode_lora_prompt_mapping, + decode_lora_requests, + decode_slot_mapping, + ) = self._prepare_decode(decode_reqs) + sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, + seq_lens, query_lens, + self.device, + self.pin_memory) + + if not self.scheduler_config.chunked_prefill_enabled: + assert (len(prefill_reqs) and len(decode_reqs)) == 0 + + num_prefills = len(seq_lens) + num_prefill_tokens = len(input_tokens) + num_decode_tokens = len(decode_input_tokens) + + # NOTE(kzawora): Here we diverge from GPU code - we don't + # support mixed batches, so we either use decode or prefill + # inputs, without coalescing. + assert (num_prefills == 0 and num_decode_tokens > 0) or ( + num_prefills > 0 + and num_decode_tokens == 0), "HPU does not support mixed batches!" + if num_decode_tokens > 0: + input_tokens = decode_input_tokens + input_positions = decode_input_positions + slot_mapping = decode_slot_mapping + lora_index_mapping = decode_lora_index_mapping + lora_prompt_mapping = decode_lora_prompt_mapping + lora_requests = decode_lora_requests + + # FIXME: We need to adjust selected_token_indices to accommodate + # for padding + max_len = input_tokens.size(1) + paddings = [max_len - s for s in seq_lens] + paddings = [0] + paddings[:-1] + paddings = list(itertools.accumulate(paddings)) + paddings = torch.tensor( + paddings, + dtype=sampling_metadata.selected_token_indices.dtype, + device=sampling_metadata.selected_token_indices.device) + sampling_metadata.selected_token_indices.add_(paddings) + + if self.lora_config: + lora_mapping = LoRAMapping( lora_index_mapping, lora_prompt_mapping, - lora_requests, - multi_modal_input, - slot_mapping, - ) = self._prepare_prompt(prefill_reqs) - ( - decode_input_tokens, - decode_input_positions, - decode_attn_metadata, - decode_lora_index_mapping, - decode_lora_prompt_mapping, - decode_lora_requests, - decode_slot_mapping, - ) = self._prepare_decode(decode_reqs) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, seq_lens, query_lens, self.device, - self.pin_memory) - - if not self.scheduler_config.chunked_prefill_enabled: - assert (len(prefill_reqs) and len(decode_reqs)) == 0 - - num_prefills = len(seq_lens) - num_prefill_tokens = len(input_tokens) - num_decode_tokens = len(decode_input_tokens) - - # NOTE(kzawora): Here we diverge from GPU code - we don't - # support mixed batches, so we either use decode or prefill - # inputs, without coalescing. - assert (num_prefills == 0 and num_decode_tokens > 0) or ( - num_prefills > 0 and num_decode_tokens - == 0), "HPU does not support mixed batches!" 
- if num_decode_tokens > 0: - input_tokens = decode_input_tokens - input_positions = decode_input_positions - slot_mapping = decode_slot_mapping - lora_index_mapping = decode_lora_index_mapping - lora_prompt_mapping = decode_lora_prompt_mapping - lora_requests = decode_lora_requests - - # FIXME: We need to adjust selected_token_indices to accommodate - # for padding - max_len = input_tokens.size(1) - paddings = [max_len - s for s in seq_lens] - paddings = [0] + paddings[:-1] - paddings = list(itertools.accumulate(paddings)) - paddings = torch.tensor( - paddings, - dtype=sampling_metadata.selected_token_indices.dtype, - device=sampling_metadata.selected_token_indices.device) - sampling_metadata.selected_token_indices.add_(paddings) - - if self.lora_config: - lora_mapping = LoRAMapping( - lora_index_mapping, - lora_prompt_mapping, - ) - else: - lora_mapping = None - - if (prefill_attn_metadata is not None - and decode_attn_metadata is not None): - batch_type = BatchType.MIXED - raise NotImplementedError( - "Mixed batch is not supported on HPU") - elif prefill_attn_metadata is not None: - batch_type = BatchType.PREFILL - else: - batch_type = BatchType.DECODE - - metadata_dict = { - "input_tokens": input_tokens, - "input_positions": input_positions, - "selected_token_indices": - sampling_metadata.selected_token_indices, - "lora_requests": lora_requests, - "lora_mapping": lora_mapping, - "multi_modal_input": multi_modal_input, - "num_prefill_tokens": num_prefill_tokens, - "num_decode_tokens": num_decode_tokens, - "slot_mapping": slot_mapping, - "num_prefills": num_prefills, - "batch_type": batch_type, - } - if prefill_attn_metadata is not None: - metadata_dict.update(prefill_attn_metadata.asdict_zerocopy()) - else: - assert decode_attn_metadata is not None - metadata_dict.update(decode_attn_metadata.asdict_zerocopy()) - broadcast_tensor_dict(metadata_dict, src=0) - - # Broadcast decode attn metadata for mixed batch type. - # The additional broadcast costs 300us overhead on 4 A10 GPUs. - # We can potentially reduce the overhead by coelescing tensors. - if batch_type == BatchType.MIXED: - assert decode_attn_metadata is not None - metadata_dict = decode_attn_metadata.asdict_zerocopy() - broadcast_tensor_dict(metadata_dict, src=0) - else: - metadata_dict = broadcast_tensor_dict(src=0) - input_tokens = metadata_dict.pop("input_tokens") - input_positions = metadata_dict.pop("input_positions") - selected_token_indices = metadata_dict.pop( - "selected_token_indices") - lora_mapping = metadata_dict.pop("lora_mapping") - lora_requests = metadata_dict.pop("lora_requests") - multi_modal_input = metadata_dict.pop("multi_modal_input") - batch_type = metadata_dict.pop("batch_type") - - # Create an attention metadata. - prefill_attn_metadata = None - decode_attn_metadata = None - if batch_type == BatchType.PREFILL or batch_type == BatchType.MIXED: - prefill_attn_metadata = self.attn_backend.make_metadata( - **metadata_dict) - else: - decode_attn_metadata = self.attn_backend.make_metadata( - **metadata_dict) - sampling_metadata = SamplingMetadata( - seq_groups=None, - selected_token_indices=selected_token_indices, - categorized_sample_indices=None, - num_prompts=0, ) - - # if it is a mixed batch, decode attn_metadata is broadcasted - # separately. 
- if batch_type == BatchType.MIXED: - metadata_dict = broadcast_tensor_dict(src=0) - decode_attn_metadata = self.attn_backend.make_metadata( - **metadata_dict) + else: + lora_mapping = None + + if (prefill_attn_metadata is not None + and decode_attn_metadata is not None): + batch_type = BatchType.MIXED + raise NotImplementedError("Mixed batch is not supported on HPU") + elif prefill_attn_metadata is not None: + batch_type = BatchType.PREFILL + else: + batch_type = BatchType.DECODE + + metadata_dict = { + "input_tokens": input_tokens, + "input_positions": input_positions, + "selected_token_indices": sampling_metadata.selected_token_indices, + "lora_requests": lora_requests, + "lora_mapping": lora_mapping, + "multi_modal_input": multi_modal_input, + "num_prefill_tokens": num_prefill_tokens, + "num_decode_tokens": num_decode_tokens, + "slot_mapping": slot_mapping, + "num_prefills": num_prefills, + "batch_type": batch_type, + "seq_lens": seq_lens, + "query_lens": query_lens + } + if prefill_attn_metadata is not None: + metadata_dict.update(prefill_attn_metadata.asdict_zerocopy()) + else: + assert decode_attn_metadata is not None + metadata_dict.update(decode_attn_metadata.asdict_zerocopy()) attn_metadata = prefill_attn_metadata if \ prefill_attn_metadata is not None else decode_attn_metadata - # attn_metadata = AttentionMetadata( - # num_prefills=num_prefills, - # slot_mapping=slot_mapping, - # num_prefill_tokens=num_prefill_tokens, - # num_decode_tokens=num_decode_tokens, - # prefill_metadata=prefill_attn_metadata, - # decode_metadata=decode_attn_metadata, - # kv_cache_dtype=self.kv_cache_dtype, - # ) - - return (input_tokens, input_positions, attn_metadata, - sampling_metadata, lora_requests, lora_mapping, - multi_modal_input) + + return self._model_input_cls(input_tokens=input_tokens, + seq_lens=seq_lens, + query_lens=query_lens, + input_positions=input_positions, + attn_metadata=attn_metadata, + lora_requests=lora_requests, + lora_mapping=lora_mapping, + multi_modal_kwargs=multi_modal_input, + real_batch_size=real_batch_size, + batch_size_padded=batch_size_padded) def _seq_len(self, attn_metadata): if attn_metadata.num_prefills != 0: @@ -897,110 +980,6 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: ]) return attention_metadata - @torch.inference_mode() - def execute_model( - self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - kv_caches: List[torch.Tensor], - ) -> Optional[SamplerOutput]: - if self.is_driver_worker: - assert seq_group_metadata_list is not None - event_start = self.profiler.get_timestamp_us() - is_prompt = seq_group_metadata_list[0].is_prompt - base_event_name = 'prompt' if is_prompt else 'decode' - self.profiler.start('internal', base_event_name) - - real_batch_size = len(seq_group_metadata_list) - bucket_cfg = self.prompt_bs_bucket_cfg if is_prompt else \ - self.decode_bs_bucket_cfg - batch_size_padded = find_bucket(real_batch_size, bucket_cfg) - batch_size_padding = batch_size_padded - real_batch_size - seq_group_metadata_list = seq_group_metadata_list.copy() - seq_group_metadata_list.extend(seq_group_metadata_list[0] - for _ in range(batch_size_padding)) - with self.profiler.record_event('internal', 'prepare_input_tensors'): - assert seq_group_metadata_list is not None - (input_tokens, input_positions, attn_metadata, sampling_metadata, - lora_requests, lora_mapping, multi_modal_input - ) = self.prepare_input_tensors(seq_group_metadata_list) - is_prompt = attn_metadata.is_prompt - - # NOTE(kzawora): Need to restore this after 
adding LoRA - # if self.lora_config: - # self.set_active_loras(lora_requests, lora_mapping) - - batch_size = input_tokens.size(0) - seq_len = self._seq_len(attn_metadata) - use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) - execute_model_kwargs = { - "input_ids": input_tokens, - "positions": input_positions, - "kv_caches": kv_caches, - "attn_metadata": self.trim_attn_metadata(attn_metadata), - } - if self.vision_language_config: - execute_model_kwargs.update({"image_input": multi_modal_input}) - - htorch.core.mark_step() - if self.is_driver_worker: - model_event_name = ("model_" - f"{'prompt' if is_prompt else 'decode'}_" - f"bs{batch_size}_" - f"seq{seq_len}_" - f"graphs{'T' if use_graphs else 'F'}") - else: - model_event_name = 'model_executable' - with self.profiler.record_event('internal', model_event_name): - hidden_states = self.model.forward( - **execute_model_kwargs, - selected_token_indices=sampling_metadata. - selected_token_indices, - bypass_hpu_graphs=not use_graphs) - - # Compute the logits. - with self.profiler.record_event( - 'internal', ('compute_logits_' - f'{"prompt" if is_prompt else "decode"}_bs' - f'{batch_size}_' - f'seq{seq_len}')): - sampling_metadata.selected_token_indices = None - logits = self.model.compute_logits(hidden_states, - sampling_metadata) - htorch.core.mark_step() - - # Only perform sampling in the driver worker. - if not self.is_driver_worker: - return None - - # Sample the next token. - with self.profiler.record_event( - 'internal', ('sample_' - f'{"prompt" if is_prompt else "decode"}_' - f'bs{batch_size}_' - f'seq{seq_len}')): - output = self.model.sample( - logits=logits, - sampling_metadata=sampling_metadata, - ) - output.outputs = output.outputs[:real_batch_size] - htorch.core.mark_step() - - if self.is_driver_worker and self.profiler.enabled: - # Stop recording 'execute_model' event - self.profiler.end() - event_end = self.profiler.get_timestamp_us() - counters = self.profiler_counter_helper.get_counter_dict( - cache_config=self.cache_config, - duration=event_end - event_start, - seq_len=seq_len, - batch_size_padded=batch_size_padded, - real_batch_size=real_batch_size, - seq_group_metadata_list=seq_group_metadata_list, - is_prompt=is_prompt) - self.profiler.record_counter(event_start, counters) - - return output - def create_dummy_seq_group_metadata(self, group_id, seq_len, is_prompt): sampling_params = SamplingParams(temperature=0) num_blocks = math.ceil(seq_len / self.block_size) @@ -1048,7 +1027,8 @@ def warmup_scenario(self, batch_size, seq_len, is_prompt, ] torch.hpu.synchronize() for _ in range(times): - self.execute_model(seqs, kv_caches) + inputs = self.prepare_model_input(seqs) + self.execute_model(inputs, kv_caches) torch.hpu.synchronize() self.profiler.end() gc.collect() @@ -1163,19 +1143,28 @@ def __init__(self): self.niter = 0 self.average_real_throughput = None self.logged_once = False + self.real_seq_lens = [] + self.prompt_seq_lens = [] - def get_counter_dict(self, cache_config, duration, seq_len, - batch_size_padded, real_batch_size, - seq_group_metadata_list, is_prompt): - throughput = batch_size_padded / (duration / 1e6) - throughput_effective = real_batch_size / (duration / 1e6) - real_seq_lens = [ + def capture_seq_group_metadata_stats(self, seq_group_metadata_list): + self.real_seq_lens = [ len(seq_data.prompt_token_ids) + len(seq_data.output_token_ids) for seq_group_metadata in seq_group_metadata_list for seq_data in seq_group_metadata.seq_data.values() ] - real_max_seq_len = max(real_seq_lens) - 
real_num_tokens = sum(real_seq_lens) + self.prompt_seq_lens = [ + len(seq_data.prompt_token_ids) + for seq_group_metadata in seq_group_metadata_list + for seq_data in seq_group_metadata.seq_data.values() + ] + + def get_counter_dict(self, cache_config, duration, seq_len, + batch_size_padded, real_batch_size, is_prompt): + throughput = batch_size_padded / (duration / 1e6) + throughput_effective = real_batch_size / (duration / 1e6) + + real_max_seq_len = max(self.real_seq_lens) + real_num_tokens = sum(self.real_seq_lens) padded_num_tokens = batch_size_padded * seq_len batch_token_utilization = real_num_tokens / padded_num_tokens if self.average_real_throughput is None: @@ -1198,14 +1187,10 @@ def get_counter_dict(self, cache_config, duration, seq_len, } self.niter += 1 if is_prompt: - prompt_seq_lens = [ - len(seq_data.prompt_token_ids) - for seq_group_metadata in seq_group_metadata_list - for seq_data in seq_group_metadata.seq_data.values() - ] prompt_bucket_in_throughput = (seq_len * batch_size_padded) / ( duration / 1e6) - prompt_real_in_throughput = sum(prompt_seq_lens) / (duration / 1e6) + prompt_real_in_throughput = sum( + self.prompt_seq_lens) / (duration / 1e6) counters[ f'{phase}_bucket_in_throughput'] = prompt_bucket_in_throughput counters[f'{phase}_real_in_throughput'] = prompt_real_in_throughput @@ -1214,7 +1199,8 @@ def get_counter_dict(self, cache_config, duration, seq_len, if cache_config.num_gpu_blocks is not None and \ cache_config.num_gpu_blocks != 0: cache_num_blocks_used = [ - math.ceil(sl / cache_config.block_size) for sl in real_seq_lens + math.ceil(sl / cache_config.block_size) + for sl in self.real_seq_lens ] cache_total_num_blocks_used = sum(cache_num_blocks_used) num_cache_blocks = cache_config.num_gpu_blocks @@ -1238,3 +1224,151 @@ def get_counter_dict(self, cache_config, duration, seq_len, counters['const_block_size'] = cache_config.block_size self.logged_once = True return counters + + +class HabanaModelRunner( + HabanaModelRunnerBase[ModelInputForHPUWithSamplingMetadata]): + """ + GPU model runner with sampling step. + """ + _model_input_cls: Type[ModelInputForHPUWithSamplingMetadata] = ( + ModelInputForHPUWithSamplingMetadata) + + def make_model_input_from_broadcasted_tensor_dict( + self, + tensor_dict: Dict[str, Any], + ) -> ModelInputForHPUWithSamplingMetadata: + return ( + ModelInputForHPUWithSamplingMetadata.from_broadcasted_tensor_dict( + tensor_dict, + attn_backend=self.attn_backend, + )) + + def prepare_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + virtual_engine: int = 0, + finished_requests_ids: Optional[List[str]] = None + ) -> ModelInputForHPUWithSamplingMetadata: + """Prepare the model input based on a given sequence group, including + metadata for the sampling step. + The API assumes seq_group_metadata_list is sorted by prefill -> decode. + The result tensors and data structure also batches input in prefill + -> decode order. For example, + - input_tokens[:num_prefill_tokens] contains prefill tokens. + - input_tokens[num_prefill_tokens:] contains decode tokens. + If cuda graph is required, this API automatically pads inputs. 
+ """ + with self.profiler.record_event('internal', 'prepare_input_tensors'): + assert seq_group_metadata_list is not None + self.profiler_counter_helper.capture_seq_group_metadata_stats( + seq_group_metadata_list=seq_group_metadata_list) + model_input = self.prepare_input_tensors(seq_group_metadata_list) + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, model_input.seq_lens, + model_input.query_lens, self.device, self.pin_memory) + assert model_input.attn_metadata is not None + is_prompt = model_input.attn_metadata.is_prompt + + return dataclasses.replace(model_input, + sampling_metadata=sampling_metadata, + is_prompt=is_prompt, + virtual_engine=virtual_engine) + + @torch.inference_mode() + def execute_model( + self, + model_input: ModelInputForHPUWithSamplingMetadata, + kv_caches: List[torch.Tensor], + intermediate_tensors: Optional[IntermediateTensors] = None, + num_steps: int = 1, + ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: + if num_steps > 1: + raise ValueError( + "num_steps > 1 is not supported in HabanaModelRunner") + + # NOTE(kzawora): Need to restore this after adding LoRA + # if self.lora_config: + # self.set_active_loras(lora_requests, lora_mapping) + input_tokens = model_input.input_tokens + input_positions = model_input.input_positions + attn_metadata = model_input.attn_metadata + sampling_metadata = model_input.sampling_metadata + multi_modal_input = model_input.multi_modal_kwargs + real_batch_size = model_input.real_batch_size + batch_size_padded = model_input.batch_size_padded + is_prompt = model_input.is_prompt + assert input_tokens is not None + assert input_positions is not None + assert attn_metadata is not None + assert is_prompt is not None + assert sampling_metadata is not None + batch_size = input_tokens.size(0) + seq_len = self._seq_len(attn_metadata) + use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) + execute_model_kwargs = { + "input_ids": input_tokens, + "positions": input_positions, + "kv_caches": kv_caches, + "attn_metadata": self.trim_attn_metadata(attn_metadata), + "intermediate_tensors": intermediate_tensors + } + if self.vision_language_config: + execute_model_kwargs.update({"image_input": multi_modal_input}) + + htorch.core.mark_step() + if self.is_driver_worker: + model_event_name = ("model_" + f"{'prompt' if is_prompt else 'decode'}_" + f"bs{batch_size}_" + f"seq{seq_len}_" + f"graphs{'T' if use_graphs else 'F'}") + else: + model_event_name = 'model_executable' + with self.profiler.record_event('internal', model_event_name): + hidden_states = self.model.forward( + **execute_model_kwargs, + selected_token_indices=sampling_metadata. + selected_token_indices, + bypass_hpu_graphs=not use_graphs) + + # Compute the logits. + with self.profiler.record_event( + 'internal', ('compute_logits_' + f'{"prompt" if is_prompt else "decode"}_bs' + f'{batch_size}_' + f'seq{seq_len}')): + sampling_metadata.selected_token_indices = None + logits = self.model.compute_logits(hidden_states, + sampling_metadata) + htorch.core.mark_step() + # Only perform sampling in the driver worker. + if not self.is_driver_worker: + return [] + + # Sample the next token. 
+ with self.profiler.record_event( + 'internal', ('sample_' + f'{"prompt" if is_prompt else "decode"}_' + f'bs{batch_size}_' + f'seq{seq_len}')): + output = self.model.sample( + logits=logits, + sampling_metadata=sampling_metadata, + ) + output.outputs = output.outputs[:real_batch_size] + htorch.core.mark_step() + + if self.is_driver_worker and self.profiler.enabled: + # Stop recording 'execute_model' event + self.profiler.end() + event_end = self.profiler.get_timestamp_us() + counters = self.profiler_counter_helper.get_counter_dict( + cache_config=self.cache_config, + duration=event_end - self.event_start, + seq_len=seq_len, + batch_size_padded=batch_size_padded, + real_batch_size=real_batch_size, + is_prompt=is_prompt) + self.profiler.record_counter(self.event_start, counters) + return [output] diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 63055bf4f2055..f91d6bc5cefa9 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -4,27 +4,26 @@ import gc import os -from typing import Any, Dict, List, Optional, Set, Tuple +from typing import List, Optional, Set, Tuple -import habana_frameworks.torch as htorch +import habana_frameworks.torch as htorch # noqa:F401 import torch import torch.distributed from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, VisionLanguageConfig) -from vllm.distributed import (broadcast_tensor_dict, - ensure_model_parallel_initialized, +from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.sequence import ExecuteModelRequest from vllm.worker.cache_engine import CacheEngine from vllm.worker.habana_model_runner import HabanaModelRunner -from vllm.worker.worker_base import WorkerBase +from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerInput -class HabanaWorker(WorkerBase): +class HabanaWorker(LocalOrDistributedWorkerBase): """A worker class that executes (a partition of) the model on a HPU. Each worker is associated with a single HPU. The worker is responsible for @@ -72,20 +71,21 @@ def __init__( "To be tested: vision language model with LoRA settings.") raise AssertionError("To be tested: vision language model on HPU") - self.model_runner = HabanaModelRunner( + self.model_runner: HabanaModelRunner = HabanaModelRunner( model_config, parallel_config, scheduler_config, device_config, - load_config=load_config, cache_config=cache_config, + load_config=load_config, lora_config=self.lora_config, kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by # initialize_cache. 
- self.cache_engine: CacheEngine - self.hpu_cache: List[torch.Tensor] + self.cache_engine: List[CacheEngine] + # Initialize gpu_cache as embedding models don't initialize kv_caches + self.hpu_cache: Optional[List[List[torch.tensor]]] = None def init_device(self) -> None: if self.device_config.device.type == "hpu": @@ -164,112 +164,78 @@ def initialize_cache(self, num_gpu_blocks: int, self._init_cache_engine() self._warm_up_model() - def _init_cache_engine(self) -> None: + def _init_cache_engine(self): assert self.cache_config.num_gpu_blocks is not None - self.cache_engine = CacheEngine(self.cache_config, self.model_config, - self.parallel_config, - self.device_config) - self.hpu_cache = self.cache_engine.gpu_cache - # we want to materialize cache tensors before we proceed with - # graph capture/execution - htorch.hpu.synchronize() + self.cache_engine = [ + CacheEngine(self.cache_config, self.model_config, + self.parallel_config, self.device_config) + for _ in range(self.parallel_config.pipeline_parallel_size) + ] + self.hpu_cache = [ + self.cache_engine[ve].gpu_cache + for ve in range(self.parallel_config.pipeline_parallel_size) + ] def _warm_up_model(self) -> None: - self.model_runner.warmup_model(self.hpu_cache) + # NOTE(kzawora): We should use virtual engine index here + # for pipeline parallelism. Using 0 for now. + assert self.hpu_cache is not None + self.model_runner.warmup_model(self.hpu_cache[0]) # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. set_random_seed(self.model_config.seed) - def cache_swap( - self, - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: torch.Tensor, - ) -> None: - # Issue cache operations. - # TODO(woosuk): Profile swapping overhead and optimize if needed. - if blocks_to_swap_in: - self.cache_engine.swap_in(blocks_to_swap_in) - if blocks_to_swap_out: - self.cache_engine.swap_out(blocks_to_swap_out) - if blocks_to_copy.numel() > 0: - self.cache_engine.copy(blocks_to_copy) + @property + def do_metadata_broadcast(self) -> bool: + return self.parallel_config.tensor_parallel_size > 1 - @torch.inference_mode() - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: - if execute_model_req is None: - seq_group_metadata_list = None - else: - seq_group_metadata_list = execute_model_req.seq_group_metadata_list + @property + def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: + return self.hpu_cache - if self.is_driver_worker: - assert seq_group_metadata_list is not None - assert execute_model_req is not None - num_seq_groups = len(seq_group_metadata_list) - blocks_to_swap_in = execute_model_req.blocks_to_swap_in - blocks_to_swap_out = execute_model_req.blocks_to_swap_out - blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, - device=self.device, + @torch.inference_mode() + def prepare_worker_input( + self, execute_model_req: ExecuteModelRequest) -> WorkerInput: + virtual_engine = execute_model_req.virtual_engine + num_seq_groups = len(execute_model_req.seq_group_metadata_list) + # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors. + # they contain parameters to launch cudamemcpyasync. 
+ blocks_to_swap_in = torch.tensor(execute_model_req.blocks_to_swap_in, + device="cpu", + dtype=torch.int64).view(-1, 2) + blocks_to_swap_out = torch.tensor(execute_model_req.blocks_to_swap_out, + device="cpu", dtype=torch.int64).view(-1, 2) - data: Dict[str, Any] = { - "num_seq_groups": num_seq_groups, - "blocks_to_swap_in": blocks_to_swap_in, - "blocks_to_swap_out": blocks_to_swap_out, - "blocks_to_copy": blocks_to_copy, - } - broadcast_tensor_dict(data, src=0) - else: - data = broadcast_tensor_dict(src=0) - num_seq_groups = data["num_seq_groups"] - blocks_to_swap_in = data["blocks_to_swap_in"] - blocks_to_swap_out = data["blocks_to_swap_out"] - blocks_to_copy = data["blocks_to_copy"] - - self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) - - # If there is no input, we don't need to execute the model. - if num_seq_groups == 0: - return [] - - output = self.model_runner.execute_model(seq_group_metadata_list, - self.hpu_cache) - return [output] + # `blocks_to_copy` is a gpu tensor. The src and tgt of + # blocks to copy are in the same device, and `blocks_to_copy` + # can be used directly within cuda kernels. + blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, + device=self.device, + dtype=torch.int64).view(-1, 2) + + return WorkerInput( + num_seq_groups=num_seq_groups, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + virtual_engine=virtual_engine, + ) @torch.inference_mode() - def start_worker_execution_loop(self) -> None: - """Execute model loop in parallel worker. - - You can stop the loop by executing a driver worker with an empty output. - See `stop_remote_worker_execution_loop` for more details. - """ - while self._execute_model_non_driver(): - pass - - def _execute_model_non_driver(self) -> bool: - """Execute model in parallel worker. - - Returns True iff there are remaining sequences to process. - """ - assert not self.is_driver_worker - data = broadcast_tensor_dict(src=0) - if not data: - return False - - num_seq_groups = data.get("num_seq_groups", 0) - blocks_to_swap_in = data.get("blocks_to_swap_in") - blocks_to_swap_out = data.get("blocks_to_swap_out") - blocks_to_copy = data.get("blocks_to_copy") - self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) - - # If there is no input, we don't need to execute the model. - if num_seq_groups == 0: - return False - - self.model_runner.execute_model(None, self.hpu_cache) - return True + def execute_worker(self, worker_input: WorkerInput) -> None: + virtual_engine = worker_input.virtual_engine + # Issue cache operations. 
+ if (worker_input.blocks_to_swap_in is not None + and worker_input.blocks_to_swap_in.numel() > 0): + self.cache_engine[virtual_engine].swap_in( + worker_input.blocks_to_swap_in) + if (worker_input.blocks_to_swap_out is not None + and worker_input.blocks_to_swap_out.numel() > 0): + self.cache_engine[virtual_engine].swap_out( + worker_input.blocks_to_swap_out) + if (worker_input.blocks_to_copy is not None + and worker_input.blocks_to_copy.numel() > 0): + self.cache_engine[virtual_engine].copy(worker_input.blocks_to_copy) def add_lora(self, lora_request: LoRARequest) -> bool: raise NotImplementedError("LoRA is not implemented for HPU backend.") diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index bc0960fa16221..4990511789e11 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -1,7 +1,7 @@ import dataclasses from abc import ABC, abstractmethod from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Type, - TypeVar) + TypeVar, Union, get_args, get_origin) import torch @@ -39,9 +39,14 @@ def _init_attn_metadata_from_tensor_dict( valid_attn_kwargs = {} for field in dataclasses.fields(attn_backend.get_metadata_cls()): val = tensor_dict.pop(field.name, None) - if val is not None: + # NOTE(kzawora): None is a valid value if type is optional. If + # we don't check against it, we will crash by not assigning + # Optional types without default value, even if they are + # broadcasted properly. + is_field_optional = get_origin(field.type) is Union and \ + type(None) in get_args(field.type) + if val is not None or (val is None and is_field_optional): valid_attn_kwargs[field.name] = val - attn_metadata = attn_backend.make_metadata(**valid_attn_kwargs) tensor_dict["attn_metadata"] = attn_metadata return tensor_dict From bca41a146ba511a463811104e55f535755eaeac7 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 3 Jul 2024 19:24:55 +0300 Subject: [PATCH 082/341] fix is_prompt for mixtral --- vllm/worker/habana_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index f3e2e976c1c5d..6669c00a49647 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1297,12 +1297,12 @@ def execute_model( multi_modal_input = model_input.multi_modal_kwargs real_batch_size = model_input.real_batch_size batch_size_padded = model_input.batch_size_padded - is_prompt = model_input.is_prompt assert input_tokens is not None assert input_positions is not None + assert sampling_metadata is not None assert attn_metadata is not None + is_prompt = attn_metadata.is_prompt assert is_prompt is not None - assert sampling_metadata is not None batch_size = input_tokens.size(0) seq_len = self._seq_len(attn_metadata) use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) From 717c0ce6b96369bf0526f3b98cf5f7fc662e5b0b Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 4 Jul 2024 14:10:19 +0300 Subject: [PATCH 083/341] restore HPU autodetection --- setup.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/setup.py b/setup.py index 897958d875284..de37558d738c4 100644 --- a/setup.py +++ b/setup.py @@ -206,10 +206,6 @@ def build_extensions(self) -> None: def _is_hpu() -> bool: - is_hpu_available = True - # FIXME(kzawora): HPU autodetection sporadically fails on certain clients. - # Need to find the cause and fix it. 
- return is_hpu_available try: subprocess.run(["hl-smi"], capture_output=True, check=True) except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): From 8a4c5c19469edcc7c0b4b51b43b4a103ad93a287 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Kuligowski?= Date: Thu, 4 Jul 2024 13:14:42 +0200 Subject: [PATCH 084/341] SiLU memory leak in fwd --- vllm/model_executor/layers/activation.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index b2641cf89bdc5..5bfdba67b443d 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -37,15 +37,6 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: ops.silu_and_mul(out, x) return out - def forward_hpu(self, x: torch.Tensor) -> torch.Tensor: - import vllm.hpu.ops as ops - - d = x.shape[-1] // 2 - output_shape = (x.shape[:-1] + (d, )) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.silu_and_mul(out, x) - return out - def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: from vllm._ipex_ops import ipex_ops as ops From c5cd04aabbda514b8690f2bef34793868baae393 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 4 Jul 2024 14:17:05 +0300 Subject: [PATCH 085/341] add WA for model loader --- .../layers/vocab_parallel_embedding.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index d70eb1c2704b4..e1212ab8b6376 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -12,7 +12,7 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.utils import set_weight_attrs - +from vllm.utils import is_hpu DEFAULT_VOCAB_PADDING_SIZE = 64 @@ -327,8 +327,15 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): # Copy the data. loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - param[:loaded_weight.shape[0]].data.copy_(loaded_weight) - param[loaded_weight.shape[0]:].data.fill_(0) + + # FIXME(kzawora): Weight copy with slicing bugs out on Gaudi here, so + # we're using a workaround. Remove this when fixed in HPU PT bridge. 
+ if is_hpu(): + padded_weight = torch.cat([loaded_weight, torch.zeros(param.shape[0] - loaded_weight.shape[0], *loaded_weight.shape[1:])]) + param.data.copy_(padded_weight) + else: + param[:loaded_weight.shape[0]].data.copy_(loaded_weight) + param[loaded_weight.shape[0]:].data.fill_(0) def forward(self, input_): if self.tp_size > 1: From 9efb594ed5a0af4a71dd4ed5727aeff733743dcd Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 4 Jul 2024 14:22:50 +0300 Subject: [PATCH 086/341] remove hpu model loader WA --- vllm/model_executor/model_loader/loader.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 2bdb7438f6eaf..96a29cabe87e1 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -263,9 +263,7 @@ def load_model(self, *, model_config: ModelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module: with set_default_torch_dtype(model_config.dtype): - load_device = torch.device(device_config.device) if not is_hpu( - ) else 'cpu' # FIXME(kzawora): this is a nasty workaround!!! - with torch.device(load_device): + with torch.device(torch.device(device_config.device)): model = _initialize_model(model_config, self.load_config, lora_config, multimodal_config, cache_config) @@ -285,9 +283,6 @@ def load_model(self, *, model_config: ModelConfig, # to use quant_method. if hasattr(module, "process_weights_after_loading"): module.process_weights_after_loading() - if is_hpu(): - model = model.to( - 'hpu') # FIXME(kzawora): this is a nasty workaround!!! return model.eval() From def464e3d70a5554c12758995ae7c57dc02065f6 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 4 Jul 2024 14:43:01 +0300 Subject: [PATCH 087/341] fix hpu autodetection (again) --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index de37558d738c4..bc71888fc933a 100644 --- a/setup.py +++ b/setup.py @@ -206,6 +206,7 @@ def build_extensions(self) -> None: def _is_hpu() -> bool: + is_hpu_available = True try: subprocess.run(["hl-smi"], capture_output=True, check=True) except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): From 30f36f03ca269b5b1bb35266171797bfe8b0229b Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 4 Jul 2024 14:43:37 +0300 Subject: [PATCH 088/341] fix VLM configs in hpu components --- vllm/executor/habana_executor.py | 2 +- vllm/worker/habana_model_runner.py | 14 +++++++------- vllm/worker/habana_worker.py | 14 ++++++-------- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index b771b9e026970..012872c72d974 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -43,7 +43,7 @@ def _get_worker_kwargs( rank=rank, distributed_init_method=distributed_init_method, lora_config=self.lora_config, - vision_language_config=self.vision_language_config, + multimodal_config=self.multimodal_config, is_driver_worker=rank == 0, ) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 6669c00a49647..8a698306fe2c5 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -20,7 +20,7 @@ from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, - VisionLanguageConfig) + 
MultiModalConfig) from vllm.distributed.parallel_state import get_world_group from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping @@ -344,7 +344,7 @@ def __init__( lora_config: Optional[LoRAConfig], kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, - vision_language_config: Optional[VisionLanguageConfig] = None, + multimodal_config: Optional[MultiModalConfig] = None, ): self.model_config = model_config self.parallel_config = parallel_config @@ -370,7 +370,7 @@ def __init__( self.pin_memory = is_pin_memory_available() self.kv_cache_dtype = kv_cache_dtype - self.vision_language_config = vision_language_config + self.multimodal_config = multimodal_config self.attn_backend = get_attn_backend( self.model_config.get_num_attention_heads(self.parallel_config), @@ -399,7 +399,7 @@ def load_model(self) -> None: device_config=self.device_config, load_config=self.load_config, lora_config=self.lora_config, - vision_language_config=self.vision_language_config, + multimodal_config=self.multimodal_config, parallel_config=self.parallel_config, scheduler_config=self.scheduler_config, cache_config=self.cache_config) @@ -615,7 +615,7 @@ def _prepare_prompt( device=self.device) if multi_modal_input_list: - assert self.vision_language_config, ( + assert self.multimodal_config, ( "Multi-modal inputs are only supported by " "vision language models.") multi_modal_input = torch.cat(multi_modal_input_list, @@ -1313,8 +1313,8 @@ def execute_model( "attn_metadata": self.trim_attn_metadata(attn_metadata), "intermediate_tensors": intermediate_tensors } - if self.vision_language_config: - execute_model_kwargs.update({"image_input": multi_modal_input}) + if multi_modal_input is not None: + execute_model_kwargs.update(multi_modal_input) htorch.core.mark_step() if self.is_driver_worker: diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index f91d6bc5cefa9..bde037b990a96 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -12,7 +12,7 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, - SpeculativeConfig, VisionLanguageConfig) + SpeculativeConfig, MultiModalConfig) from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.lora.request import LoRARequest @@ -43,7 +43,7 @@ def __init__( rank: int, distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, - vision_language_config: Optional[VisionLanguageConfig] = None, + multimodal_config: Optional[MultiModalConfig] = None, speculative_config: Optional[SpeculativeConfig] = None, is_driver_worker: bool = False, ) -> None: @@ -65,11 +65,7 @@ def __init__( # note: lazy import to avoid importing torch before initializing from vllm.utils import init_cached_hf_modules init_cached_hf_modules() - self.vision_language_config = vision_language_config - if self.vision_language_config: - assert not self.lora_config, ( - "To be tested: vision language model with LoRA settings.") - raise AssertionError("To be tested: vision language model on HPU") + self.multimodal_config = multimodal_config self.model_runner: HabanaModelRunner = HabanaModelRunner( model_config, @@ -80,7 +76,9 @@ def __init__( load_config=load_config, lora_config=self.lora_config, kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=is_driver_worker) + multimodal_config=self.multimodal_config, + is_driver_worker=is_driver_worker + ) # Uninitialized cache engine. 
Will be initialized by # initialize_cache. self.cache_engine: List[CacheEngine] From 1dd85025015f5433568d6b3c0525fe629b14f37d Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 4 Jul 2024 18:42:40 +0300 Subject: [PATCH 089/341] fix hpu autodetection --- setup.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index bc71888fc933a..8c585cc822d86 100644 --- a/setup.py +++ b/setup.py @@ -212,7 +212,14 @@ def _is_hpu() -> bool: except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): if not os.path.exists('/dev/accel/accel0') and not os.path.exists( '/dev/accel/accel_controlD0'): - is_hpu_available = False + # last resort... + try: + output = subprocess.check_output( + 'lsmod | grep habanalabs | wc -l', shell=True) + is_hpu_available = int(output) > 0 + except (ValueError, FileNotFoundError, PermissionError, + subprocess.CalledProcessError): + is_hpu_available = False return is_hpu_available From 0836502ee679101ddf250a4e2069eff141756184 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 4 Jul 2024 18:03:47 +0300 Subject: [PATCH 090/341] Remove invasive ALiBi changes --- vllm/attention/backends/abstract.py | 1 - vllm/attention/backends/habana_attn.py | 2 ++ vllm/attention/layer.py | 3 +-- vllm/model_executor/models/mpt.py | 3 +-- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 55d9a43b35652..40768532f59c2 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -116,7 +116,6 @@ def __init__( sliding_window: Optional[int] = None, kv_cache_dtype: str = "auto", blocksparse_params: Optional[Dict[str, Any]] = None, - max_seq_len: Optional[int] = 4096, ) -> None: raise NotImplementedError diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 98c16fdca4c3f..6b1695ba3fd52 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -146,6 +146,8 @@ def __init__( self.position_bias = None self.alibi_slopes = alibi_slopes if alibi_slopes is not None: + # FIXME(kzawora): Need a general method to set max_seq_len on + # per-model basis. 
alibi_slopes_tensor = torch.tensor(alibi_slopes, dtype=torch.bfloat16) self.position_bias = _make_alibi_bias(alibi_slopes_tensor, diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 8e796e7a50d59..dfe93be462184 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -34,7 +34,6 @@ def __init__( cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, blocksparse_params: Optional[Dict[str, Any]] = None, - max_seq_len: Optional[int] = 4096, ) -> None: super().__init__() if cache_config is not None: @@ -82,7 +81,7 @@ def __init__( impl_cls = attn_backend.get_impl_cls() self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, - blocksparse_params, max_seq_len) + blocksparse_params) def forward( self, diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 7a753bf96c9d9..7d658b39e6794 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -111,8 +111,7 @@ def __init__( alibi_slopes=alibi_slopes, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config, - max_seq_len=config.max_seq_len) + quant_config=quant_config) def forward( self, From a2f361ccd4cc78a4c166a0e8ca4adf414a6ac652 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 4 Jul 2024 18:52:05 +0300 Subject: [PATCH 091/341] add VLLM_TARGET_DEVICE='hpu' --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8c585cc822d86..e0a0d648dc0a6 100644 --- a/setup.py +++ b/setup.py @@ -220,7 +220,7 @@ def _is_hpu() -> bool: except (ValueError, FileNotFoundError, PermissionError, subprocess.CalledProcessError): is_hpu_available = False - return is_hpu_available + return is_hpu_available or VLLM_TARGET_DEVICE == "hpu" def _is_cuda() -> bool: From 08ba3880d27b3fd6e29847b8ba0de73431c95b3d Mon Sep 17 00:00:00 2001 From: Tomasz Zielinski Date: Mon, 15 Jul 2024 17:50:58 +0300 Subject: [PATCH 092/341] Added docstring and assertion to warmup_range --- vllm/worker/habana_model_runner.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 8a698306fe2c5..15ae8aa8e7b53 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -52,12 +52,14 @@ _TYPE_CACHE = {} -# Read bucketing configuration from env variables -# phase is either 'prompt' or 'decode' -# dim is either 'bs' or 'seq' -# param is either 'min', 'step' or 'max' -# example env variable: VLLM_DECODE_BS_BUCKET_STEP=128 def read_bucket_settings(phase: str, dim: str, **defaults): + """Read bucketing configuration from env variables. + + phase is either 'prompt' or 'decode' + dim is either 'bs' or 'block' + param is either 'min', 'step' or 'max' + example env variable: VLLM_DECODE_BS_BUCKET_STEP=128 + """ params = ['min', 'step', 'max'] values = [ int( @@ -68,7 +70,19 @@ def read_bucket_settings(phase: str, dim: str, **defaults): def warmup_range(config: Tuple[int, int, int]): + """Generate a warmup range. + + Start from bmin and multiply by 2 until you reach bstep. + Then, increase the values in the range by the value of bstep until you reach bmax. + + Example: + bmin = 2, bstep = 32, bmax = 64 + => ramp_up = (2, 4, 8, 16) + => stable = (32, 64) + => return ramp_up + stable => (2, 4, 8, 16, 32, 64) + """ bmin, bstep, bmax = config + assert bmin <= bmax, "Min. 
batch size cannot be greater than max. batch size. If you want to skip warmup, set VLLM_SKIP_WARMUP=true" base = itertools.repeat(2) ramp_up_acc = itertools.accumulate(base, func=operator.mul, initial=bmin) ramp_up_tw = itertools.takewhile(lambda x: x < bstep and x <= bmax, \ From 6bed24889bd4d94d87d8c0839a917371d67e23fa Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 15 Jul 2024 19:50:16 +0300 Subject: [PATCH 093/341] fix api mismatches --- vllm/attention/backends/habana_attn.py | 3 ++- vllm/executor/habana_executor.py | 14 ++++++++++++++ .../layers/vocab_parallel_embedding.py | 9 +++++++-- vllm/worker/habana_worker.py | 3 +-- 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 6b1695ba3fd52..a26b2f42333d0 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -9,7 +9,7 @@ import vllm.hpu.ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionMetadata) + AttentionMetadata, AttentionType) from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention, HabanaPagedAttentionMetadata) from vllm.logger import init_logger @@ -172,6 +172,7 @@ def forward( kv_cache: torch.Tensor, attn_metadata: HabanaAttentionMetadata, kv_scale: float = 1.0, + attn_type: AttentionType = AttentionType.DECODER, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index 012872c72d974..8750c3b00dd9e 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -9,6 +9,7 @@ from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import (HabanaMemoryProfiler, get_distributed_init_method, get_ip, get_open_port, make_async) @@ -159,6 +160,19 @@ def list_loras(self) -> Set[int]: def pin_lora(self, lora_id: int) -> bool: raise NotImplementedError("LoRA is not implemented for HPU backend.") + def add_prompt_adapter( + self, prompt_adapter_request: PromptAdapterRequest) -> bool: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + def list_prompt_adapters(self) -> Set[int]: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + def check_health(self) -> None: # GPUExecutor will always be healthy as long as # it's running. diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index e1212ab8b6376..7860ec511571b 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -13,6 +13,7 @@ QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.utils import set_weight_attrs from vllm.utils import is_hpu + DEFAULT_VOCAB_PADDING_SIZE = 64 @@ -328,10 +329,14 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): # Copy the data. 
loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - # FIXME(kzawora): Weight copy with slicing bugs out on Gaudi here, so + # FIXME(kzawora): Weight copy with slicing bugs out on Gaudi here, so # we're using a workaround. Remove this when fixed in HPU PT bridge. if is_hpu(): - padded_weight = torch.cat([loaded_weight, torch.zeros(param.shape[0] - loaded_weight.shape[0], *loaded_weight.shape[1:])]) + padded_weight = torch.cat([ + loaded_weight, + torch.zeros(param.shape[0] - loaded_weight.shape[0], + *loaded_weight.shape[1:]) + ]) param.data.copy_(padded_weight) else: param[:loaded_weight.shape[0]].data.copy_(loaded_weight) diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index bde037b990a96..49bf1b6d7d2ef 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -77,8 +77,7 @@ def __init__( lora_config=self.lora_config, kv_cache_dtype=self.cache_config.cache_dtype, multimodal_config=self.multimodal_config, - is_driver_worker=is_driver_worker - ) + is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by # initialize_cache. self.cache_engine: List[CacheEngine] From 03dbee5ba5bc70a3a5aa2ba2190e3586b6f095c5 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 15 Jul 2024 20:02:28 +0300 Subject: [PATCH 094/341] add assert for attn type --- vllm/attention/backends/habana_attn.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index a26b2f42333d0..2f620a8e98947 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -185,6 +185,11 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "HabanaAttentionImpl") batch_size, seq_len, hidden_size = query.shape _, seq_len_kv, _ = key.shape From 8c58a6634c3a5d46950d9e6fd811ebaffea8ac80 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 15 Jul 2024 20:52:07 +0300 Subject: [PATCH 095/341] multi-hpu fixes --- vllm/executor/ray_habana_executor.py | 2 +- vllm/worker/habana_worker.py | 19 +++++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index 9f57de8d2e060..1e03fbdab32c2 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -49,7 +49,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", if (self.parallel_config.tensor_parallel_size == 1 and self.parallel_config.pipeline_parallel_size == 1): # For single GPU case, we use a ray worker with constrained memory. - num_gpus = self.cache_config.gpu_memory_utilization + num_gpus = 1 else: # Otherwise, the ray workers are allocated with a full GPU. 
num_gpus = 1 diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 49bf1b6d7d2ef..ad87179a1147d 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -11,12 +11,13 @@ import torch.distributed from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ParallelConfig, SchedulerConfig, + ModelConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig, SpeculativeConfig, MultiModalConfig) from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed +from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest from vllm.worker.cache_engine import CacheEngine from vllm.worker.habana_model_runner import HabanaModelRunner @@ -44,7 +45,8 @@ def __init__( distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, multimodal_config: Optional[MultiModalConfig] = None, - speculative_config: Optional[SpeculativeConfig] = None, + speculative_config: Optional[SpeculativeConfig] = None, + prompt_adapter_config: Optional[PromptAdapterConfig] = None, is_driver_worker: bool = False, ) -> None: self.model_config = model_config @@ -246,6 +248,19 @@ def list_loras(self) -> Set[int]: def pin_lora(self, lora_id: int) -> bool: raise NotImplementedError("LoRA is not implemented for HPU backend.") + def add_prompt_adapter( + self, prompt_adapter_request: PromptAdapterRequest) -> bool: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + def list_prompt_adapters(self) -> Set[int]: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + @property def max_model_len(self) -> int: return self.model_config.max_model_len From d7afbf2804cd2366208ffcc17259b5ba1e8f90d7 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 15 Jul 2024 21:15:52 +0300 Subject: [PATCH 096/341] minor formatting stuff --- vllm/executor/ray_habana_executor.py | 9 +++++++++ vllm/model_executor/model_loader/loader.py | 2 +- vllm/worker/habana_worker.py | 7 ++++--- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index 1e03fbdab32c2..96b08a4dd3895 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -319,6 +319,15 @@ async def _driver_execute_model_async( self, execute_model_req: Optional[ExecuteModelRequest] = None ) -> List[SamplerOutput]: + if self.pp_locks is None: + # This locks each pipeline parallel stage so multiple virtual + # engines can't execute on the same stage at the same time + # We create the locks here to avoid creating them in the constructor + # which uses a different asyncio loop. 
+ self.pp_locks = [ + asyncio.Lock() + for _ in range(self.parallel_config.pipeline_parallel_size) + ] async def _run_task_with_lock(task, lock, *args, **kwargs): async with lock: diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 96a29cabe87e1..294dbb91e735d 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -36,7 +36,7 @@ supports_vision) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils import is_hpu, is_tpu +from vllm.utils import is_tpu logger = init_logger(__name__) diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index ad87179a1147d..6be229e037d06 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -11,8 +11,9 @@ import torch.distributed from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig, MultiModalConfig) + ModelConfig, MultiModalConfig, ParallelConfig, + PromptAdapterConfig, SchedulerConfig, + SpeculativeConfig) from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.lora.request import LoRARequest @@ -45,7 +46,7 @@ def __init__( distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, multimodal_config: Optional[MultiModalConfig] = None, - speculative_config: Optional[SpeculativeConfig] = None, + speculative_config: Optional[SpeculativeConfig] = None, prompt_adapter_config: Optional[PromptAdapterConfig] = None, is_driver_worker: bool = False, ) -> None: From 2b2549ca3de4f230138cf2e6afe391aa3acdc4bc Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 15 Jul 2024 21:16:04 +0300 Subject: [PATCH 097/341] fix sampling metadata for prefill --- vllm/worker/habana_model_runner.py | 42 ++++++++++++++++-------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 15ae8aa8e7b53..15ac0035228cc 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -19,8 +19,8 @@ from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ParallelConfig, SchedulerConfig, - MultiModalConfig) + ModelConfig, MultiModalConfig, ParallelConfig, + SchedulerConfig) from vllm.distributed.parallel_state import get_world_group from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping @@ -73,7 +73,8 @@ def warmup_range(config: Tuple[int, int, int]): """Generate a warmup range. Start from bmin and multiply by 2 until you reach bstep. - Then, increase the values in the range by the value of bstep until you reach bmax. + Then, increase the values in the range by the value of bstep until you + reach bmax. Example: bmin = 2, bstep = 32, bmax = 64 @@ -82,7 +83,9 @@ def warmup_range(config: Tuple[int, int, int]): => return ramp_up + stable => (2, 4, 8, 16, 32, 64) """ bmin, bstep, bmax = config - assert bmin <= bmax, "Min. batch size cannot be greater than max. batch size. If you want to skip warmup, set VLLM_SKIP_WARMUP=true" + assert bmin <= bmax, ("Min. batch size cannot be greater than max. " + "batch size. 
If you want to skip warmup, " + "set VLLM_SKIP_WARMUP=true") base = itertools.repeat(2) ramp_up_acc = itertools.accumulate(base, func=operator.mul, initial=bmin) ramp_up_tw = itertools.takewhile(lambda x: x < bstep and x <= bmax, \ @@ -812,9 +815,9 @@ def _prepare_decode( def prepare_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> TModelInputForHPU: + ) -> Tuple[TModelInputForHPU, SamplingMetadata]: if len(seq_group_metadata_list) == 0: - return self._model_input_cls() + return self._model_input_cls(), None input_tokens = None input_positions = None @@ -950,16 +953,17 @@ def prepare_input_tensors( attn_metadata = prefill_attn_metadata if \ prefill_attn_metadata is not None else decode_attn_metadata - return self._model_input_cls(input_tokens=input_tokens, - seq_lens=seq_lens, - query_lens=query_lens, - input_positions=input_positions, - attn_metadata=attn_metadata, - lora_requests=lora_requests, - lora_mapping=lora_mapping, - multi_modal_kwargs=multi_modal_input, - real_batch_size=real_batch_size, - batch_size_padded=batch_size_padded) + return self._model_input_cls( + input_tokens=input_tokens, + seq_lens=seq_lens, + query_lens=query_lens, + input_positions=input_positions, + attn_metadata=attn_metadata, + lora_requests=lora_requests, + lora_mapping=lora_mapping, + multi_modal_kwargs=multi_modal_input, + real_batch_size=real_batch_size, + batch_size_padded=batch_size_padded), sampling_metadata def _seq_len(self, attn_metadata): if attn_metadata.num_prefills != 0: @@ -1277,10 +1281,8 @@ def prepare_model_input( assert seq_group_metadata_list is not None self.profiler_counter_helper.capture_seq_group_metadata_stats( seq_group_metadata_list=seq_group_metadata_list) - model_input = self.prepare_input_tensors(seq_group_metadata_list) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, model_input.seq_lens, - model_input.query_lens, self.device, self.pin_memory) + model_input, sampling_metadata = self.prepare_input_tensors( + seq_group_metadata_list) assert model_input.attn_metadata is not None is_prompt = model_input.attn_metadata.is_prompt From e911fd8270e67facd58c36fe0b0414779699b949 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 16 Jul 2024 12:28:46 +0300 Subject: [PATCH 098/341] bump ray version for hpu --- requirements-hpu.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 339fe989bdb7a..e0f03c8464c7b 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -2,7 +2,7 @@ -r requirements-common.txt # Dependencies for HPU code -ray == 2.23.0 +ray == 2.32.0 triton pandas tabulate From bf349c58b25e82877e2505d05e75bdbf431240a1 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 18 Jul 2024 15:23:45 +0300 Subject: [PATCH 099/341] split k scale and v scale in habana attn --- vllm/attention/backends/habana_attn.py | 6 ++++-- vllm/attention/ops/habana_paged_attn.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 2f620a8e98947..33b6e2e538b13 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -171,7 +171,8 @@ def forward( value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: HabanaAttentionMetadata, - kv_scale: float = 1.0, + k_scale: float = 1.0, + v_scale: float = 1.0, attn_type: AttentionType = AttentionType.DECODER, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. 
@@ -253,7 +254,8 @@ def forward( output = HabanaPagedAttention.forward_decode( query, key_cache, value_cache, attn_metadata.block_tables, attn_metadata.seq_lens_tensor, self.kv_cache_dtype, - self.num_kv_heads, self.scale, self.position_bias, kv_scale) + self.num_kv_heads, self.scale, self.position_bias, k_scale, + v_scale) # Reshape the output tensor. return output.view(batch_size, seq_len, hidden_size) diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py index ed47b906168e5..7dd701c7a0cdf 100644 --- a/vllm/attention/ops/habana_paged_attn.py +++ b/vllm/attention/ops/habana_paged_attn.py @@ -73,7 +73,8 @@ def forward_decode( num_kv_heads: int, scale: float, alibi_slopes: Optional[torch.Tensor], - kv_scale: float, + k_scale: float, + v_scale: float, ) -> torch.Tensor: block_size = value_cache.shape[1] return ops.paged_attention_v1( From 8e231a58fcc89a1b82254469ad39b265a50374bd Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 29 Jul 2024 15:58:03 +0200 Subject: [PATCH 100/341] Add workaround for RuntimeError: Invalid inputs for scatter_nd_onnx (#107) --- vllm/hpu/cache_ops.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index d28a47271c6ac..14824945aa53a 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -16,12 +16,31 @@ def reshape_and_cache(key, slot_mapping, dtype, is_prompt=False): + num_blocks = key_cache.size(0) block_size = key_cache.size(1) slot_mapping = slot_mapping.flatten() indices = torch.div(slot_mapping, block_size, rounding_mode="floor") offsets = torch.fmod(slot_mapping, block_size) - key_cache.index_put_((indices, offsets), key) - value_cache.index_put_((indices, offsets), value) + num_slots_requested = slot_mapping.size(0) + num_slots_available = num_blocks * block_size + # NOTE(kzawora): HPU PT bridge crashes with + # RuntimeError: Invalid inputs for scatter_nd_onnx + # on index_put when num_slots_requested > num_slots_available. + # This case might occur when we have little kv cache blocks and + # lots of padding, or are doing warmup. + # This loop is a workaround for this issue. Please remove it + # once key_cache.index_put_(indices, offsets), key) works. 
+ num_kv_cache_passes = torch.div(num_slots_requested, + num_slots_available).ceil().int().item() + for i in range(num_kv_cache_passes): + start_idx = i * num_slots_available + end_idx = (i + 1) * num_slots_available + key_cache.index_put_( + (indices[start_idx:end_idx], offsets[start_idx:end_idx]), + key[start_idx:end_idx]) + value_cache.index_put_( + (indices[start_idx:end_idx], offsets[start_idx:end_idx]), + value[start_idx:end_idx]) def swap_blocks(src, dst, block_mapping): From f7dc5545dc994c0d3a37a9c4eb33190f6ac45018 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 29 Jul 2024 15:58:15 +0200 Subject: [PATCH 101/341] Refactor forward_hpu of RMSNorm (#128) --- vllm/model_executor/layers/layernorm.py | 52 +++++++++---------------- 1 file changed, 19 insertions(+), 33 deletions(-) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 67cef1b47f3bf..f1b7a73d22d52 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -6,13 +6,15 @@ from vllm.model_executor.custom_op import CustomOp from vllm.utils import is_hpu - +from vllm.logger import init_logger +logger = init_logger(__name__) if is_hpu(): try: - from habana_frameworks.torch.hpex.normalization import FusedRMSNorm + from habana_frameworks.torch.hpex.normalization import FusedRMSNorm as HPUFusedRMSNorm except ImportError: - print("Not using HPU fused kernel for RMSNorm") - FusedRMSNorm = None + logger.warning("Could not import HPU FusedRMSNorm kernel. " + "vLLM will use forward_native implementation of RMSNorm.") + HPUFusedRMSNorm = None class RMSNorm(CustomOp): @@ -80,37 +82,21 @@ def forward_hpu( x: torch.Tensor, residual: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - from vllm._ipex_ops import ipex_ops as ops - + if HPUFusedRMSNorm is None: + return self.forward_native(x, residual) if residual is not None: - if x.device.type == "hpu" and FusedRMSNorm: - orig_dtype = x.dtype - orig_shape = x.shape - residual += x.view(residual.shape) - # Note: FusedRMSNorm requires 3D tensors as inputs - x = FusedRMSNorm.apply(residual.float(), self.weight.float(), - self.variance_epsilon) - return x.to(orig_dtype).view(orig_shape), residual - ops.fused_add_rms_norm( - x, - residual, - self.weight.data, - self.variance_epsilon, - ) - return x, residual - if x.device.type == "hpu" and FusedRMSNorm: orig_dtype = x.dtype - x = FusedRMSNorm.apply(x.float(), self.weight.float(), - self.variance_epsilon) - return x.to(orig_dtype) - out = torch.empty_like(x) - ops.rms_norm( - out, - x, - self.weight.data, - self.variance_epsilon, - ) - return out + orig_shape = x.shape + residual += x.view(residual.shape) + # Note: HPUFusedRMSNorm requires 3D tensors as inputs + x = HPUFusedRMSNorm.apply(residual.float(), self.weight.float(), + self.variance_epsilon) + return x.to(orig_dtype).view(orig_shape), residual + + orig_dtype = x.dtype + x = HPUFusedRMSNorm.apply(x.float(), self.weight.float(), + self.variance_epsilon) + return x.to(orig_dtype) def forward_xpu( self, From 19993b7490a7c26aa5640b4f50607158754ddda0 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 29 Jul 2024 15:58:29 +0200 Subject: [PATCH 102/341] Refactor & re-enable HPU RoPE for Gaudi1 (#129) * Re-enable FusedRoPE for Gaudi1 * add fallback impl of rope --- vllm/hpu/rotary_embed.py | 102 +++++------------- .../model_executor/layers/rotary_embedding.py | 8 +- 2 files changed, 33 insertions(+), 77 deletions(-) diff --git a/vllm/hpu/rotary_embed.py 
b/vllm/hpu/rotary_embed.py index 26b19e8258285..8bc93cdf5c444 100644 --- a/vllm/hpu/rotary_embed.py +++ b/vllm/hpu/rotary_embed.py @@ -5,80 +5,25 @@ # LICENSE file in the root directory of this source tree. ############################################################################### -import habana_frameworks.torch.utils.experimental as htexp import torch import torch.nn as nn +from vllm.utils import is_hpu +from vllm.logger import init_logger +logger = init_logger(__name__) -def get_device_type(): - return htexp._get_device_type() - - -def is_gaudi1(): - return get_device_type() == htexp.synDeviceType.synDeviceGaudi - - -def is_gaudi2(): - return get_device_type() == htexp.synDeviceType.synDeviceGaudi2 - - -def is_gaudi3(): - return get_device_type() == htexp.synDeviceType.synDeviceGaudi3 - - -# TODO: remove this workaround when FusedRoPE properly works on Gaudi -if not is_gaudi1() and (is_gaudi2() or is_gaudi3()): +if is_hpu(): try: from habana_frameworks.torch.hpex.kernels import ( RotaryPosEmbeddingHelperV1 as FusedRoPE) except ImportError: - print("Not using HPU fused kernel for apply_rotary_pos_emb") - FusedRoPE = None + logger.warning("Could not import HPU FusedRoPE kernel. " + "vLLM will use forward_native implementation of RoPE.") + FusedRoPE = None else: FusedRoPE = None -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and - key tensors. For example, this can be used to pass offsetted - position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to - unsqueeze cos[position_ids] and sin[position_ids] so that they can - be properly broadcasted to the dimensions of q and k. For example, - note that cos[position_ids] and sin[position_ids] have the shape - [batch_size, seq_len, head_dim]. Then, if q and k have the shape - [batch_size, heads, seq_len, head_dim], then setting - unsqueeze_dim=1 makes cos[position_ids] and sin[position_ids] - broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set - unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated - using the Rotary Position Embedding. 
- """ - cos = cos[position_ids] #.unsqueeze(unsqueeze_dim) - sin = sin[position_ids] #.unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - class HpuRotaryEmbedding(nn.Module): def __init__(self, @@ -87,7 +32,8 @@ def __init__(self, max_position_embeddings=2048, base=10000, is_neox_style=None, - device='hpu'): + device='hpu', + RoPEFallback=None): super().__init__() self.head_size = head_size @@ -102,6 +48,14 @@ def __init__(self, self._set_cos_sin_cache(seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()) + if FusedRoPE is None: + assert RoPEFallback is not None, "HPU FusedRoPE kernel could not be imported, and fallback RoPE implementation was not provided!" + self.fallback_impl = RoPEFallback(head_size, + rotary_dim, + max_position_embeddings, + base, + is_neox_style, + dtype=torch.get_default_dtype()) def _set_cos_sin_cache(self, seq_len, device, dtype): self.max_seq_len_cached = seq_len @@ -122,6 +76,8 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor): + if FusedRoPE is None: + return self.fallback_impl(positions, query, key) if query.dim() == 2: query = query.unsqueeze(0) if key.dim() == 2: @@ -141,19 +97,15 @@ def forward(self, positions: torch.Tensor, query: torch.Tensor, self.head_size)) key = key.reshape((key.shape[0], key.shape[1], key.shape[2] // self.head_size, self.head_size)) - if query.device.type == "hpu" and FusedRoPE: - if len(positions[0]) == 1: - cos = self.cos_cached[positions].unsqueeze(2).to( - dtype=query.dtype) - sin = self.sin_cached[positions].unsqueeze(2).to( - dtype=query.dtype) - else: - cos = cos[positions].unsqueeze(2) - sin = sin[positions].unsqueeze(2) - query, key = FusedRoPE.apply(query, cos, sin, - 0), FusedRoPE.apply(key, cos, sin, 0) + + if len(positions[0]) == 1: + cos = self.cos_cached[positions].unsqueeze(2).to(dtype=query.dtype) + sin = self.sin_cached[positions].unsqueeze(2).to(dtype=query.dtype) else: - query, key = apply_rotary_pos_emb(query, key, cos, sin, positions) + cos = cos[positions].unsqueeze(2) + sin = sin[positions].unsqueeze(2) + query, key = FusedRoPE.apply(query, cos, sin, + 0), FusedRoPE.apply(key, cos, sin, 0) return query.reshape( (query.shape[0], query.shape[1], query.shape[2] * query.shape[3])), key.reshape( diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index d706c70c82374..e7c97a6cf75cc 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -765,8 +765,12 @@ def get_rope( return _ROPE_DICT[key] if rope_scaling is None: if is_hpu(): - rotary_emb = HpuRotaryEmbedding(head_size, rotary_dim, - max_position, base, is_neox_style) + rotary_emb = HpuRotaryEmbedding(head_size, + rotary_dim, + max_position, + base, + is_neox_style, + RoPEFallback=RotaryEmbedding) else: rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base, is_neox_style, dtype) From 03e3ce38d1fdef86fad30c5ee1bed27fce22a842 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 29 Jul 2024 16:45:32 +0200 Subject: [PATCH 103/341] formatting fixes (#132) --- vllm/hpu/rotary_embed.py | 7 +++++-- vllm/model_executor/layers/layernorm.py | 17 ++++++++++------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py index 8bc93cdf5c444..e44bfa2f6210c 
100644 --- a/vllm/hpu/rotary_embed.py +++ b/vllm/hpu/rotary_embed.py @@ -7,8 +7,9 @@ import torch import torch.nn as nn -from vllm.utils import is_hpu + from vllm.logger import init_logger +from vllm.utils import is_hpu logger = init_logger(__name__) @@ -49,7 +50,9 @@ def __init__(self, device=self.inv_freq.device, dtype=torch.get_default_dtype()) if FusedRoPE is None: - assert RoPEFallback is not None, "HPU FusedRoPE kernel could not be imported, and fallback RoPE implementation was not provided!" + assert RoPEFallback is not None, ( + "HPU FusedRoPE kernel could not be imported, and " + "fallback RoPE implementation was not provided!") self.fallback_impl = RoPEFallback(head_size, rotary_dim, max_position_embeddings, diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index f1b7a73d22d52..e00cb9ca6e1ac 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -4,16 +4,19 @@ import torch import torch.nn as nn +from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp from vllm.utils import is_hpu -from vllm.logger import init_logger + logger = init_logger(__name__) if is_hpu(): try: - from habana_frameworks.torch.hpex.normalization import FusedRMSNorm as HPUFusedRMSNorm + from habana_frameworks.torch.hpex.normalization import ( + FusedRMSNorm as HPUFusedRMSNorm) except ImportError: - logger.warning("Could not import HPU FusedRMSNorm kernel. " - "vLLM will use forward_native implementation of RMSNorm.") + logger.warning( + "Could not import HPU FusedRMSNorm kernel. " + "vLLM will use forward_native implementation of RMSNorm.") HPUFusedRMSNorm = None @@ -90,12 +93,12 @@ def forward_hpu( residual += x.view(residual.shape) # Note: HPUFusedRMSNorm requires 3D tensors as inputs x = HPUFusedRMSNorm.apply(residual.float(), self.weight.float(), - self.variance_epsilon) + self.variance_epsilon) return x.to(orig_dtype).view(orig_shape), residual - + orig_dtype = x.dtype x = HPUFusedRMSNorm.apply(x.float(), self.weight.float(), - self.variance_epsilon) + self.variance_epsilon) return x.to(orig_dtype) def forward_xpu( From a0646da3d1685847a90c894429a9da3572cbd063 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 29 Jul 2024 17:40:34 +0200 Subject: [PATCH 104/341] Address upstream PR code review comments (#133) * formatting fixes * Upstream CR update --- .../getting_started/gaudi-installation.rst | 62 ++++++++++++------- .../model_executor/layers/logits_processor.py | 2 + .../layers/vocab_parallel_embedding.py | 5 +- vllm/worker/cache_engine.py | 5 +- 4 files changed, 45 insertions(+), 29 deletions(-) diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index 73b63b3f8d755..a9f3ebdf274f6 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -1,8 +1,7 @@ -vLLM with Intel® Gaudi® 2 AI Accelerators +vLLM with Intel® Gaudi® AI Accelerators ========================================= -This README provides instructions on running vLLM with Intel Gaudi -devices. +This README provides instructions on running vLLM with Intel Gaudi devices. Requirements and Installation ============================= @@ -13,17 +12,13 @@ to set up the environment. To achieve the best performance, please follow the methods outlined in the `Optimizing Training Platform Guide `__. -.. note:: - In this release (1.16.0), we are only targeting functionality - and accuracy. 
Performance will be improved in next releases. - Requirements ------------ - OS: Ubuntu 22.04 LTS - Python: 3.10 -- Intel Gaudi 2 accelerator -- Intel Gaudi software version 1.16.0 +- Intel Gaudi accelerator +- Intel Gaudi software version 1.16.0 or newer To verify that the Intel Gaudi software was correctly installed, run: @@ -49,20 +44,30 @@ Use the following commands to run a Docker image: .. code:: console - $ docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest - $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest + $ docker pull vault.habana.ai/gaudi-docker/1.16.2/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest + $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.16.2/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest -Build and Install vLLM-fork +Build and Install vLLM --------------------------- -To build and install vLLM-fork from source, run: +To build and install vLLM from source, run: + +.. code:: console + + $ git clone https://github.com/vllm-project/vllm.git + $ cd vllm + $ python setup.py develop + + +Currently, the latest features and performance optimizations are developed in Gaudi's `vLLM-fork `__ and we periodically upstream them to vLLM main repo. To install latest `HabanaAI/vLLM-fork `__, run the following: .. code:: console $ git clone https://github.com/HabanaAI/vllm-fork.git $ cd vllm-fork - # git checkout v0.4.2-Gaudi-1.16.0 - $ pip install -e . # This may take 5-10 minutes. + $ git checkout habana_main + $ python setup.py develop + Supported Features ================== @@ -72,13 +77,12 @@ Supported Features - Online inference via `OpenAI-Compatible Server `__ - HPU autodetection - no need to manually select device within vLLM -- Paged KV cache with algorithms enabled for Intel Gaudi 2 accelerators +- Paged KV cache with algorithms enabled for Intel Gaudi accelerators - Custom Intel Gaudi implementations of Paged Attention, KV cache ops, prefill attention, Root Mean Square Layer Normalization, Rotary Positional Encoding - Tensor parallelism support for multi-card inference -- Inference with `HPU - Graphs `__ +- Inference with `HPU Graphs `__ for accelerating low-batch latency and throughput Unsupported Features @@ -94,7 +98,7 @@ Supported Configurations ======================== The following configurations have been validated to be function with -Gaudi devices. Configurations that are not listed may or may not work. +Gaudi2 devices. Configurations that are not listed may or may not work. - `meta-llama/Llama-2-7b `__ on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 @@ -102,12 +106,24 @@ Gaudi devices. Configurations that are not listed may or may not work. 
- `meta-llama/Llama-2-7b-chat-hf `__ on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3-8B `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3-8B-Instruct `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling - `meta-llama/Llama-2-70b `__ - with tensor parallelism on 8x HPU, BF16 datatype with random or - greedy sampling + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling - `meta-llama/Llama-2-70b-chat-hf `__ - with tensor parallelism 8x HPU, BF16 datatype with random or greedy - sampling + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3-70B `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3-70B-Instruct `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `mistralai/Mistral-7B-Instruct-v0.3 `__ + on single HPU or with tensor parallelism on 2x HPU, BF16 datatype with random or greedy sampling +- `mistralai/Mixtral-8x7B-Instruct-v0.1 `__ + with tensor parallelism on 2x HPU, BF16 datatype with random or greedy sampling Performance Tips ================ diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index e87ecbe40fdca..3b4fc88a8ca51 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -93,6 +93,8 @@ def _prune_hidden_states( hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> torch.Tensor: + # NOTE(kzawora): This is needed for Gaudi - in some scenarios (warmup, + # profile_run) we might not have selected_token_indices, so we skip pruning. if sampling_metadata.selected_token_indices is not None: return hidden_states.index_select( 0, sampling_metadata.selected_token_indices) diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 7860ec511571b..6cf79d462bfe0 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -329,9 +329,10 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): # Copy the data. loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - # FIXME(kzawora): Weight copy with slicing bugs out on Gaudi here, so - # we're using a workaround. Remove this when fixed in HPU PT bridge. if is_hpu(): + # FIXME(kzawora): Weight copy with slicing bugs out on Gaudi here, + # so we're using a workaround. Remove this when fixed in + # HPU PT bridge. 
padded_weight = torch.cat([ loaded_weight, torch.zeros(param.shape[0] - loaded_weight.shape[0], diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index d2b1891c7e28c..93be2f4c321fe 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -6,12 +6,9 @@ from vllm.attention import get_attn_backend from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_hpu, +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_pin_memory_available) -if is_hpu(): - pass - logger = init_logger(__name__) From a642c0cfc4f59041830acda3aae93060d8dc5aff Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 29 Jul 2024 17:47:43 +0200 Subject: [PATCH 105/341] Whitespace fix (#134) * formatting fixes * Upstream CR update * whitespace fix --- vllm/model_executor/layers/logits_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 3b4fc88a8ca51..cce8f99af5a6c 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -93,7 +93,7 @@ def _prune_hidden_states( hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> torch.Tensor: - # NOTE(kzawora): This is needed for Gaudi - in some scenarios (warmup, + # NOTE(kzawora): This is needed for Gaudi - in some scenarios (warmup, # profile_run) we might not have selected_token_indices, so we skip pruning. if sampling_metadata.selected_token_indices is not None: return hidden_states.index_select( From 58236e7f745bdd2759ea06fe79f8beaab67bf1be Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 29 Jul 2024 20:14:51 +0300 Subject: [PATCH 106/341] formatting --- vllm/executor/ray_utils.py | 9 ++++----- vllm/model_executor/sampling_metadata.py | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index b52324648865a..507dc04f48123 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -3,7 +3,7 @@ from vllm.config import ParallelConfig from vllm.logger import init_logger from vllm.sequence import ExecuteModelRequest -from vllm.utils import get_ip, is_hip, is_tpu, is_hpu, is_xpu +from vllm.utils import get_ip, is_hip, is_hpu, is_tpu, is_xpu from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -87,18 +87,17 @@ def initialize_ray_cluster( ignore_reinit_error=True, num_gpus=parallel_config.world_size) else: - ray.init(address=ray_address, - ignore_reinit_error=True) + ray.init(address=ray_address, ignore_reinit_error=True) if parallel_config.placement_group: # Placement group is already set. 
return - device_str = "GPU" + device_str = "GPU" if is_tpu(): device_str = "TPU" elif is_hpu(): - device_str = "HPU" + device_str = "HPU" # Create placement group for worker processes current_placement_group = ray.util.get_current_placement_group() if current_placement_group: diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 98e77c22b3875..4687eb5c39fa9 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -8,8 +8,8 @@ from vllm.model_executor.layers.ops.sample import get_num_triton_sampler_splits from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import SequenceData, SequenceGroupMetadata -from vllm.utils import (async_tensor_h2d, is_pin_memory_available, - make_tensor_with_pad, maybe_expand_dim, is_hpu) +from vllm.utils import (async_tensor_h2d, is_hpu, is_pin_memory_available, + make_tensor_with_pad, maybe_expand_dim) _SAMPLING_EPS = 1e-5 _SEED_0_REPLACEMENT = 3403598558 From cc01748cd73fee1c912866f8c18b4d7e43d81a1b Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 29 Jul 2024 20:24:33 +0300 Subject: [PATCH 107/341] align to changes in make_tensor_with_pad --- vllm/worker/habana_model_runner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 15ac0035228cc..11a7630b3cebe 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -646,19 +646,19 @@ def _prepare_prompt( self.block_size) input_tokens = make_tensor_with_pad(input_tokens, - max_prompt_len, + max_len=max_prompt_len, pad=0, dtype=torch.long, device=self.device) input_positions = make_tensor_with_pad(input_positions, - max_prompt_len, + max_len=max_prompt_len, pad=0, dtype=torch.long, device=self.device) slot_mapping = make_tensor_with_pad(slot_mapping, - max_prompt_len, + max_len=max_prompt_len, pad=_PAD_SLOT_ID, dtype=torch.long, device=self.device) From bf86cb1c19b5fc1334bdb7bce86baf331871936d Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 29 Jul 2024 20:25:20 +0300 Subject: [PATCH 108/341] update ray_habana_executor --- vllm/executor/ray_habana_executor.py | 306 +++++++++++++++++---------- 1 file changed, 198 insertions(+), 108 deletions(-) diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index 96b08a4dd3895..a616e59b3be60 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -1,6 +1,5 @@ import asyncio import os -import pickle from collections import defaultdict from itertools import islice, repeat from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple @@ -11,7 +10,8 @@ from vllm.executor.ray_utils import RayWorkerWrapper, ray from vllm.logger import init_logger from vllm.sequence import ExecuteModelRequest, SamplerOutput -from vllm.utils import (error_on_invalid_device_count_status, +from vllm.utils import (_run_task_with_lock, + error_on_invalid_device_count_status, get_distributed_init_method, get_ip, get_open_port, get_vllm_instance_id, make_async) @@ -28,8 +28,31 @@ class RayHabanaExecutor(DistributedGPUExecutor): + uses_ray: bool = True + def _init_executor(self) -> None: - assert self.parallel_config.distributed_executor_backend == "ray" + self.forward_dag: Optional["ray.dag.CompiledDAG"] = None + # If the env var is set, it uses the Ray's compiled DAG API + # which optimizes the control plane overhead. 
+ # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. + # Currently, this requires USE_RAY_SPMD_WORKER=True. + self.use_ray_compiled_dag = envs.VLLM_USE_RAY_COMPILED_DAG + # If the env var is set, then we do not distinguish between the + # "driver worker" vs other workers. Also, the rank 0 worker will + # be executed in a remote Ray worker. Currently this requires + # USE_RAY_COMPILED_DAG=True. + self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER + if self.use_ray_compiled_dag: + assert self.use_ray_spmd_worker, ( + "VLLM_USE_RAY_COMPILED_DAG=1 requires " + "VLLM_USE_RAY_SPMD_WORKER=1") + if self.use_ray_spmd_worker: + # TODO: Support SPMD worker for non-DAG Ray executor. + assert self.use_ray_compiled_dag, ( + "VLLM_USE_RAY_SPMD_WORKER=1 requires " + "VLLM_USE_RAY_COMPILED_DAG=1") + + assert self.uses_ray placement_group = self.parallel_config.placement_group # Disable Ray usage stats collection. @@ -40,19 +63,19 @@ def _init_executor(self) -> None: # Create the parallel GPU workers. self._init_workers_ray(placement_group) - self.forward_dag = None - if USE_RAY_COMPILED_DAG: - self.forward_dag = self._compiled_ray_dag() + def _get_worker_wrapper_args(self) -> Dict[str, Any]: + worker_module_name = "vllm.worker.habana_worker" + worker_class_name = "HabanaWorker" + + return dict( + worker_module_name=worker_module_name, + worker_class_name=worker_class_name, + trust_remote_code=self.model_config.trust_remote_code, + ) def _init_workers_ray(self, placement_group: "PlacementGroup", **ray_remote_kwargs): - if (self.parallel_config.tensor_parallel_size == 1 - and self.parallel_config.pipeline_parallel_size == 1): - # For single GPU case, we use a ray worker with constrained memory. - num_gpus = 1 - else: - # Otherwise, the ray workers are allocated with a full GPU. - num_gpus = 1 + num_gpus = 1 # The driver dummy worker does not actually use any resources. # It holds the resource for the driver worker. @@ -62,6 +85,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # Create the workers. driver_ip = get_ip() + worker_wrapper_kwargs = self._get_worker_wrapper_args() for bundle_id, bundle in enumerate(placement_group.bundle_specs): if not bundle.get("HPU", 0): continue @@ -70,33 +94,30 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", placement_group_capture_child_tasks=True, placement_group_bundle_index=bundle_id, ) + worker = ray.remote( num_cpus=0, num_gpus=0, resources={'HPU': num_gpus}, scheduling_strategy=scheduling_strategy, **ray_remote_kwargs, - )(RayWorkerWrapper).remote( - worker_module_name="vllm.worker.habana_worker", - worker_class_name="HabanaWorker", - trust_remote_code=self.model_config.trust_remote_code, - ) + )(RayWorkerWrapper).remote(**worker_wrapper_kwargs) - worker_ip = ray.get(worker.get_node_ip.remote()) - if worker_ip == driver_ip and self.driver_dummy_worker is None: - # If the worker is on the same node as the driver, we use it - # as the resource holder for the driver process. - self.driver_dummy_worker = worker - self.driver_worker = RayWorkerWrapper( - worker_module_name="vllm.worker.habana_worker", - worker_class_name="HabanaWorker", - trust_remote_code=self.model_config.trust_remote_code, - ) - else: - # Else, added to the list of workers. 
+ if self.use_ray_spmd_worker: self.workers.append(worker) + else: + worker_ip = ray.get(worker.get_node_ip.remote()) + if worker_ip == driver_ip and self.driver_dummy_worker is None: + # If the worker is on the same node as the driver, we use it + # as the resource holder for the driver process. + self.driver_dummy_worker = worker + self.driver_worker = RayWorkerWrapper( + **worker_wrapper_kwargs) + else: + # Else, added to the list of workers. + self.workers.append(worker) - if self.driver_dummy_worker is None: + if not self.use_ray_spmd_worker and self.driver_dummy_worker is None: raise ValueError( "Ray does not allocate any GPUs on the driver node. Consider " "adjusting the Ray placement group or running the driver on a " @@ -106,11 +127,32 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", use_dummy_driver=True) - node_workers = defaultdict(list) - node_gpus = defaultdict(list) - - for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): - node_workers[node_id].append(i) + # the order in `worker_node_and_gpu_ids` does not necessarily match + # the machine boundaries. We need to make sure that workers in the + # same node are assigned consecutive ranks. + # examples: + # [('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [0]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [0]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [1]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [2]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [3]), ('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [1]), ('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [2]), ('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [3])] # noqa + + # initialize worker ranks with -1 (unassigned) + worker_ranks = [-1 for x in worker_node_and_gpu_ids] + current_rank = 0 + while -1 in worker_ranks: + # whenever we find an unassigned worker, find the node + index = worker_ranks.index(-1) + current_node_id = worker_node_and_gpu_ids[index][0] + # assign ranks to all workers in the same node + for i, (node_id, _) in enumerate(worker_node_and_gpu_ids): + if node_id == current_node_id: + worker_ranks[i] = current_rank + current_rank += 1 + # with the above example, worker_ranks will be [0, 4, 5, 6, 7, 1, 2, 3] + + node_workers = defaultdict(list) # node id -> list of worker ranks + node_gpus = defaultdict(list) # node id -> list of gpu ids + + for worker_rank, (node_id, gpu_ids) in zip(worker_ranks, + worker_node_and_gpu_ids): + node_workers[node_id].append(worker_rank) # `gpu_ids` can be a list of strings or integers. # convert them to integers for consistency. # NOTE: gpu_ids can be larger than 9 (e.g. 
16 GPUs), @@ -129,6 +171,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", VLLM_INSTANCE_ID, "VLLM_TRACE_FUNCTION": str(envs.VLLM_TRACE_FUNCTION), + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 'true' }, ) for (node_id, _) in worker_node_and_gpu_ids] self._run_workers("update_environment_variables", all_args=all_args_to_update_environment_variables) @@ -154,7 +197,8 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", local_rank=node_workers[node_id].index(rank), rank=rank, distributed_init_method=distributed_init_method, - ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids) + ) for rank, (node_id, + _) in zip(worker_ranks, worker_node_and_gpu_ids) ] self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) @@ -172,16 +216,14 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # broadcasted to. self.non_driver_workers: List[RayWorkerWrapper] = [] - for pp_rank in range(self.parallel_config.pipeline_parallel_size): - for tp_rank in range(self.parallel_config.tensor_parallel_size): - rank = (pp_rank * - self.parallel_config.tensor_parallel_size) + tp_rank - if rank == 0: - pass - elif rank % self.parallel_config.tensor_parallel_size == 0: - self.tp_driver_workers.append(self.workers[rank - 1]) - else: - self.non_driver_workers.append(self.workers[rank - 1]) + # Enforce rank order for correct rank to return final output. + for rank, worker in sorted(zip(worker_ranks[1:], self.workers)): + # We need to skip the driver worker, which we + # do by skipping worker_ranks[0] which is always 0. + if rank % self.parallel_config.tensor_parallel_size == 0: + self.tp_driver_workers.append(worker) + else: + self.non_driver_workers.append(worker) def _driver_execute_model( self, execute_model_req: Optional[ExecuteModelRequest] @@ -191,9 +233,23 @@ def _driver_execute_model( Passing None will cause the driver to stop the model execution loop running in each of the remote workers. """ + assert not self.use_ray_spmd_worker, ( + "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1") return self.driver_worker.execute_method("execute_model", execute_model_req) + def execute_model( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + if not self.use_ray_spmd_worker: + return super().execute_model(execute_model_req) + + if self.forward_dag is None: + self.forward_dag = self._compiled_ray_dag(enable_asyncio=False) + + outputs = ray.get(self.forward_dag.execute(execute_model_req)) + return outputs[0] + def _run_workers( self, method: str, @@ -203,7 +259,6 @@ def _run_workers( all_kwargs: Optional[List[Dict[str, Any]]] = None, use_dummy_driver: bool = False, max_concurrent_workers: Optional[int] = None, - use_ray_compiled_dag: bool = False, **kwargs, ) -> Any: """Runs the given method on all workers. Can be used in the following @@ -218,6 +273,10 @@ def _run_workers( - all_args/all_kwargs: args/kwargs for each worker are specified individually """ + if self.use_ray_spmd_worker: + assert not async_run_tensor_parallel_workers_only, ( + "async_run_tensor_parallel_workers_only is not supported for " + "spmd mode.") if max_concurrent_workers: raise NotImplementedError( @@ -226,99 +285,125 @@ def _run_workers( count = len(self.workers) if not \ async_run_tensor_parallel_workers_only \ else len(self.non_driver_workers) + # If using SPMD worker, all workers are the same, so we should execute + # the args on all workers. Otherwise, we skip the first worker's args + # because those args will go to the driver worker. 
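# Standalone sketch (toy node ids, assumed values rather than real Ray node
# hashes) of the rank-assignment loop shown above: workers that live on the
# same node receive consecutive ranks, regardless of the order Ray returned
# them in. The expected result matches the [0, 4, 5, 6, 7, 1, 2, 3] example
# in the comment.
worker_node_and_gpu_ids = [("node-a", [0]), ("node-b", [0]), ("node-b", [1]),
                           ("node-b", [2]), ("node-b", [3]), ("node-a", [1]),
                           ("node-a", [2]), ("node-a", [3])]

worker_ranks = [-1 for _ in worker_node_and_gpu_ids]
current_rank = 0
while -1 in worker_ranks:
    # find the first unassigned worker and its node
    index = worker_ranks.index(-1)
    current_node_id = worker_node_and_gpu_ids[index][0]
    # give consecutive ranks to every worker on that node
    for i, (node_id, _) in enumerate(worker_node_and_gpu_ids):
        if node_id == current_node_id:
            worker_ranks[i] = current_rank
            current_rank += 1

assert worker_ranks == [0, 4, 5, 6, 7, 1, 2, 3]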
+ first_worker_args_index: int = 0 if self.use_ray_spmd_worker else 1 all_worker_args = repeat(args, count) if all_args is None \ - else islice(all_args, 1, None) + else islice(all_args, first_worker_args_index, None) all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ - else islice(all_kwargs, 1, None) - - if use_ray_compiled_dag: - # Right now, compiled DAG can only accept a single - # input. TODO(sang): Fix it. - assert self.forward_dag is not None - output_channels = self.forward_dag.execute(1) - ray_worker_outputs = [] - else: - # Start the ray workers first. - ray_workers = self.workers - if async_run_tensor_parallel_workers_only: - ray_workers = self.non_driver_workers - ray_worker_outputs = [ - worker.execute_method.remote(method, *worker_args, - **worker_kwargs) - for (worker, worker_args, worker_kwargs - ) in zip(ray_workers, all_worker_args, all_worker_kwargs) - ] + else islice(all_kwargs, first_worker_args_index, None) + + # Start the ray workers first. + ray_workers = self.workers + if async_run_tensor_parallel_workers_only: + ray_workers = self.non_driver_workers + ray_worker_outputs = [ + worker.execute_method.remote(method, *worker_args, **worker_kwargs) + for (worker, worker_args, worker_kwargs + ) in zip(ray_workers, all_worker_args, all_worker_kwargs) + ] if async_run_tensor_parallel_workers_only: # Just return futures return ray_worker_outputs - driver_args = args if all_args is None else all_args[0] - driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] - - # Start the driver worker after all the ray workers. - if not use_dummy_driver: - driver_worker_output = self.driver_worker.execute_method( - method, *driver_args, **driver_kwargs) - else: - assert self.driver_dummy_worker is not None - driver_worker_output = ray.get( - self.driver_dummy_worker.execute_method.remote( - method, *driver_args, **driver_kwargs)) + driver_worker_output = [] + # In SPMD mode, the driver worker is the same as any other worker, + # so we only explicitly execute on the driver worker if using a + # non-SPMD worker class. + if not self.use_ray_spmd_worker: + driver_args = args if all_args is None else all_args[0] + driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] + + # Start the driver worker after all the ray workers. + if not use_dummy_driver: + driver_worker_output = [ + self.driver_worker.execute_method(method, *driver_args, + **driver_kwargs) + ] + else: + assert self.driver_dummy_worker is not None + driver_worker_output = [ + ray.get( + self.driver_dummy_worker.execute_method.remote( + method, *driver_args, **driver_kwargs)) + ] + # Get the results of the ray workers. if self.workers: - if use_ray_compiled_dag: - try: - ray_worker_outputs = [ - pickle.loads(chan.begin_read()) - for chan in output_channels - ] - finally: - # Has to call end_read in order to reuse the DAG. 
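# Minimal standalone sketch (toy args, not part of the patch) of what the
# first_worker_args_index switch above changes: in non-SPMD mode the first
# per-worker entry is held back for the driver worker, in SPMD mode every
# entry goes to a remote Ray worker.
from itertools import islice

all_args = [("rank0",), ("rank1",), ("rank2",), ("rank3",)]

# non-SPMD: skip index 0, it belongs to the driver worker
non_spmd_worker_args = list(islice(all_args, 1, None))   # rank1..rank3
# SPMD: rank 0 is also a remote worker, so nothing is skipped
spmd_worker_args = list(islice(all_args, 0, None))       # rank0..rank3

assert len(non_spmd_worker_args) == 3 and len(spmd_worker_args) == 4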
- for chan in output_channels: - chan.end_read() - else: - ray_worker_outputs = ray.get(ray_worker_outputs) + ray_worker_outputs = ray.get(ray_worker_outputs) - return [driver_worker_output] + ray_worker_outputs + return driver_worker_output + ray_worker_outputs def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: """Wait for futures returned from _run_workers() with async_run_remote_workers_only to complete.""" ray.get(parallel_worker_tasks) - def _compiled_ray_dag(self): + def _compiled_ray_dag(self, enable_asyncio: bool): import pkg_resources - required_version = "2.9" - current_version = pkg_resources.get_distribution("ray").version + from packaging import version + + required_version = version.parse("2.32") + current_version = version.parse( + pkg_resources.get_distribution("ray").version) if current_version < required_version: raise ValueError(f"Ray version {required_version} or greater is " f"required, but found {current_version}") from ray.dag import InputNode, MultiOutputNode - assert self.parallel_config.distributed_executor_backend == "ray" + assert self.parallel_config.use_ray # Right now, compiled DAG requires at least 1 arg. We send # a dummy value for now. It will be fixed soon. with InputNode() as input_data: forward_dag = MultiOutputNode([ - worker.execute_model_compiled_dag_remote. - bind( # type: ignore[attr-defined] + worker.execute_model_spmd.bind( # type: ignore[attr-defined] input_data) for worker in self.workers ]) - return forward_dag.experimental_compile() + return forward_dag.experimental_compile(enable_asyncio=enable_asyncio) + + def __del__(self): + if self.forward_dag is not None: + self.forward_dag.teardown() + import ray + for worker in self.workers: + ray.kill(worker) class RayHabanaExecutorAsync(RayHabanaExecutor, DistributedGPUExecutorAsync): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.driver_exec_method = make_async(self.driver_worker.execute_method) + self.pp_locks: Optional[List[asyncio.Lock]] = None + self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER + if not self.use_ray_compiled_dag: + self.driver_exec_method = make_async( + self.driver_worker.execute_method) + + async def execute_model_async( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + if not self.use_ray_spmd_worker: + return await super().execute_model_async(execute_model_req) + + if self.forward_dag is None: + self.forward_dag = self._compiled_ray_dag(enable_asyncio=True) + + dag_future = await self.forward_dag.execute_async(execute_model_req) + outputs = await dag_future + return outputs[0] async def _driver_execute_model_async( self, execute_model_req: Optional[ExecuteModelRequest] = None ) -> List[SamplerOutput]: + assert not self.use_ray_spmd_worker, ( + "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1") + if not self.tp_driver_workers: + return await self.driver_exec_method("execute_model", + execute_model_req) if self.pp_locks is None: # This locks each pipeline parallel stage so multiple virtual # engines can't execute on the same stage at the same time @@ -329,15 +414,11 @@ async def _driver_execute_model_async( for _ in range(self.parallel_config.pipeline_parallel_size) ] - async def _run_task_with_lock(task, lock, *args, **kwargs): - async with lock: - return await task(*args, **kwargs) - - tasks = [] - tasks.append( + tasks = [ asyncio.create_task( _run_task_with_lock(self.driver_exec_method, self.pp_locks[0], - "execute_model", execute_model_req))) + "execute_model", 
execute_model_req)) + ] for pp_rank, driver_worker in enumerate(self.tp_driver_workers, start=1): tasks.append( @@ -352,8 +433,17 @@ async def _run_task_with_lock(task, lock, *args, **kwargs): return results[-1] async def _start_worker_execution_loop(self): + assert not self.use_ray_spmd_worker, ( + "worker loop is disabled for VLLM_USE_RAY_SPMD_WORKER=1") coros = [ worker.execute_method.remote("start_worker_execution_loop") for worker in self.non_driver_workers ] return await asyncio.gather(*coros) + + def __del__(self): + if self.forward_dag is not None: + self.forward_dag.teardown() + import ray + for worker in self.workers: + ray.kill(worker) From cf7cf029def29c40bc470440e7a1e27c5e18ddc7 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 30 Jul 2024 10:26:33 +0200 Subject: [PATCH 109/341] Add torch.compile support (#48) * Remove usage of wrap_in_hpu_graph in PT eager * Add torch.compile support * Update habana_model_runner.py * format.sh pass * do not warmup graphs in non-lazy backend --- vllm/worker/habana_model_runner.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 15ac0035228cc..57a3cf18658d9 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -149,8 +149,12 @@ def align_workers(value, op): class HpuModelAdapter(): - def __init__(self, model): + def __init__(self, model, enforce_eager): self.model = model + if not htorch.utils.internal.is_lazy() and not enforce_eager: + self.model = torch.compile(self.model, + backend='hpu_backend', + dynamic=False) def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): @@ -428,7 +432,8 @@ def load_model(self) -> None: # FIXME: Running with disable_tensor_cache=True causes # RuntimeErrors. 
This needs to be debugged with HabanaMemoryProfiler() as m_wrap: - self.model = _maybe_wrap_in_hpu_graph(self.model) + self.model = _maybe_wrap_in_hpu_graph( + self.model, enforce_eager=self.enforce_eager) msg = f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}" logger.info(msg) @@ -1118,7 +1123,7 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: self.warmup_all_buckets(self.prompt_buckets, True, kv_caches) self.warmup_all_buckets(self.decode_buckets, False, kv_caches) - if not self.enforce_eager: + if not self.enforce_eager and htorch.utils.internal.is_lazy(): mem_margin = 1.0 - float( os.environ.get('VLLM_GRAPH_MEM_MARGIN', '0.02')) free_mem = \ @@ -1150,9 +1155,11 @@ def vocab_size(self) -> int: return self.model_config.get_vocab_size() -def _maybe_wrap_in_hpu_graph(model): +def _maybe_wrap_in_hpu_graph(*args, **kwargs): return htorch.hpu.wrap_in_hpu_graph(HpuModelAdapter( - model)) if htorch.utils.internal.is_lazy() else HpuModelAdapter(model) + *args, ** + kwargs)) if htorch.utils.internal.is_lazy() else HpuModelAdapter( + *args, **kwargs) class HabanaProfilerCounterHelper(): From 2f675f32170685e2f526ec07413bfb05bf39eaf6 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 30 Jul 2024 15:35:53 +0300 Subject: [PATCH 110/341] use_ray fix --- vllm/executor/habana_executor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index 8750c3b00dd9e..f5cf26b687053 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -20,6 +20,8 @@ class HabanaExecutor(ExecutorBase): + uses_ray: bool = False + def _init_executor(self) -> None: """Initialize the worker and load the model.""" self._init_worker() From 16af1c7a8dbdafaab49c9e922e1f7a8ffdf1b89b Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 30 Jul 2024 15:36:06 +0300 Subject: [PATCH 111/341] formatting fixes --- vllm/executor/ray_habana_executor.py | 1 - vllm/model_executor/layers/layernorm.py | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index a616e59b3be60..9e0a89cbeb8aa 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -171,7 +171,6 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", VLLM_INSTANCE_ID, "VLLM_TRACE_FUNCTION": str(envs.VLLM_TRACE_FUNCTION), - "PT_HPU_ENABLE_LAZY_COLLECTIVES": 'true' }, ) for (node_id, _) in worker_node_and_gpu_ids] self._run_workers("update_environment_variables", all_args=all_args_to_update_environment_variables) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index e00cb9ca6e1ac..01429d2fcbd17 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -11,8 +11,9 @@ logger = init_logger(__name__) if is_hpu(): try: - from habana_frameworks.torch.hpex.normalization import ( - FusedRMSNorm as HPUFusedRMSNorm) + from habana_frameworks.torch.hpex.normalization import (FusedRMSNorm as + HPUFusedRMSNorm + ) except ImportError: logger.warning( "Could not import HPU FusedRMSNorm kernel. 
" From e5e59a121eb87039c75f661e9557171dcf7f638e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 30 Jul 2024 15:36:20 +0300 Subject: [PATCH 112/341] make block size 128 default on gaudi --- vllm/engine/arg_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cb0148cce0fe9..e4b223a1b505f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -12,7 +12,7 @@ from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -from vllm.utils import FlexibleArgumentParser +from vllm.utils import FlexibleArgumentParser, is_hpu if TYPE_CHECKING: from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import ( @@ -52,7 +52,9 @@ class EngineArgs: pipeline_parallel_size: int = 1 tensor_parallel_size: int = 1 max_parallel_loading_workers: Optional[int] = None - block_size: int = 16 + # NOTE(kzawora): default block size for Gaudi should be 128 + # smaller sizes still work, but very inefficiently + block_size: int = 16 if not is_hpu() else 128 enable_prefix_caching: bool = False disable_sliding_window: bool = False use_v2_block_manager: bool = False From 030a2cb8446e9282c56abf6f37adc02a8f230474 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 30 Jul 2024 16:21:47 +0300 Subject: [PATCH 113/341] Add constraints for HPU UnquantizedFusedMoEMethod --- vllm/model_executor/layers/fused_moe/layer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index fcffe7cc76db7..b49bf40d4746e 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -108,6 +108,10 @@ def forward_hpu(self, x: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, router_logits: torch.Tensor, top_k: int, renormalize: bool, use_grouped_topk: bool, num_expert_group: Optional[int], topk_group: Optional[int]): + assert not use_grouped_topk, 'use_grouped_topk must be False on HPU' + assert num_expert_group is None, ('num_expert_group is ' + 'not supported on HPU') + assert topk_group is None, 'topk_group is not supported on HPU' return static_fused_moe(x, w1, w2, router_logits, top_k) def forward_cpu(self, *args, **kwargs): From 2ccf56e468fdd6576425ea0fd8f0ee9fa8c80619 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 31 Jul 2024 10:28:02 +0200 Subject: [PATCH 114/341] Remove redundant torch.device (#139) --- vllm/model_executor/model_loader/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index cbe9ebf35f4dd..bbe49655020da 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -276,7 +276,7 @@ def load_model(self, *, model_config: ModelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module: with set_default_torch_dtype(model_config.dtype): - with torch.device(torch.device(device_config.device)): + with torch.device(device_config.device): model = _initialize_model(model_config, self.load_config, lora_config, multimodal_config, cache_config, scheduler_config) From bc1af91125e763cb84d5ecfb5387aae0631c7bde Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 31 Jul 2024 10:28:16 +0200 Subject: [PATCH 115/341] Add functools.wraps decorator to with_mark_steps (#138) * Add 
functools.wraps decorator to with_mark_steps * i cant use functools.wraps properly it seems --- vllm/hpu/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/hpu/utils.py b/vllm/hpu/utils.py index 06f3690aded8b..b7b435c50c295 100644 --- a/vllm/hpu/utils.py +++ b/vllm/hpu/utils.py @@ -5,11 +5,14 @@ # LICENSE file in the root directory of this source tree. ############################################################################### +from functools import wraps + import habana_frameworks.torch as htorch def with_mark_steps(fn): + @wraps(fn) def wrapped(*args, **kwargs): htorch.core.mark_step() result = fn(*args, **kwargs) From 5c7187d8e726c72689abc26a59a6cece757c3e22 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 31 Jul 2024 10:30:02 +0200 Subject: [PATCH 116/341] Add HPU platform and HpuCommunicator for TP (#136) * Add HPU platform and HpuCommunicator for TP * remove print * whoopsie I forgot to add vllm/platforms/__init__.py * format.sh --- vllm/distributed/communication_op.py | 10 ---- .../device_communicators/hpu_communicator.py | 48 +++++++++++++++++++ vllm/distributed/parallel_state.py | 20 ++++++++ vllm/platforms/__init__.py | 5 +- vllm/platforms/hpu.py | 17 +++++++ vllm/platforms/interface.py | 4 ++ 6 files changed, 93 insertions(+), 11 deletions(-) create mode 100644 vllm/distributed/device_communicators/hpu_communicator.py create mode 100644 vllm/platforms/hpu.py diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index 63c159fce3d71..32394a07b00b9 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -3,21 +3,11 @@ import torch import torch.distributed -from vllm.utils import is_hpu - from .parallel_state import get_tp_group -if is_hpu(): - import habana_frameworks.torch as htorch - def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: """All-reduce the input tensor across model parallel group.""" - if is_hpu(): - # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge - # occurring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used - # (which is required for tensor parallel HPUGraph inference) - htorch.core.mark_step() return get_tp_group().all_reduce(input_) diff --git a/vllm/distributed/device_communicators/hpu_communicator.py b/vllm/distributed/device_communicators/hpu_communicator.py new file mode 100644 index 0000000000000..cc9b19ce022b5 --- /dev/null +++ b/vllm/distributed/device_communicators/hpu_communicator.py @@ -0,0 +1,48 @@ +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +from vllm.platforms import current_platform + +if current_platform.is_hpu(): + import habana_frameworks.torch as htorch # noqa: F401 + + +class HpuCommunicator: + + def __init__(self, group: ProcessGroup): + if not current_platform.is_hpu(): + self.disabled = True + return + self.disabled = False + self.group = group + self.world_size = dist.get_world_size(self.group) + + def all_reduce(self, x: torch.Tensor) -> torch.Tensor: + # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge + # occurring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used + # (which is required for tensor parallel HPUGraph inference) + htorch.core.mark_step() + dist.all_reduce(x, group=self.group) + return x + + def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor: + world_size = self.world_size + if dim < 0: + # Convert negative dim to positive. 
+ dim += x.dim() + input_size = x.size() + # Allocate output tensor. + output_tensor = torch.empty((world_size, ) + input_size, + dtype=x.dtype, + device=x.device) + # All-gather. + htorch.core.mark_step() + dist.all_gather_into_tensor(output_tensor, x, group=self.group) + # Reshape + output_tensor = output_tensor.movedim(0, dim) + output_tensor = output_tensor.reshape(input_size[:dim] + + (world_size * + input_size[dim], ) + + input_size[dim + 1:]) + return output_tensor diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 4116b1729d188..4971216d450d1 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -134,6 +134,7 @@ def __init__( use_pynccl: bool, use_custom_allreduce: bool, use_tpu_communicator: bool, + use_hpu_communicator: bool, use_message_queue_broadcaster: bool = False, ): @@ -166,6 +167,7 @@ def __init__( self.use_pynccl = use_pynccl self.use_custom_allreduce = use_custom_allreduce self.use_tpu_communicator = use_tpu_communicator + self.use_hpu_communicator = use_hpu_communicator # lazy import to avoid documentation build error from vllm.distributed.device_communicators.custom_all_reduce import ( @@ -198,6 +200,12 @@ def __init__( if use_tpu_communicator and self.world_size > 1: self.tpu_communicator = TpuCommunicator(group=self.cpu_group) + from vllm.distributed.device_communicators.hpu_communicator import ( + HpuCommunicator) + self.hpu_communicator: Optional[HpuCommunicator] + if use_hpu_communicator and self.world_size > 1: + self.hpu_communicator = HpuCommunicator(group=self.device_group) + from vllm.distributed.device_communicators.shm_broadcast import ( MessageQueue) self.mq_broadcaster: Optional[MessageQueue] = None @@ -303,6 +311,11 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: if tpu_comm is not None and not tpu_comm.disabled: return tpu_comm.all_reduce(input_) + # For HPUs, use HPU communicator. + hpu_comm = self.hpu_communicator + if hpu_comm is not None and not hpu_comm.disabled: + return hpu_comm.all_reduce(input_) + if ca_comm is not None: out = ca_comm.custom_all_reduce(input_) if out is not None: @@ -330,6 +343,11 @@ def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: if tpu_comm is not None and not tpu_comm.disabled: return tpu_comm.all_gather(input_, dim) + # For HPUs, use HPU communicator. + hpu_comm = self.hpu_communicator + if hpu_comm is not None and not hpu_comm.disabled: + return hpu_comm.all_gather(input_, dim) + if dim < 0: # Convert negative dim to positive. 
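# Shape-only sketch (standalone, no real process group; the per-rank tensors
# are just copies of x here) of the gather-and-reshape performed by
# HpuCommunicator.all_gather: gathering along dim=-1 across world_size ranks
# multiplies that dimension by world_size.
import torch

world_size, dim = 2, -1
x = torch.randn(4, 8)
if dim < 0:
    dim += x.dim()                                    # -1 -> 1, as in the code
input_size = x.size()

# stand-in for dist.all_gather_into_tensor: every "rank" contributes x
output_tensor = torch.stack([x] * world_size, dim=0)  # (world_size, 4, 8)

output_tensor = output_tensor.movedim(0, dim)          # (4, world_size, 8)
output_tensor = output_tensor.reshape(input_size[:dim] +
                                      (world_size * input_size[dim], ) +
                                      input_size[dim + 1:])
assert output_tensor.shape == (4, 16)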
dim += input_.dim() @@ -748,6 +766,7 @@ def init_world_group(ranks: List[int], local_rank: int, use_pynccl=False, use_custom_allreduce=False, use_tpu_communicator=False, + use_hpu_communicator=False, ) @@ -767,6 +786,7 @@ def init_model_parallel_group( use_pynccl=True, use_custom_allreduce=use_custom_allreduce, use_tpu_communicator=True, + use_hpu_communicator=True, use_message_queue_broadcaster=use_message_queue_broadcaster, ) diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index eac917786bd6b..8ca674af8d479 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -2,7 +2,7 @@ import torch -from vllm.utils import is_tpu +from vllm.utils import is_hpu, is_tpu from .interface import Platform, PlatformEnum, UnspecifiedPlatform @@ -17,6 +17,9 @@ elif is_tpu(): from .tpu import TpuPlatform current_platform = TpuPlatform() +elif is_hpu(): + from .hpu import HpuPlatform + current_platform = HpuPlatform() else: current_platform = UnspecifiedPlatform() diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py new file mode 100644 index 0000000000000..45f2b95e704d6 --- /dev/null +++ b/vllm/platforms/hpu.py @@ -0,0 +1,17 @@ +from typing import Tuple + +import torch + +from .interface import Platform, PlatformEnum + + +class HpuPlatform(Platform): + _enum = PlatformEnum.HPU + + @staticmethod + def get_device_capability(device_id: int = 0) -> Tuple[int, int]: + raise RuntimeError("HPU does not have device capability.") + + @staticmethod + def inference_mode(): + return torch.no_grad() diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 0760f9554fb78..3c7b4dc858327 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -8,6 +8,7 @@ class PlatformEnum(enum.Enum): CUDA = enum.auto() ROCM = enum.auto() TPU = enum.auto() + HPU = enum.auto() UNSPECIFIED = enum.auto() @@ -23,6 +24,9 @@ def is_rocm(self) -> bool: def is_tpu(self) -> bool: return self._enum == PlatformEnum.TPU + def is_hpu(self) -> bool: + return self._enum == PlatformEnum.HPU + @staticmethod def get_device_capability(device_id: int = 0) -> Tuple[int, int]: raise NotImplementedError From 667c7f3c0808edd9d204872f34addf5b22bed134 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 5 Aug 2024 10:15:43 +0200 Subject: [PATCH 117/341] Re-enable FusedRoPE (#145) --- vllm/hpu/rotary_embed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py index e44bfa2f6210c..30a88d68a24af 100644 --- a/vllm/hpu/rotary_embed.py +++ b/vllm/hpu/rotary_embed.py @@ -20,7 +20,7 @@ except ImportError: logger.warning("Could not import HPU FusedRoPE kernel. 
" "vLLM will use forward_native implementation of RoPE.") - FusedRoPE = None + FusedRoPE = None else: FusedRoPE = None From 14c20a333c908fa3991fa18ef970023464eb752f Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 5 Aug 2024 11:00:34 +0200 Subject: [PATCH 118/341] Overhaul HPU memory management in HPUGraph capture (#147) * Log more HPU memory metrics during vLLM startup * Overhaul memory management in HPUGraph capture * fix percentage in decode buckets --- vllm/utils.py | 2 +- vllm/worker/habana_model_runner.py | 102 +++++++++++++++++++++++------ vllm/worker/habana_worker.py | 42 +++++++++--- 3 files changed, 118 insertions(+), 28 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index c1d0f37eb154f..8a1bc5de03eb7 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -664,7 +664,7 @@ def get_summary_string(self): return ( f"{format_bytes(self.consumed_device_memory)} of device memory " f"({format_bytes(self.final_device_memory)}/" - f"({format_bytes(HabanaMemoryProfiler.total_device_memory())} used)" + f"{format_bytes(HabanaMemoryProfiler.total_device_memory())} used)" f" and {format_bytes(self.consumed_host_memory)} of host memory " f"({format_bytes(self.final_host_memory)}/" f"{format_bytes(HabanaMemoryProfiler.total_host_memory())} used)") diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 8a220e2ef0171..cf91c69069ed6 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -409,7 +409,7 @@ def __init__( # Profiler stats self.profiler_counter_helper = HabanaProfilerCounterHelper() - + self._mem_margin: Optional[int] = None self._setup_buckets() def load_model(self) -> None: @@ -1071,10 +1071,15 @@ def warmup_all_buckets(self, buckets, is_prompt, kv_caches): len(buckets), batch_size, seq_len) self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) - def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, - available_mem): - total_batch_seq = 0.001 - total_mem = 0 + def warmup_graphs(self, + strategy, + buckets, + is_prompt, + kv_caches, + available_mem, + starting_mem=0, + total_batch_seq=0.001): + total_mem = starting_mem idx = 0 phase = f'Graph/{"Prompt" if is_prompt else "Decode"}' num_candidates = len(buckets) @@ -1088,14 +1093,18 @@ def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, raise NotImplementedError( f'Unsupported graph allocation strategy: {strategy}') buckets = list(sorted(buckets, key=ordering)) - + captured_all = True for idx, (batch_size, seq_len) in enumerate(buckets): # Graph memory usage is proportional to seq dimension in a batch batch_seq = batch_size * seq_len if is_prompt else batch_size mem_estimate = batch_seq / total_batch_seq * total_mem if mem_estimate >= available_mem: + captured_all = False + continue + graphed_bucket = (batch_size, seq_len, is_prompt) + if graphed_bucket in self.graphed_buckets: continue - self.graphed_buckets.add((batch_size, seq_len, is_prompt)) + self.graphed_buckets.add(graphed_bucket) self.log_warmup(phase, idx, num_candidates, batch_size, seq_len) with HabanaMemoryProfiler() as mem_prof: self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) @@ -1104,6 +1113,12 @@ def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, available_mem -= used_mem total_mem += used_mem total_batch_seq += batch_seq + + return total_mem, total_batch_seq, captured_all + + def log_graph_warmup_summary(self, buckets, is_prompt, total_mem): + num_candidates = len(buckets) + phase = f'Graph/{"Prompt" if is_prompt else 
"Decode"}' graphed = list(c[:2] for c in self.graphed_buckets if c[2] == is_prompt) msg = (f'{phase} captured:{len(graphed)} ' @@ -1124,22 +1139,63 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: self.warmup_all_buckets(self.decode_buckets, False, kv_caches) if not self.enforce_eager and htorch.utils.internal.is_lazy(): - mem_margin = 1.0 - float( - os.environ.get('VLLM_GRAPH_MEM_MARGIN', '0.02')) - free_mem = \ - mem_margin * HabanaMemoryProfiler.current_free_device_memory() - free_mem = align_workers(free_mem, torch.distributed.ReduceOp.MIN) + assert self.mem_margin is not None, \ + ("HabanaWorker.determine_num_available_blocks needs " + "to be called before warming up the model.") + free_mem = HabanaMemoryProfiler.current_free_device_memory() + graph_free_mem = free_mem - self.mem_margin + graph_free_mem = align_workers(graph_free_mem, + torch.distributed.ReduceOp.MIN) prompt_graph_mem_ratio = float( os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.5')) - prompt_available_memory = prompt_graph_mem_ratio * free_mem - decode_available_memory = free_mem - prompt_available_memory - prompt_strategy = 'min_tokens' + prompt_available_memory = prompt_graph_mem_ratio * graph_free_mem + decode_available_memory = graph_free_mem - prompt_available_memory + msg = (f"Using {format_bytes(graph_free_mem)}" + f"/{format_bytes(free_mem)} " + "of free device memory for HPUGraphs, " + f"{format_bytes(prompt_available_memory)} for prompt and " + f"{format_bytes(decode_available_memory)} for decode " + f"(VLLM_GRAPH_PROMPT_RATIO={prompt_graph_mem_ratio})") + logger.info(msg) + prompt_strategy = os.environ.get('VLLM_GRAPH_PROMPT_STRATEGY', + 'min_tokens') decode_strategy = os.environ.get('VLLM_GRAPH_DECODE_STRATEGY', 'max_bs') - self.warmup_graphs(prompt_strategy, self.prompt_buckets, True, - kv_caches, prompt_available_memory) - self.warmup_graphs(decode_strategy, self.decode_buckets, False, - kv_caches, decode_available_memory) + mem_post_prompt, prompt_batch_seq, prompt_captured_all = \ + self.warmup_graphs( + prompt_strategy, self.prompt_buckets, True, kv_caches, + prompt_available_memory) + mem_post_decode, decode_batch_seq, decode_captured_all = \ + self.warmup_graphs( + decode_strategy, self.decode_buckets, False, kv_caches, + decode_available_memory) + + # Not all prompt buckets were captured, but all decode buckets were + # captured and we have some free graph-allocated space left. + # Let's try to use it for capturing more prompt buckets. + if mem_post_decode + mem_post_prompt < graph_free_mem \ + and not prompt_captured_all \ + and decode_captured_all: + mem_post_prompt, _, prompt_captured_all = self.warmup_graphs( + prompt_strategy, self.prompt_buckets, True, kv_caches, + graph_free_mem - mem_post_prompt - mem_post_decode, + mem_post_prompt, prompt_batch_seq) + + # Not all decode buckets were captured, but all prompt buckets were + # captured and we have some free graph-allocated space left. + # Let's try to use it for capturing more decode buckets. 
+ if mem_post_decode + mem_post_prompt < graph_free_mem \ + and not decode_captured_all \ + and prompt_captured_all: + mem_post_decode, _, _ = self.warmup_graphs( + decode_strategy, self.decode_buckets, False, kv_caches, + graph_free_mem - mem_post_prompt - mem_post_decode, + mem_post_decode, decode_batch_seq) + + self.log_graph_warmup_summary(self.prompt_buckets, True, + mem_post_prompt) + self.log_graph_warmup_summary(self.decode_buckets, False, + mem_post_decode) end_time = time.perf_counter() end_mem = HabanaMemoryProfiler.current_device_memory_usage() @@ -1154,6 +1210,14 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: def vocab_size(self) -> int: return self.model_config.get_vocab_size() + @property + def mem_margin(self) -> Optional[int]: + return self._mem_margin + + @mem_margin.setter + def mem_margin(self, value): + self._mem_margin = value + def _maybe_wrap_in_hpu_graph(*args, **kwargs): return htorch.hpu.wrap_in_hpu_graph(HpuModelAdapter( diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 6be229e037d06..f3fdc4dcc63c6 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -16,14 +16,18 @@ SpeculativeConfig) from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) +from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest +from vllm.utils import HabanaMemoryProfiler, format_bytes from vllm.worker.cache_engine import CacheEngine from vllm.worker.habana_model_runner import HabanaModelRunner from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerInput +logger = init_logger(__name__) + class HabanaWorker(LocalOrDistributedWorkerBase): """A worker class that executes (a partition of) the model on a HPU. @@ -122,20 +126,37 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Execute a forward pass with dummy inputs to profile the memory usage # of the model. 
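# Worked numbers (purely illustrative values, not measurements) for the
# HPUGraph memory split above: the worker-provided memory margin is taken
# off the free device memory, and VLLM_GRAPH_PROMPT_RATIO splits the
# remainder between prompt and decode graph capture.
GiB = 1024**3
free_mem = 16 * GiB                  # assumed free device memory after warmup
mem_margin = 2 * GiB                 # assumed margin set by the worker

graph_free_mem = free_mem - mem_margin                               # 14 GiB
prompt_graph_mem_ratio = 0.5                    # VLLM_GRAPH_PROMPT_RATIO
prompt_available_memory = prompt_graph_mem_ratio * graph_free_mem    # 7 GiB
decode_available_memory = graph_free_mem - prompt_available_memory   # 7 GiB

# Inside warmup_graphs, each bucket's cost is then estimated proportionally,
# mem_estimate = batch_seq / total_batch_seq * total_mem, and a bucket is
# skipped (captured_all = False) once the estimate exceeds available_mem.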
- self.model_runner.profile_run() - torch.hpu.synchronize() - + with HabanaMemoryProfiler() as m: + self.model_runner.profile_run() + torch.hpu.synchronize() + msg = ("Model profiling run " + f"took {m.get_summary_string()}") + logger.info(msg) # At this point we should've allocated the maximum workspace for all # recipes we will use the extra memory for graphs/blocks free_hpu_memory = torch.hpu.mem_get_info()[0] cache_block_size = self.get_cache_block_size_bytes() - graph_headroom = 1 - (float( + graph_reserved_mem = (float( os.environ.get('VLLM_GRAPH_RESERVED_MEM', '0.4')) if not self.model_config.enforce_eager else 0) - num_hpu_blocks = int(free_hpu_memory * graph_headroom * - self.cache_config.gpu_memory_utilization // - cache_block_size) + graph_headroom = 1 - graph_reserved_mem + available_hpu_memory = free_hpu_memory * \ + self.cache_config.gpu_memory_utilization + hpu_memory_margin = free_hpu_memory * ( + 1 - self.cache_config.gpu_memory_utilization) + self.model_runner.mem_margin = hpu_memory_margin + cache_size_bytes = available_hpu_memory * graph_headroom + graph_headroom_bytes = available_hpu_memory * (1 - graph_headroom) + msg = ( + f"Free device memory: {format_bytes(free_hpu_memory)}, " + f"{format_bytes(available_hpu_memory)} usable " + f"(gpu_memory_utilization={self.cache_config.gpu_memory_utilization})," + f" {format_bytes(graph_headroom_bytes)} reserved for HPUGraphs " + f"(VLLM_GRAPH_RESERVED_MEM={graph_reserved_mem}), " + f"{format_bytes(cache_size_bytes)} reserved for KV cache") + logger.info(msg) + num_hpu_blocks = int(cache_size_bytes // cache_block_size) num_cpu_blocks = int(self.cache_config.swap_space_bytes // cache_block_size) num_hpu_blocks = max(num_hpu_blocks, 0) @@ -161,7 +182,12 @@ def initialize_cache(self, num_gpu_blocks: int, self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks - self._init_cache_engine() + with HabanaMemoryProfiler() as m: + self._init_cache_engine() + torch.hpu.synchronize() + msg = ("Initializing cache engine " + f"took {m.get_summary_string()}") + logger.info(msg) self._warm_up_model() def _init_cache_engine(self): From b43c7f915a28ac246aae39901a893eb9e8bed45b Mon Sep 17 00:00:00 2001 From: Karol Damaszke Date: Tue, 6 Aug 2024 10:53:52 +0200 Subject: [PATCH 119/341] Allocate blocks from id=1 for HPU (#160) --- vllm/core/block/cpu_gpu_block_allocator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 5287cd9c1bfb3..a4805e4f342f9 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -4,7 +4,7 @@ DeviceAwareBlockAllocator) from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator -from vllm.utils import Device +from vllm.utils import Device, is_hpu class CpuGpuBlockAllocator(DeviceAwareBlockAllocator): @@ -52,7 +52,9 @@ def create( - The block IDs are assigned contiguously, with GPU block IDs coming before CPU block IDs. 
""" - block_ids = list(range(num_gpu_blocks + num_cpu_blocks)) + # For HPU block ids cannot be equal to 0 + start_id = 1 if is_hpu() else 0 + block_ids = list(range(start_id, num_gpu_blocks + num_cpu_blocks)) gpu_block_ids = block_ids[:num_gpu_blocks] cpu_block_ids = block_ids[num_gpu_blocks:] From 37ca17f0097dae0a03fee6936062871ec49e2351 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 6 Aug 2024 15:55:24 +0200 Subject: [PATCH 120/341] Revert "Allocate blocks from id=1 for HPU (#160)" (#163) This reverts commit b43c7f915a28ac246aae39901a893eb9e8bed45b. --- vllm/core/block/cpu_gpu_block_allocator.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index a4805e4f342f9..5287cd9c1bfb3 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -4,7 +4,7 @@ DeviceAwareBlockAllocator) from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator -from vllm.utils import Device, is_hpu +from vllm.utils import Device class CpuGpuBlockAllocator(DeviceAwareBlockAllocator): @@ -52,9 +52,7 @@ def create( - The block IDs are assigned contiguously, with GPU block IDs coming before CPU block IDs. """ - # For HPU block ids cannot be equal to 0 - start_id = 1 if is_hpu() else 0 - block_ids = list(range(start_id, num_gpu_blocks + num_cpu_blocks)) + block_ids = list(range(num_gpu_blocks + num_cpu_blocks)) gpu_block_ids = block_ids[:num_gpu_blocks] cpu_block_ids = block_ids[num_gpu_blocks:] From 1f348b85459be2b12f9e86be95ef5a7179f641cf Mon Sep 17 00:00:00 2001 From: Jan Kaniecki Date: Mon, 12 Aug 2024 14:54:04 +0200 Subject: [PATCH 121/341] Reimplement silu_and_mul for mixtral (#167) * Reimplement silu and mul in mixtral * Typo fix --- vllm/hpu/ops.py | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index bd737917cb919..3748eb3544dd1 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -16,13 +16,6 @@ PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '1') == '1') -def silu_and_mul(output, input): - d = input.shape[-1] // 2 - silu = torch.nn.SiLU().to(input.device) - x, y = torch.split(input, d, dim=-1) - output.copy_(silu(x) * y) - - def fetch_from_cache(cache, blocks, permutations): return [ cache.index_select(0, blocks[:, i]).permute(permutations) @@ -81,12 +74,9 @@ def paged_attention_v1(query, return attn_weights.squeeze(-2) -def silu_and_mul_wrapper(x: torch.Tensor) -> torch.Tensor: +def silu_and_mul(x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 - output_shape = (x.shape[:-1] + (d, )) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - silu_and_mul(out, x) - return out + return F.silu(x[..., :d]) * x[..., d:] def static_fused_moe(hidden_states, w1, w2, score, topk): @@ -111,13 +101,10 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): htorch.core.mark_step() for expert_idx in range(num_experts): - padded_weight = padded_weights[expert_idx] - current_state_static = hidden_states.reshape(-1, D) - w_output = silu_and_mul_wrapper( - torch.matmul(current_state_static, w1[expert_idx].transpose(0, 1))) + w_output = torch.matmul(hidden_states, w1[expert_idx].transpose(0, 1)) + w_output = silu_and_mul(w_output) w_output = torch.matmul(w_output, w2[expert_idx].transpose(0, 1)) - current_hidden_states_static = w_output * padded_weight - final_hidden_states 
+= current_hidden_states_static + final_hidden_states += w_output * padded_weights[expert_idx] htorch.core.mark_step() return final_hidden_states.view(-1, D) From d29191000b11b960ca29b65d5876f05756d27ac0 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 13 Aug 2024 12:14:31 +0200 Subject: [PATCH 122/341] Enable GitHub Actions static checks for habana_main (#177) * Update target branch * format layernorm.py * adjust format.sh & vllm.hpu.ops * fix layernorm once for all --- .github/workflows/clang-format.yml | 6 +++--- .github/workflows/mypy.yaml | 8 +++++--- .github/workflows/ruff.yml | 6 +++--- .github/workflows/yapf.yml | 6 +++--- format.sh | 1 + vllm/hpu/ops.py | 16 +++++++++++++--- vllm/model_executor/layers/layernorm.py | 12 +----------- 7 files changed, 29 insertions(+), 26 deletions(-) diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index e9b6e28fa6bcb..9d40813a98d7a 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -2,13 +2,13 @@ name: clang-format on: # Trigger the workflow on push or pull request, - # but only for the main branch + # but only for the habana_main branch push: branches: - - main + - habana_main pull_request: branches: - - main + - habana_main jobs: clang-format: diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 5780f09a646cb..c2674b914f485 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -2,13 +2,13 @@ name: mypy on: # Trigger the workflow on push or pull request, - # but only for the main branch + # but only for the habana_main branch push: branches: - - main + - habana_main pull_request: branches: - - main + - habana_main jobs: ruff: @@ -50,4 +50,6 @@ jobs: mypy vllm/transformers_utils --config-file pyproject.toml mypy vllm/usage --config-file pyproject.toml mypy vllm/worker --config-file pyproject.toml + mypy vllm/hpu --config-file pyproject.toml + diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index 773def58fd966..a2b7aa2549af9 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -2,13 +2,13 @@ name: ruff on: # Trigger the workflow on push or pull request, - # but only for the main branch + # but only for the habana_main branch push: branches: - - main + - habana_main pull_request: branches: - - main + - habana_main jobs: ruff: diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml index 04f307bcf8b0e..4e0d67c5b59d6 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -2,13 +2,13 @@ name: yapf on: # Trigger the workflow on push or pull request, - # but only for the main branch + # but only for the habana_main branch push: branches: - - main + - habana_main pull_request: branches: - - main + - habana_main jobs: yapf: runs-on: ubuntu-latest diff --git a/format.sh b/format.sh index 5ad6d6f2938bb..fbfc27a68bb3d 100755 --- a/format.sh +++ b/format.sh @@ -113,6 +113,7 @@ mypy vllm/spec_decode --config-file pyproject.toml mypy vllm/transformers_utils --config-file pyproject.toml mypy vllm/usage --config-file pyproject.toml mypy vllm/worker --config-file pyproject.toml +mypy vllm/hpu --config-file pyproject.toml # If git diff returns a file that is in the skip list, the file may be checked anyway: diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 3748eb3544dd1..7a40e6e720259 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -12,6 +12,16 @@ import torch.nn.functional as F import vllm.hpu.utils as hpu_utils +from vllm.logger import init_logger + 
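# Standalone check (toy tensor sizes, assumed values) that the reimplemented
# slice-based silu_and_mul above is equivalent to splitting the tensor in
# half and applying SiLU to the first part.
import torch
import torch.nn.functional as F


def silu_and_mul(x: torch.Tensor) -> torch.Tensor:
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]


x = torch.randn(4, 16)
gate, up = torch.split(x, 8, dim=-1)
assert torch.allclose(silu_and_mul(x), F.silu(gate) * up)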
+logger = init_logger() +HPUFusedRMSNorm = None +try: + from habana_frameworks.torch.hpex.normalization import FusedRMSNorm + HPUFusedRMSNorm = FusedRMSNorm +except ImportError: + logger.warning("Could not import HPU FusedRMSNorm kernel. " + "vLLM will use forward_native implementation of RMSNorm.") PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '1') == '1') @@ -52,8 +62,7 @@ def paged_attention_v1(query, keys = [k.unflatten(1, (kv_heads, 1)) for k in keys] mask = mask.unsqueeze(2) - attn_weights = [torch.matmul(query, k) for k in keys] - attn_weights = torch.cat(attn_weights, dim=-1) + attn_weights = torch.cat([torch.matmul(query, k) for k in keys], dim=-1) if alibi_slopes is not None: attn_weights.add_(alibi_slopes[:, :, -attn_weights.size(2):, -attn_weights.size(3):]) @@ -128,7 +137,8 @@ def prompt_attention( query = query.unflatten(1, (kv_heads, -1)) key = key.unflatten(1, (kv_heads, 1)) value = value.unflatten(1, (kv_heads, 1)) - attn_bias = attn_bias.unsqueeze(2) + if attn_bias is not None: + attn_bias = attn_bias.unsqueeze(2) attn_weights = torch.matmul(query * scale, key.transpose(-1, -2)) if attn_bias is not None: attn_weights.add_(attn_bias) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 01429d2fcbd17..55cbbabd7da44 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -6,19 +6,8 @@ from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp -from vllm.utils import is_hpu logger = init_logger(__name__) -if is_hpu(): - try: - from habana_frameworks.torch.hpex.normalization import (FusedRMSNorm as - HPUFusedRMSNorm - ) - except ImportError: - logger.warning( - "Could not import HPU FusedRMSNorm kernel. " - "vLLM will use forward_native implementation of RMSNorm.") - HPUFusedRMSNorm = None class RMSNorm(CustomOp): @@ -86,6 +75,7 @@ def forward_hpu( x: torch.Tensor, residual: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + from vllm.hpu.ops import HPUFusedRMSNorm if HPUFusedRMSNorm is None: return self.forward_native(x, residual) if residual is not None: From 66eae9e75e6e70a69eeefbe24e8a1f0499524a3b Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 13 Aug 2024 13:46:19 +0200 Subject: [PATCH 123/341] remove reminder_comment.yml (#179) --- .github/workflows/reminder_comment.yml | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 .github/workflows/reminder_comment.yml diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml deleted file mode 100644 index 390c88bb65308..0000000000000 --- a/.github/workflows/reminder_comment.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: PR Reminder Comment Bot -on: - pull_request_target: - types: [opened] - -jobs: - pr_reminder: - runs-on: ubuntu-latest - steps: - - name: Remind to run full CI on PR - uses: actions/github-script@v6 - with: - script: | - github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which consists a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of default ones by unblocking the steps in your `fast-check` build on Buildkite UI. 
\n\nOnce the PR is approved and ready to go, please make sure to run full CI as it is required to merge (or just use auto-merge).\n\n To run full CI, you can do one of these:\n- Comment `/ready` on the PR\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀' - }) - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From c0984334c495762b10ee37dc817afad9fec0ef57 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 13 Aug 2024 13:46:33 +0200 Subject: [PATCH 124/341] Fix logger initialization in ops.py (#178) --- vllm/hpu/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 7a40e6e720259..c8f00c1cbd59d 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -14,7 +14,7 @@ import vllm.hpu.utils as hpu_utils from vllm.logger import init_logger -logger = init_logger() +logger = init_logger(__name__) HPUFusedRMSNorm = None try: from habana_frameworks.torch.hpex.normalization import FusedRMSNorm From 6f047d864ba3f7b409eeaedfd1e92f61389d31da Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 14 Aug 2024 14:53:48 +0200 Subject: [PATCH 125/341] 1.17 documentation update (#172) --- .../getting_started/gaudi-installation.rst | 234 +++++++++++++++++- 1 file changed, 230 insertions(+), 4 deletions(-) diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index a9f3ebdf274f6..7af291d62efc6 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -18,7 +18,7 @@ Requirements - OS: Ubuntu 22.04 LTS - Python: 3.10 - Intel Gaudi accelerator -- Intel Gaudi software version 1.16.0 or newer +- Intel Gaudi software version 1.17.0 To verify that the Intel Gaudi software was correctly installed, run: @@ -44,8 +44,8 @@ Use the following commands to run a Docker image: .. code:: console - $ docker pull vault.habana.ai/gaudi-docker/1.16.2/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.16.2/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest + $ docker pull vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest + $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest Build and Install vLLM --------------------------- @@ -112,6 +112,12 @@ Gaudi2 devices. Configurations that are not listed may or may not work. - `meta-llama/Meta-Llama-3-8B-Instruct `__ on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3.1-8B `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3.1-8B-Instruct `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling - `meta-llama/Llama-2-70b `__ with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling - `meta-llama/Llama-2-70b-chat-hf `__ @@ -120,14 +126,187 @@ Gaudi2 devices. Configurations that are not listed may or may not work. 
with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling - `meta-llama/Meta-Llama-3-70B-Instruct `__ with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3.1-70B `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3.1-70B-Instruct `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling - `mistralai/Mistral-7B-Instruct-v0.3 `__ on single HPU or with tensor parallelism on 2x HPU, BF16 datatype with random or greedy sampling - `mistralai/Mixtral-8x7B-Instruct-v0.1 `__ with tensor parallelism on 2x HPU, BF16 datatype with random or greedy sampling -Performance Tips +Performance Tuning ================ +Execution modes +------------ + +Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via ``PT_HPU_LAZY_MODE`` environment variable), and ``--enforce-eager`` flag. + +.. list-table:: vLLM execution modes + :widths: 25 25 50 + :header-rows: 1 + + * - ``PT_HPU_LAZY_MODE`` + - ``enforce_eager`` + - execution mode + * - 0 + - 0 + - torch.compile + * - 0 + - 1 + - PyTorch eager mode + * - 1 + - 0 + - HPU Graphs + * - 1 + - 1 + - PyTorch lazy mode + +.. warning:: + In 1.17.0, all modes utilizing ``PT_HPU_LAZY_MODE=0`` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.17.0, please use HPU Graphs, or PyTorch lazy mode. + + +Bucketing mechanism +------------ + +Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. `Intel Gaudi Graph Compiler `__ is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. +In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - ``batch_size`` and ``sequence_length``. + +.. note:: + Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. + +Bucketing ranges are determined with 3 parameters - ``min``, ``step`` and ``max``. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: + +.. 
code-block:: + + INFO 08-01 21:37:59 habana_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] + INFO 08-01 21:37:59 habana_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] + INFO 08-01 21:37:59 habana_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] + INFO 08-01 21:37:59 habana_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + +``min`` determines the lowest value of the bucket. ``step`` determines the interval between buckets, and ``max`` determines the upper bound of the bucket. Furthermore, interval between ``min`` and ``step`` has special handling - ``min`` gets multiplied by consecutive powers of two, until ``step`` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes. + +Example (with ramp-up) + +.. code-block:: + + min = 2, step = 32, max = 64 + => ramp_up = (2, 4, 8, 16) + => stable = (32, 64) + => buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64) + +Example (without ramp-up) + +.. code-block:: + + min = 128, step = 128, max = 512 + => ramp_up = () + => stable = (128, 256, 384, 512) + => buckets = ramp_up + stable => (128, 256, 384, 512) + + +In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. + +.. warning:: + If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario. + +As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as ``(4, 512)`` prefill bucket, as ``batch_size`` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as ``(4, 512)`` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a ``(2, 512)`` bucket, or context length increases above 512 tokens, in which case it will become ``(4, 640)`` bucket. + +.. 
note:: + Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. + +Warmup +------------ + +Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup: + +.. code-block:: + + INFO 08-01 22:26:47 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB + INFO 08-01 22:26:47 habana_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB + INFO 08-01 22:26:48 habana_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB + ... + INFO 08-01 22:26:59 habana_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB + INFO 08-01 22:27:00 habana_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB + INFO 08-01 22:27:00 habana_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB + INFO 08-01 22:27:01 habana_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB + ... + INFO 08-01 22:27:16 habana_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB + INFO 08-01 22:27:16 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB + +This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. + +.. tip:: + Compiling all the buckets might take some time and can be turned off with ``VLLM_SKIP_WARMUP=true`` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. + +HPU Graph capture +------------ + +`HPU Graphs `__ are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management. + + +When HPU Graphs are being used, they share the common memory pool ("usable memory") as KV cache, determined by ``gpu_memory_utilization`` flag (``0.9`` by default). +Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage. +Only after that, ``gpu_memory_utilization`` flag is utilized - at its default value, will mark 90% of free device memory at that point as usable. +Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. +Environment variable ``VLLM_GRAPH_RESERVED_MEM`` defines the ratio of memory reserved for HPU Graphs capture. 
+With its default value (``VLLM_GRAPH_RESERVED_MEM=0.4``), 40% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 60% will be utilized for KV cache. +Environment variable ``VLLM_GRAPH_PROMPT_RATIO`` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (``VLLM_GRAPH_PROMPT_RATIO=0.5``), both stages have equal memory constraints. +Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. ``VLLM_GRAPH_PROMPT_RATIO=0.2`` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. + +.. note:: + ``gpu_memory_utilization`` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, ``gpu_memory_utilization`` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory. + +User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented: +- ``max_bs`` - graph capture queue will sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. ``(64, 128)``, ``(64, 256)``, ``(32, 128)``, ``(32, 256)``, ``(1, 128)``, ``(1,256)``), default strategy for decode +- ``min_tokens`` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (``batch_size*sequence_length``), default strategy for prompt + +When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by ``max_bs`` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in ``min_tokens`` strategy. + + +.. note:: + ``VLLM_GRAPH_PROMPT_RATIO`` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * ``VLLM_GRAPH_PROMPT_RATIO``) for capturing prefill HPU Graphs, next it will attempt do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below. + + +Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): + +.. 
code-block:: + + INFO 08-02 17:37:44 habana_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] + INFO 08-02 17:37:44 habana_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] + INFO 08-02 17:37:44 habana_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] + INFO 08-02 17:37:44 habana_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + INFO 08-02 17:37:52 habana_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:52 habana_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:52 habana_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:54 habana_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) + INFO 08-02 17:37:54 habana_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache + INFO 08-02 17:37:54 habana_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 + INFO 08-02 17:37:54 habana_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) + INFO 08-02 17:37:54 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB + ... + INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB + INFO 08-02 17:38:22 habana_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.5) + INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB + ... + INFO 08-02 17:38:26 habana_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB + INFO 08-02 17:38:27 habana_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB + ... 
+ INFO 08-02 17:38:41 habana_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB + INFO 08-02 17:38:41 habana_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB + INFO 08-02 17:38:42 habana_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB + INFO 08-02 17:38:42 habana_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB + INFO 08-02 17:38:43 habana_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB + INFO 08-02 17:38:43 habana_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] + INFO 08-02 17:38:43 habana_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + INFO 08-02 17:38:43 habana_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory + INFO 08-02 17:38:43 habana_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) + + +Recommended vLLM Parameters +------------ + - We recommend running inference on Gaudi 2 with ``block_size`` of 128 for BF16 data type. Using default values (16, 32) might lead to sub-optimal performance due to Matrix Multiplication Engine @@ -137,6 +316,53 @@ Performance Tips of 128 or 256 and max context length of 2048 with HPU Graphs enabled. If you encounter out-of-memory issues, see troubleshooting section. +Environment variables +------------ + +**Diagnostic and profiling knobs:** + +- ``VLLM_PROFILER_ENABLED``: if ``true``, high level profiler will be enabled. Resulting JSON traces can be viewed in `perfetto.habana.ai `__. Disabled by default. +- ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION``: if ``true``, will log graph compilations per each vLLM engine step, only when there was any - highly recommended to use alongside ``PT_HPU_METRICS_GC_DETAILS=1``. Disabled by default. +- ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL``: if ``true``, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default. +- ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS``: if ``true``, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default. +- ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL``: if ``true``, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default. 
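+
+As an illustration only (this helper is not part of vLLM), boolean knobs like the ones above follow a simple string convention and could be read as in the sketch below:
+
+.. code-block:: python
+
+   import os
+
+   def env_flag(name: str, default: str = 'false') -> bool:
+       # A knob is treated as enabled when set to the literal string "true".
+       return os.environ.get(name, default).lower() == 'true'
+
+   if env_flag('VLLM_PROFILER_ENABLED'):
+       print('High-level profiler enabled; traces can be viewed in perfetto.habana.ai')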
+ +**Performance tuning knobs:** + +- ``VLLM_SKIP_WARMUP``: if ``true``, warmup will be skipped, ``false`` by default +- ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.4`` by default +- ``VLLM_GRAPH_PROMPT_RATIO``: percentage of reserved graph memory dedicated for prompt graphs, ``0.5`` by default +- ``VLLM_GRAPH_PROMPT_STRATEGY``: strategy determining order of prompt graph capture, ``min_tokens`` or ``max_bs``, ``min_tokens`` by default +- ``VLLM_GRAPH_DECODE_STRATEGY``: strategy determining order of decode graph capture, ``min_tokens`` or ``max_bs``, ``max_bs`` by default +- ``VLLM_{phase}_{dim}_BUCKET_{param}`` - collection of 12 environment variables configuring ranges of bucketing mechanism + + - ``{phase}`` is either ``PROMPT`` or ``DECODE`` + - ``{dim}`` is either ``BS`` or ``SEQ`` + - ``{param}`` is either ``MIN``, ``STEP`` or ``MAX`` + - Default values: + + - Prompt: + - batch size min (``VLLM_PROMPT_BS_BUCKET_MIN``): ``1`` + - batch size step (``VLLM_PROMPT_BS_BUCKET_STEP``): ``32`` + - batch size max (``VLLM_PROMPT_BS_BUCKET_MAX``): ``min(max_num_seqs, 64)`` + - sequence length min (``VLLM_PROMPT_SEQ_BUCKET_MIN``): ``block_size`` + - sequence length step (``VLLM_PROMPT_SEQ_BUCKET_STEP``): ``block_size`` + - sequence length max (``VLLM_PROMPT_SEQ_BUCKET_MAX``): ``1024`` + + - Decode: + - batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``1`` + - batch size step (``VLLM_DECODE_BS_BUCKET_STEP``): ``128`` + - batch size max (``VLLM_DECODE_BS_BUCKET_MAX``): ``max_num_seqs`` + - sequence length min (``VLLM_DECODE_SEQ_BUCKET_MIN``): ``block_size`` + - sequence length step (``VLLM_DECODE_SEQ_BUCKET_STEP``): ``block_size`` + - sequence length max (``VLLM_DECODE_SEQ_BUCKET_MAX``): ``2048`` + + +Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: + +- ``PT_HPU_LAZY_MODE``: if ``0``, PyTorch Eager backend for Gaudi will be used, if ``1`` PyTorch Lazy backend for Gaudi will be used, ``1`` is default +- ``PT_HPU_ENABLE_LAZY_COLLECTIVES``: required to be ``true`` for tensor parallel inference with HPU Graphs + Troubleshooting: Tweaking HPU Graphs ==================================== From 1e0e492e1400114f9156d61ffdd73585181ed119 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 14 Aug 2024 15:06:19 +0200 Subject: [PATCH 126/341] Readme 1.17 update (#186) FILL IN THE PR DESCRIPTION HERE FIX #xxxx (*link existing issues this PR will resolve*) **BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE** ---
--- README_GAUDI.md | 497 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 435 insertions(+), 62 deletions(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index 1a1b2d9cc6e36..a569d6314acf8 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -1,25 +1,25 @@ -# vLLM with Intel® Gaudi® 2 AI Accelerators +vLLM with Intel® Gaudi® AI Accelerators +======================================= -This README provides instructions on running vLLM with Intel Gaudi devices. +This README provides instructions on running vLLM with Intel Gaudi +devices. Requirements and Installation -============================== +============================= -Please follow the instructions provided in the [Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) -to set up the environment. To achieve the best performance, please follow the methods outlined in the -[Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). - -> [!NOTE] -> In this release (1.16.0), we are only targeting functionality and -> accuracy. Performance will be improved in next releases. +Please follow the instructions provided in the [Gaudi Installation +Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) +to set up the environment. To achieve the best performance, please +follow the methods outlined in the [Optimizing Training Platform +Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). Requirements -------------- +------------ - OS: Ubuntu 22.04 LTS - Python: 3.10 -- Intel Gaudi 2 accelerator -- Intel Gaudi software version 1.16.0 +- Intel Gaudi accelerator +- Intel Gaudi software version 1.17.0 To verify that the Intel Gaudi software was correctly installed, run: @@ -29,41 +29,50 @@ $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, ha $ pip list | habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml, habana-media-loader and habana_quantization_toolkit are installed ``` -Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) for more details. +Refer to [Intel Gaudi Software Stack +Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) +for more details. Run Docker Image ------------------- +---------------- -It is highly recommended to use the latest Docker image from Intel -Gaudi vault. Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) for more details. +It is highly recommended to use the latest Docker image from Intel Gaudi +vault. Refer to the [Intel Gaudi +documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) +for more details. 
Use the following commands to run a Docker image: ``` {.console} -$ docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest -$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest - ``` +$ docker pull vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest +$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest +``` -Build and Install vLLM-fork ------------------------------ +Build and Install vLLM +---------------------- -To build and install vLLM-fork from source, run: +Currently, the latest features and performance optimizations are +developed in Gaudi\'s [vLLM-fork](https://github.com/HabanaAI/vllm-fork) +and we periodically upstream them to vLLM main repo. To install latest +[HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the +following: ``` {.console} $ git clone https://github.com/HabanaAI/vllm-fork.git $ cd vllm-fork -# git checkout v0.4.2-Gaudi-1.16.0 -$ pip install -e . # This may take 5-10 minutes. +$ git checkout habana_main +$ python setup.py develop ``` Supported Features ================== -- [Offline batched inference](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#offline-batched-inference) -- Online inference via [OpenAI-Compatible Server](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server) +- [Offline batched + inference](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#offline-batched-inference) +- Online inference via [OpenAI-Compatible + Server](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server) - HPU autodetection - no need to manually select device within vLLM -- Paged KV cache with algorithms enabled for Intel Gaudi 2 - accelerators +- Paged KV cache with algorithms enabled for Intel Gaudi accelerators - Custom Intel Gaudi implementations of Paged Attention, KV cache ops, prefill attention, Root Mean Square Layer Normalization, Rotary Positional Encoding @@ -72,7 +81,6 @@ Supported Features Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) for accelerating low-batch latency and throughput - Unsupported Features ==================== @@ -82,11 +90,11 @@ Unsupported Features - Quantization (AWQ, FP8 E5M2, FP8 E4M3) - Prefill chunking (mixed-batch inferencing) - Supported Configurations ======================== -The following configurations have been validated to be function with Gaudi devices. Configurations that are not listed may or may not work. +The following configurations have been validated to be function with +Gaudi2 devices. Configurations that are not listed may or may not work. 
- [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 @@ -94,47 +102,412 @@ The following configurations have been validated to be function with Gaudi devic - [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling - [meta-llama/Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b) - with tensor parallelism on 8x HPU, BF16 datatype with random - or greedy sampling + with tensor parallelism on 8x HPU, BF16 datatype with random or + greedy sampling - [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) - with tensor parallelism 8x HPU, BF16 datatype with random - or greedy sampling + with tensor parallelism on 8x HPU, BF16 datatype with random or + greedy sampling +- [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) + with tensor parallelism on 8x HPU, BF16 datatype with random or + greedy sampling +- [meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) + with tensor parallelism on 8x HPU, BF16 datatype with random or + greedy sampling +- [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) + with tensor parallelism on 8x HPU, BF16 datatype with random or + greedy sampling +- [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) + with tensor parallelism on 8x HPU, BF16 datatype with random or + greedy sampling - [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) - on single HPU or with tensor parallelism 2x HPU, BF16 datatype with random or greedy sampling + on single HPU or with tensor parallelism on 2x HPU, BF16 datatype + with random or greedy sampling - [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) - with tensor parallelism 2x HPU, BF16 datatype with random or greedy sampling + with tensor parallelism on 2x HPU, BF16 datatype with random or + greedy sampling + +Performance Tuning +================ +Execution modes +----------------------------- +Currently in vLLM for HPU we support four execution modes, depending on +selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment +variable), and `--enforce-eager` flag. 
-Performance Tips -================ +| `PT_HPU_LAZY_MODE` | `enforce_eager` | execution mode | +|--- |--- |--- | +| 0 | 0 | torch.compile | +| 0 | 1 | PyTorch eager mode | +| 1 | 0 | HPU Graphs | +| 1 | 1 | PyTorch lazy mode | + + +> [!WARNING] +> In 1.17.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly +> experimental and should be only used for validating functional +> correctness. Their performance will be improved in the next releases. +> For obtaining the best performance in 1.17.0, please use HPU Graphs, or +> PyTorch lazy mode. + +Bucketing mechanism +----------------------------- + +Intel Gaudi accelerators work best when operating on models with fixed +tensor shapes. [Intel Gaudi Graph +Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) +is responsible for generating optimized binary code that implements the +given model topology on Gaudi. In its default configuration, the +produced binary code may be heavily dependent on input and output tensor +shapes, and can require graph recompilation when encountering +differently shaped tensors within the same topology. While the resulting +binaries utilize Gaudi efficiently, the compilation itself may introduce +a noticeable overhead in end-to-end execution. In a dynamic inference +serving scenario, there is a need to minimize the number of graph +compilations and reduce the risk of graph compilation occurring during +server runtime. Currently it is achieved by \"bucketing\" model\'s +forward pass across two dimensions - `batch_size` and `sequence_length`. + +> [!NOTE] +> Bucketing allows us to reduce the number of required graphs +> significantly, but it does not handle any graph compilation and device +> code generation - this is done in warmup and HPUGraph capture phase. + +Bucketing ranges are determined with 3 parameters - `min`, `step` and +`max`. They can be set separately for prompt and decode phase, and for +batch size and sequence length dimension. These parameters can be +observed in logs during vLLM startup: + +``` {.} +INFO 08-01 21:37:59 habana_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] +INFO 08-01 21:37:59 habana_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] +INFO 08-01 21:37:59 habana_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] +INFO 08-01 21:37:59 habana_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +``` -- We recommend running inference on Gaudi 2 with - `block_size` of 128 for BF16 data type. Using default - values (16, 32) might lead to sub-optimal performance due to Matrix - Multiplication Engine under-utilization (see [Gaudi +`min` determines the lowest value of the bucket. 
`step` determines the +interval between buckets, and `max` determines the upper bound of the +bucket. Furthermore, interval between `min` and `step` has special +handling - `min` gets multiplied by consecutive powers of two, until +`step` gets reached. We call this the ramp-up phase and it is used for +handling lower batch sizes with minimum wastage, while allowing larger +padding on larger batch sizes. + +Example (with ramp-up) + +``` {.} +min = 2, step = 32, max = 64 +=> ramp_up = (2, 4, 8, 16) +=> stable = (32, 64) +=> buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64) +``` + +Example (without ramp-up) + +``` {.} +min = 128, step = 128, max = 512 +=> ramp_up = () +=> stable = (128, 256, 384, 512) +=> buckets = ramp_up + stable => (128, 256, 384, 512) +``` + +In the logged scenario, 24 buckets were generated for prompt (prefill) +runs, and 48 buckets for decode runs. Each bucket corresponds to a +separate optimized device binary for a given model with specified tensor +shapes. Whenever a batch of requests is processed, it is padded across +batch and sequence length dimension to the smallest possible bucket. + +> [!WARNING] +> If a request exceeds maximum bucket size in any dimension, it will be +> processed without padding, and its processing may require a graph +> compilation, potentially significantly increasing end-to-end latency. +> The boundaries of the buckets are user-configurable via environment +> variables, and upper bucket boundaries can be increased to avoid such +> scenario. + +As an example, if a request of 3 sequences, with max sequence length of +412 comes in to an idle vLLM server, it will be padded executed as +`(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be +padded to 4 (closest batch\_size dimension higher than 3), and max +sequence length will be padded to 512 (closest sequence length dimension +higher than 412). After prefill stage, it will be executed as `(4, 512)` +decode bucket and will continue as that bucket until either batch +dimension changes (due to request being finished) - in which case it +will become a `(2, 512)` bucket, or context length increases above 512 +tokens, in which case it will become `(4, 640)` bucket. + +> [!NOTE] +> Bucketing is transparent to a client - padding in sequence length +> dimension is never returned to the client, and padding in batch +> dimension does not create new requests. + +Warmup +------ + +Warmup is an optional, but highly recommended step occurring before vLLM +server starts listening. It executes a forward pass for each bucket with +dummy data. The goal is to pre-compile all graphs and not incur any +graph compilation overheads within bucket boundaries during server +runtime. Each warmup step is logged during vLLM startup: + +``` {.} +INFO 08-01 22:26:47 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB +INFO 08-01 22:26:47 habana_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB +INFO 08-01 22:26:48 habana_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB +... 
+INFO 08-01 22:26:59 habana_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB +INFO 08-01 22:27:00 habana_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB +INFO 08-01 22:27:00 habana_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB +INFO 08-01 22:27:01 habana_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB +... +INFO 08-01 22:27:16 habana_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB +INFO 08-01 22:27:16 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB +``` + +This example uses the same buckets as in *Bucketing mechanism* section. +Each output line corresponds to execution of a single bucket. When +bucket is executed for the first time, its graph is compiled and can be +reused later on, skipping further graph compilations. + +> [!TIP] +> Compiling all the buckets might take some time and can be turned off +> with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if +> you do that, you may face graph compilations once executing a given +> bucket for the first time. It is fine to disable warmup for development, +> but it\'s highly recommended to enable it in deployment. + +HPU Graph capture +----------------------------- + +[HPU +Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) +are currently the most performant execution method of vLLM on Intel +Gaudi. When HPU Graphs are enabled, execution graphs will be traced +(recorded) ahead of time (after performing warmup), to be later replayed +during inference, significantly reducing host overheads. Recording can +take large amounts of memory, which needs to be taken into account when +allocating KV cache. Enabling HPU Graphs will impact the number of +available KV cache blocks, but vLLM provides user-configurable variables +to control memory management. + +When HPU Graphs are being used, they share the common memory pool +(\"usable memory\") as KV cache, determined by `gpu_memory_utilization` +flag (`0.9` by default). Before KV cache gets allocated, model weights +are loaded onto the device, and a forward pass of the model is executed +on dummy data, to estimate memory usage. Only after that, +`gpu_memory_utilization` flag is utilized - at its default value, will +mark 90% of free device memory at that point as usable. Next, KV cache +gets allocated, model is warmed up, and HPU Graphs are captured. +Environment variable `VLLM_GRAPH_RESERVED_MEM` defines the ratio of +memory reserved for HPU Graphs capture. With its default value +(`VLLM_GRAPH_RESERVED_MEM=0.4`), 40% of usable memory will be reserved +for graph capture (later referred to as \"usable graph memory\"), and +the remaining 60% will be utilized for KV cache. Environment variable +`VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory +reserved for prefill and decode graphs. By default +(`VLLM_GRAPH_PROMPT_RATIO=0.5`), both stages have equal memory +constraints. Lower value corresponds to less usable graph memory +reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will +reserve 20% of usable graph memory for prefill graphs, and 80% of usable +graph memory for decode graphs. + +> [!NOTE] +> `gpu_memory_utilization` does not correspond to the absolute memory +> usage across HPU. 
It specifies the memory margin after loading the model +> and performing a profile run. If device has 100 GiB of total memory, and +> 50 GiB of free memory after loading model weights and executing +> profiling run, `gpu_memory_utilization` at its default value will mark +> 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total +> device memory. + +User can also configure the strategy for capturing HPU Graphs for prompt +and decode stages separately. Strategy affects the order of capturing +graphs. There are two strategies implemented: - `max_bs` - graph capture +queue will sorted in descending order by their batch sizes. Buckets with +equal batch sizes are sorted by sequence length in ascending order (e.g. +`(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, +`(1,256)`), default strategy for decode - `min_tokens` - graph capture +queue will be sorted in ascending order by the number of tokens each +graph processes (`batch_size*sequence_length`), default strategy for +prompt + +When there\'s large amount of requests pending, vLLM scheduler will +attempt to fill the maximum batch size for decode as soon as possible. +When a request is finished, decode batch size decreases. When that +happens, vLLM will attempt to schedule a prefill iteration for requests +in the waiting queue, to fill the decode batch size to its previous +state. This means that in a full load scenario, decode batch size is +often at its maximum, which makes large batch size HPU Graphs crucial to +capture, as reflected by `max_bs` strategy. On the other hand, prefills +will be executed most frequently with very low batch sizes (1-4), which +is reflected in `min_tokens` strategy. + +> [!NOTE] +> `VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by +> graphs for each stage (prefill and decode). vLLM will first attempt to +> use up entirety of usable prefill graph memory (usable graph memory \* +> `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it +> will attempt do the same for decode graphs and usable decode graph +> memory pool. If one stage is fully captured, and there is unused memory +> left within usable graph memory pool, vLLM will attempt further graph +> capture for the other stage, until no more HPU Graphs can be captured +> without exceeding reserved memory pool. The behavior on that mechanism +> can be observed in the example below. 
+ +Each described step is logged by vLLM server, as follows (negative +values correspond to memory being released): + +``` {.} +INFO 08-02 17:37:44 habana_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] +INFO 08-02 17:37:44 habana_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] +INFO 08-02 17:37:44 habana_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] +INFO 08-02 17:37:44 habana_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +INFO 08-02 17:37:52 habana_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:52 habana_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:52 habana_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:54 habana_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) +INFO 08-02 17:37:54 habana_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache +INFO 08-02 17:37:54 habana_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 +INFO 08-02 17:37:54 habana_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) +INFO 08-02 17:37:54 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB +... +INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB +INFO 08-02 17:38:22 habana_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.5) +INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB +... +INFO 08-02 17:38:26 habana_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB +INFO 08-02 17:38:27 habana_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB +... 
+INFO 08-02 17:38:41 habana_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB +INFO 08-02 17:38:41 habana_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB +INFO 08-02 17:38:42 habana_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB +INFO 08-02 17:38:42 habana_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB +INFO 08-02 17:38:43 habana_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB +INFO 08-02 17:38:43 habana_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] +INFO 08-02 17:38:43 habana_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +INFO 08-02 17:38:43 habana_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory +INFO 08-02 17:38:43 habana_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) +``` + +Recommended vLLM Parameters +----------------------------- + +- We recommend running inference on Gaudi 2 with `block_size` of 128 + for BF16 data type. Using default values (16, 32) might lead to + sub-optimal performance due to Matrix Multiplication Engine + under-utilization (see [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). - For max throughput on Llama 7B, we recommend running with batch size - of 128 or 256 and max context length of 2048 with HPU Graphs enabled. - If you encounter out-of-memory issues, see troubleshooting section. + of 128 or 256 and max context length of 2048 with HPU Graphs + enabled. If you encounter out-of-memory issues, see troubleshooting + section. + +Environment variables +----------------------------- + +**Diagnostic and profiling knobs:** + +- `VLLM_PROFILER_ENABLED`: if `true`, high level profiler will be + enabled. Resulting JSON traces can be viewed in + [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). Disabled + by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: if `true`, will log graph + compilations per each vLLM engine step, only when there was any - + highly recommended to use alongside `PT_HPU_METRICS_GC_DETAILS=1`. + Disabled by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true`, will log graph + compilations per each vLLM engine step, always, even if there were + none. Disabled by default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true`, will log cpu fallbacks + per each vLLM engine step, only when there was any. Disabled by + default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, will log cpu + fallbacks per each vLLM engine step, always, even if there were + none. Disabled by default. 
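As a rough sanity check, the memory split reported in the example log above follows from the performance tuning knobs listed below; a minimal sketch of that arithmetic, using the values from that log (not vLLM code):

```python
# Reproducing the memory split from the example log above (values in GiB).
free_after_profiling = 79.16       # "Free device memory: 79.16 GiB"
gpu_memory_utilization = 0.5
graph_reserved_mem = 0.4           # VLLM_GRAPH_RESERVED_MEM
graph_prompt_ratio = 0.5           # VLLM_GRAPH_PROMPT_RATIO

usable = free_after_profiling * gpu_memory_utilization   # ~39.58 GiB usable
graph_mem = usable * graph_reserved_mem                   # ~15.83 GiB for HPU Graphs
kv_cache_mem = usable - graph_mem                         # ~23.75 GiB for KV cache
prompt_graph_mem = graph_mem * graph_prompt_ratio         # ~7.92 GiB per stage
```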
+ +**Performance tuning knobs:** + +- `VLLM_SKIP_WARMUP`: if `true`, warmup will be skipped, `false` by + default +- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for + HPUGraph capture, `0.4` by default +- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory + dedicated for prompt graphs, `0.5` by default +- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt + graph capture, `min_tokens` or `max_bs`, `min_tokens` by default +- `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode + graph capture, `min_tokens` or `max_bs`, `max_bs` by default +- `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment + variables configuring ranges of bucketing mechanism + - `{phase}` is either `PROMPT` or `DECODE` + - `{dim}` is either `BS` or `SEQ` + - `{param}` is either `MIN`, `STEP` or `MAX` + - Default values: + - Prompt: + - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` + - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `32` + - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): + `min(max_num_seqs, 64)` + - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): + `block_size` + - sequence length step + (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): + `1024` + + - Decode: + - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` + - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): + `128` + - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): + `max_num_seqs` + - sequence length min (`VLLM_DECODE_SEQ_BUCKET_MIN`): + `block_size` + - sequence length step + (`VLLM_DECODE_SEQ_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_DECODE_SEQ_BUCKET_MAX`): + `2048` + +Additionally, there are HPU PyTorch Bridge environment variables +impacting vLLM execution: + +- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be + used, if `1` PyTorch Lazy backend for Gaudi will be used, `1` is + default +- `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor + parallel inference with HPU Graphs Troubleshooting: Tweaking HPU Graphs ==================================== -If you experience device out-of-memory issues or want to attempt inference at higher batch sizes, try tweaking HPU Graphs by following the below: - -- Tweak `gpu_memory_utilization` knob. It - will decrease the allocation of KV cache, leaving some headroom for - capturing graphs with larger batch size. By default `gpu_memory_utilization` is set to 0.9. - It attempts to allocate \~90% of HBM left for KV cache after short - profiling run. Note that decreasing reduces the number of KV - cache blocks you have available, and therefore reduces the effective - maximum number of tokens you can handle at a given time. - -- If this method is not efficient, you can disable `HPUGraph` completely. With - HPU Graphs disabled, you are trading latency and throughput at lower - batches for potentially higher throughput on higher batches. You can do - that by adding `--enforce-eager` flag to server (for - online inference), or by passing `enforce_eager=True` - argument to LLM constructor (for offline inference). +If you experience device out-of-memory issues or want to attempt +inference at higher batch sizes, try tweaking HPU Graphs by following +the below: + +- Tweak `gpu_memory_utilization` knob. It will decrease the allocation + of KV cache, leaving some headroom for capturing graphs with larger + batch size. By default `gpu_memory_utilization` is set to 0.9. 
It + attempts to allocate \~90% of HBM left for KV cache after short + profiling run. Note that decreasing reduces the number of KV cache + blocks you have available, and therefore reduces the effective + maximum number of tokens you can handle at a given time. +- If this method is not efficient, you can disable `HPUGraph` + completely. With HPU Graphs disabled, you are trading latency and + throughput at lower batches for potentially higher throughput on + higher batches. You can do that by adding `--enforce-eager` flag to + server (for online inference), or by passing `enforce_eager=True` + argument to LLM constructor (for offline inference). From b0112c3a9a075e83f5bb98127586d925402f3614 Mon Sep 17 00:00:00 2001 From: Nir David <124874956+nirda7@users.noreply.github.com> Date: Wed, 14 Aug 2024 19:34:25 +0300 Subject: [PATCH 127/341] Support FP8 INC in vLLM (#144) FILL IN THE PR DESCRIPTION HERE FIX #xxxx (*link existing issues this PR will resolve*) **BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE** ---
--- README_GAUDI.md | 3 +- .../getting_started/gaudi-installation.rst | 3 +- vllm/attention/backends/habana_attn.py | 26 +++- vllm/attention/ops/habana_paged_attn.py | 10 ++ vllm/config.py | 8 +- vllm/engine/arg_utils.py | 14 ++- vllm/engine/llm_engine.py | 6 +- vllm/entrypoints/llm.py | 3 + vllm/executor/habana_executor.py | 9 ++ vllm/executor/ray_habana_executor.py | 3 + vllm/hpu/cache_ops.py | 31 +++++ vllm/hpu/ops.py | 33 +++-- vllm/hpu/utils.py | 40 ++++++ vllm/model_executor/layers/layernorm.py | 11 +- vllm/model_executor/layers/linear.py | 10 +- .../layers/quantization/__init__.py | 2 + .../model_executor/layers/quantization/inc.py | 115 ++++++++++++++++++ vllm/model_executor/model_loader/loader.py | 22 ++-- vllm/model_executor/models/llama.py | 6 + vllm/utils.py | 1 + vllm/worker/cache_engine.py | 4 +- vllm/worker/habana_model_runner.py | 57 ++++++++- vllm/worker/habana_worker.py | 21 ++++ 23 files changed, 387 insertions(+), 51 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/inc.py diff --git a/README_GAUDI.md b/README_GAUDI.md index a569d6314acf8..9ea30a2e43f69 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -26,7 +26,8 @@ To verify that the Intel Gaudi software was correctly installed, run: ``` {.console} $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core and habanalabs-thunk are installed -$ pip list | habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml, habana-media-loader and habana_quantization_toolkit are installed +$ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed +$ pip list | grep neural # verify that neural-compressor is installed ``` Refer to [Intel Gaudi Software Stack diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index 7af291d62efc6..ddbac022a8d9d 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -26,7 +26,8 @@ To verify that the Intel Gaudi software was correctly installed, run: $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core and habanalabs-thunk are installed - $ pip list | habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml, habana-media-loader and habana_quantization_toolkit are installed + $ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed + $ pip list | grep neural # verify that neural_compressor is installed Refer to `Intel Gaudi Software Stack Verification `__ diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 33b6e2e538b13..7a867e79b203d 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -12,6 +12,8 @@ AttentionMetadata, AttentionType) from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention, HabanaPagedAttentionMetadata) +from vllm.hpu import cache_ops +from vllm.hpu.utils import Matmul, Softmax, VLLMKVCache from vllm.logger import init_logger logger = init_logger(__name__) @@ -108,7 +110,7 @@ def __post_init__(self): self.attn_bias: 
Optional[torch.Tensor] = None -class HabanaAttentionImpl(AttentionImpl): +class HabanaAttentionImpl(AttentionImpl, torch.nn.Module): """ If the input tensors contain prompt tokens, the layout is as follows: |<--------------- num_prefill_tokens ----------------->| @@ -137,10 +139,16 @@ def __init__( blocksparse_params: Optional[Dict[str, Any]] = None, max_seq_len: int = 4096, ) -> None: + super(AttentionImpl, self).__init__() self.kv_cache_dtype = kv_cache_dtype self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) + self.matmul_qk = Matmul() + self.softmax = Softmax() + self.matmul_av = Matmul() + self.k_cache = VLLMKVCache() + self.v_cache = VLLMKVCache() self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads self.sliding_window = sliding_window self.position_bias = None @@ -204,9 +212,13 @@ def forward( # Reshape the input keys and values and store them in the cache. # If kv_cache is not provided, the new key and value tensors are # not cached. This happens during the initial memory profiling run. - HabanaPagedAttention.write_to_paged_cache( - key, value, key_cache, value_cache, attn_metadata.slot_mapping, - self.kv_cache_dtype, attn_metadata.is_prompt) + num_kv_cache_passes, num_slots_available, indices, offsets = \ + cache_ops.prepare_to_cache(key_cache, + attn_metadata.slot_mapping) + key_cache = self.k_cache(key, key_cache, num_kv_cache_passes, + num_slots_available, indices, offsets) + value_cache = self.v_cache(value, value_cache, num_kv_cache_passes, + num_slots_available, indices, offsets) if attn_metadata.is_prompt: # Prompt run. @@ -232,6 +244,9 @@ def forward( attn_bias=attn_bias, p=0.0, scale=self.scale, + matmul_qk_op=self.matmul_qk, + softmax_op=self.softmax, + matmul_av_op=self.matmul_av, ) output = out.reshape(batch_size, seq_len, hidden_size) else: @@ -255,7 +270,8 @@ def forward( query, key_cache, value_cache, attn_metadata.block_tables, attn_metadata.seq_lens_tensor, self.kv_cache_dtype, self.num_kv_heads, self.scale, self.position_bias, k_scale, - v_scale) + v_scale, self.matmul_qk, self.softmax, self.matmul_av, + self.k_cache, self.v_cache) # Reshape the output tensor. return output.view(batch_size, seq_len, hidden_size) diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py index 7dd701c7a0cdf..9602886299c47 100644 --- a/vllm/attention/ops/habana_paged_attn.py +++ b/vllm/attention/ops/habana_paged_attn.py @@ -75,6 +75,11 @@ def forward_decode( alibi_slopes: Optional[torch.Tensor], k_scale: float, v_scale: float, + matmul_qk_op, + softmax_op, + matmul_av_op, + k_cache_cls, + v_cache_cls, ) -> torch.Tensor: block_size = value_cache.shape[1] return ops.paged_attention_v1( @@ -88,6 +93,11 @@ def forward_decode( block_size, alibi_slopes, kv_cache_dtype, + matmul_qk_op, + softmax_op, + matmul_av_op, + k_cache_cls, + v_cache_cls, ) @staticmethod diff --git a/vllm/config.py b/vllm/config.py index f16bea16fe646..6acb70ad047b2 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -474,12 +474,13 @@ def _verify_args(self) -> None: def _verify_cache_dtype(self) -> None: if self.cache_dtype == "auto": pass - elif self.cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2"): + elif self.cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2", "fp8_inc"): logger.info( "Using fp8 data type to store kv cache. It reduces the GPU " "memory footprint and boosts the performance. " "Meanwhile, it may cause accuracy drop without a proper " - "scaling factor") + "scaling factor. 
" + "Intel Gaudi (HPU) supports fp8 (using fp8_inc).") else: raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") @@ -600,11 +601,12 @@ class LoadConfig: ignore_patterns: The list of patterns to ignore when loading the model. Default to "original/**/*" to avoid repeated loading of llama's checkpoints. - + device: Device on which weights are loaded. """ load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO download_dir: Optional[str] = None + device: Optional[str] = None model_loader_extra_config: Optional[Union[str, dict]] = field( default_factory=dict) ignore_patterns: Optional[Union[List[str], str]] = None diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e4b223a1b505f..d6c544750afea 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -38,6 +38,7 @@ class EngineArgs: trust_remote_code: bool = False download_dir: Optional[str] = None load_format: str = 'auto' + weights_load_device: Optional[str] = None dtype: str = 'auto' kv_cache_dtype: str = 'auto' quantization_param_path: Optional[str] = None @@ -205,6 +206,11 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'section for more information.\n' '* "bitsandbytes" will load the weights using bitsandbytes ' 'quantization.\n') + parser.add_argument("--weights-load-device", + type=str, + default=EngineArgs.weights_load_device, + choices=["cuda", "neuron", "hpu", "cpu"], + help='Device on which weights are loaded.') parser.add_argument( '--dtype', type=str, @@ -223,11 +229,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument( '--kv-cache-dtype', type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], + choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3', 'fp8_inc'], default=EngineArgs.kv_cache_dtype, help='Data type for kv cache storage. If "auto", will use model ' 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') + 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3). 
' + 'Intel Gaudi (HPU) supports fp8 (using fp8_inc).') parser.add_argument( '--quantization-param-path', type=nullable_str, @@ -835,9 +842,12 @@ def create_engine_config(self, ) -> EngineConfig: self.model_loader_extra_config[ "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path + device = device_config.device if self.weights_load_device is None else \ + self.weights_load_device load_config = LoadConfig( load_format=self.load_format, download_dir=self.download_dir, + device=device, model_loader_extra_config=self.model_loader_extra_config, ignore_patterns=self.ignore_patterns, ) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 3f7e0a7a4dc53..f8b9c48bc9589 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -182,7 +182,7 @@ def __init__( "download_dir=%r, load_format=%s, tensor_parallel_size=%d, " "pipeline_parallel_size=%d, " "disable_custom_all_reduce=%s, quantization=%s, " - "enforce_eager=%s, kv_cache_dtype=%s, " + "weights_load_device=%s, enforce_eager=%s, kv_cache_dtype=%s, " "quantization_param_path=%s, device_config=%s, " "decoding_config=%r, observability_config=%r, " "seed=%d, served_model_name=%s, use_v2_block_manager=%s, " @@ -206,6 +206,7 @@ def __init__( parallel_config.pipeline_parallel_size, parallel_config.disable_custom_all_reduce, model_config.quantization, + load_config.device, model_config.enforce_eager, cache_config.cache_dtype, model_config.quantization_param_path, @@ -853,6 +854,9 @@ def _process_model_outputs( request_outputs.append(request_output) return request_outputs + def finish_measurements(self): + self.model_executor.finish_measurements() + def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 62309ed345b1d..fc9f118ff14b2 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -173,6 +173,9 @@ def set_tokenizer( self.llm_engine.tokenizer.tokenizer = get_cached_tokenizer( tokenizer) + def finish_measurements(self): + self.llm_engine.finish_measurements() + @overload # LEGACY: single (prompt + optional token ids) def generate( self, diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index f5cf26b687053..80f8037a2d043 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -90,6 +90,9 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: msg = f"init_cache_engine took {cache_init_m.get_summary_string()}" logger.info(msg) + def finish_measurements(self): + self.driver_worker.finish_measurements() + def execute_model( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: @@ -180,6 +183,12 @@ def check_health(self) -> None: # it's running. 
return + def shutdown(self) -> None: + self.driver_worker.shutdown_inc() + + def __del__(self): + self.shutdown() + class HabanaExecutorAsync(HabanaExecutor, ExecutorAsyncBase): diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index 9e0a89cbeb8aa..17e3414a96b57 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -237,6 +237,9 @@ def _driver_execute_model( return self.driver_worker.execute_method("execute_model", execute_model_req) + def finish_measurements(self): + self._run_workers("finish_measurements") + def execute_model( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index 14824945aa53a..98f109accea06 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -43,6 +43,37 @@ def reshape_and_cache(key, value[start_idx:end_idx]) +def prepare_to_cache(cache, slot_mapping): + num_blocks = cache.size(0) + block_size = cache.size(1) + slot_mapping = slot_mapping.flatten() + indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + offsets = torch.fmod(slot_mapping, block_size) + num_slots_requested = slot_mapping.size(0) + num_slots_available = num_blocks * block_size + # NOTE(kzawora): HPU PT bridge crashes with + # RuntimeError: Invalid inputs for scatter_nd_onnx + # on index_put when num_slots_requested > num_slots_available. + # This case might occur when we have little kv cache blocks and + # lots of padding, or are doing warmup. + # This loop is a workaround for this issue. Please remove it + # once key_cache.index_put_(indices, offsets), key) works. + num_kv_cache_passes = torch.div(num_slots_requested, + num_slots_available).ceil().int().item() + + return num_kv_cache_passes, num_slots_available, indices, offsets + + +def insert_or_update_cache(input, cache, num_kv_cache_passes, + num_slots_available, block_indices, block_offsets): + for i in range(num_kv_cache_passes): + start_idx = i * num_slots_available + end_idx = (i + 1) * num_slots_available + cache.index_put_((block_indices[start_idx:end_idx], + block_offsets[start_idx:end_idx]), + input[start_idx:end_idx]) + + def swap_blocks(src, dst, block_mapping): index_src = torch.zeros((1, ), dtype=torch.int32, device=src.device) index_dst = torch.zeros((1, ), dtype=torch.int32, device=dst.device) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index c8f00c1cbd59d..23f6964723d3f 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -11,7 +11,6 @@ import torch import torch.nn.functional as F -import vllm.hpu.utils as hpu_utils from vllm.logger import init_logger logger = init_logger(__name__) @@ -33,7 +32,6 @@ def fetch_from_cache(cache, blocks, permutations): ] -@hpu_utils.with_mark_steps def paged_attention_v1(query, key_cache, value_cache, @@ -43,7 +41,12 @@ def paged_attention_v1(query, context_lens, block_size, alibi_slopes=None, - kv_cache_dtype=None) -> None: + kv_cache_dtype=None, + matmul_qk_op=torch.matmul, + softmax_op=torch.softmax, + matmul_av_op=torch.matmul, + k_cache_cls=None, + v_cache_cls=None) -> None: seq_len = block_tables.size(1) batch_size, query_heads, _ = query.shape _, _, kv_heads, _ = key_cache.shape @@ -56,19 +59,23 @@ def paged_attention_v1(query, batch_size, 1, 1, -1)) query.mul_(scale) query = query.unsqueeze(-2) - keys = fetch_from_cache(key_cache, block_tables, (0, 2, 3, 1)) + fetch_keys = fetch_from_cache if k_cache_cls is None else \ + k_cache_cls.fetch_from_cache + keys = fetch_keys(key_cache, block_tables, (0, 2, 3, 
1)) if query_heads != kv_heads: query = query.unflatten(1, (kv_heads, -1)) keys = [k.unflatten(1, (kv_heads, 1)) for k in keys] mask = mask.unsqueeze(2) - attn_weights = torch.cat([torch.matmul(query, k) for k in keys], dim=-1) + attn_weights = torch.cat([matmul_qk_op(query, k) for k in keys], dim=-1) if alibi_slopes is not None: attn_weights.add_(alibi_slopes[:, :, -attn_weights.size(2):, -attn_weights.size(3):]) - attn_weights = (attn_weights.masked_fill(mask, min_inf).softmax(dim=-1)) + attn_weights = softmax_op(attn_weights.masked_fill(mask, min_inf), dim=-1) - values = fetch_from_cache(value_cache, block_tables, (0, 2, 1, 3)) + fetch_values = fetch_from_cache if v_cache_cls is None else \ + v_cache_cls.fetch_from_cache + values = fetch_values(value_cache, block_tables, (0, 2, 1, 3)) if PA_SPLIT_VALUE: attn_weights = attn_weights.split(block_size, dim=-1) else: @@ -76,7 +83,7 @@ def paged_attention_v1(query, attn_weights = [attn_weights] if query_heads != kv_heads: values = [v.unflatten(1, (kv_heads, 1)) for v in values] - attn_weights = [torch.matmul(a, v) for a, v in zip(attn_weights, values)] + attn_weights = [matmul_av_op(a, v) for a, v in zip(attn_weights, values)] if query_heads != kv_heads: attn_weights = [a.flatten(1, 2) for a in attn_weights] attn_weights = sum(attn_weights) @@ -119,7 +126,6 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): return final_hidden_states.view(-1, D) -@hpu_utils.with_mark_steps def prompt_attention( query: torch.Tensor, key: torch.Tensor, @@ -127,6 +133,9 @@ def prompt_attention( attn_bias: Optional[torch.Tensor] = None, p: float = 0.0, scale: Optional[float] = None, + matmul_qk_op=torch.matmul, + softmax_op=torch.softmax, + matmul_av_op=torch.matmul, ) -> torch.Tensor: query = query.transpose(1, 2) key = key.transpose(1, 2) @@ -139,11 +148,11 @@ def prompt_attention( value = value.unflatten(1, (kv_heads, 1)) if attn_bias is not None: attn_bias = attn_bias.unsqueeze(2) - attn_weights = torch.matmul(query * scale, key.transpose(-1, -2)) + attn_weights = matmul_qk_op(query * scale, key.transpose(-1, -2)) if attn_bias is not None: attn_weights.add_(attn_bias) - attn_weights = torch.softmax(attn_weights, dim=-1) - attn_weights = torch.matmul(attn_weights, value) + attn_weights = softmax_op(attn_weights, dim=-1) + attn_weights = matmul_av_op(attn_weights, value) if query_heads != kv_heads: attn_weights = attn_weights.flatten(1, 2) attn_weights = attn_weights.transpose(1, 2) diff --git a/vllm/hpu/utils.py b/vllm/hpu/utils.py index b7b435c50c295..3d9c7cb1c4c22 100644 --- a/vllm/hpu/utils.py +++ b/vllm/hpu/utils.py @@ -8,6 +8,9 @@ from functools import wraps import habana_frameworks.torch as htorch +import torch + +from vllm.hpu.cache_ops import insert_or_update_cache def with_mark_steps(fn): @@ -22,3 +25,40 @@ def wrapped(*args, **kwargs): return result return wrapped + + +class Matmul(torch.nn.Module): + + def __init__(self): + super(Matmul, self).__init__() + + def forward(self, x, y): + return torch.matmul(x, y) + + +class Softmax(torch.nn.Module): + + def __init__(self): + super().__init__() + + def forward(self, x, dim=None, inv_head=None): + return torch.softmax(x, dim) + + +class VLLMKVCache(torch.nn.Module): + + def __init__(self): + super(VLLMKVCache, self).__init__() + + def forward(self, input, cache, num_kv_cache_passes, num_slots_available, + block_indices, block_offset): + insert_or_update_cache(input, cache, num_kv_cache_passes, + num_slots_available, block_indices, + block_offset) + return cache + + def fetch_from_cache(self, 
cache, blocks, permutations): + return [ + cache.index_select(0, blocks[:, i]).permute(permutations) + for i in range(blocks.size(1)) + ] diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 55cbbabd7da44..c12668c14887d 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -79,18 +79,15 @@ def forward_hpu( if HPUFusedRMSNorm is None: return self.forward_native(x, residual) if residual is not None: - orig_dtype = x.dtype orig_shape = x.shape residual += x.view(residual.shape) # Note: HPUFusedRMSNorm requires 3D tensors as inputs - x = HPUFusedRMSNorm.apply(residual.float(), self.weight.float(), + x = HPUFusedRMSNorm.apply(residual, self.weight, self.variance_epsilon) - return x.to(orig_dtype).view(orig_shape), residual + return x.view(orig_shape), residual - orig_dtype = x.dtype - x = HPUFusedRMSNorm.apply(x.float(), self.weight.float(), - self.variance_epsilon) - return x.to(orig_dtype) + x = HPUFusedRMSNorm.apply(x, self.weight, self.variance_epsilon) + return x def forward_xpu( self, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index b6e280ae65049..10c8a95f838da 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -273,6 +273,7 @@ def __init__(self, quant_config, prefix) self.gather_output = gather_output + self.collective_func = tensor_model_parallel_all_gather # Divide the weight matrix along the last dimension. tp_size = get_tensor_model_parallel_world_size() @@ -334,7 +335,7 @@ def forward(self, input_): output_parallel = self.quant_method.apply(self, input_, bias) if self.gather_output: # All-gather across the partitions. - output = tensor_model_parallel_all_gather(output_parallel) + output = self.collective_func(output_parallel) else: output = output_parallel output_bias = self.bias if self.skip_bias_add else None @@ -723,6 +724,7 @@ def __init__(self, self.input_is_parallel = input_is_parallel self.reduce_results = reduce_results + self.collective_func = tensor_model_parallel_all_reduce # Divide the weight matrix along the last dimension. self.tp_rank = get_tensor_model_parallel_rank() @@ -770,7 +772,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) - def forward(self, input_): + def resolve_input(self, input_): if self.input_is_parallel: input_parallel = input_ else: @@ -778,6 +780,10 @@ def forward(self, input_): splitted_input = split_tensor_along_last_dim( input_, num_partitions=self.tp_size) input_parallel = splitted_input[tp_rank].contiguous() + return input_parallel + + def forward(self, input_): + input_parallel = self.resolve_input(input_) # Matrix multiply. 
assert self.quant_method is not None diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index bd574512e3431..7590d3e980275 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -18,6 +18,7 @@ GPTQMarlinConfig) from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( GPTQMarlin24Config) +from vllm.model_executor.layers.quantization.inc import INCConfig from vllm.model_executor.layers.quantization.marlin import MarlinConfig from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig @@ -37,6 +38,7 @@ "squeezellm": SqueezeLLMConfig, "compressed-tensors": CompressedTensorsConfig, "bitsandbytes": BitsAndBytesConfig, + "inc": INCConfig, } diff --git a/vllm/model_executor/layers/quantization/inc.py b/vllm/model_executor/layers/quantization/inc.py new file mode 100644 index 0000000000000..f6718ec2ac9e7 --- /dev/null +++ b/vllm/model_executor/layers/quantization/inc.py @@ -0,0 +1,115 @@ +from typing import Any, Dict, List, Optional + +import torch +import torch.nn.functional as F +from torch.nn.parameter import Parameter + +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.utils import set_weight_attrs + +ACTIVATION_SCHEMES = ["static", "dynamic"] + +logger = init_logger(__name__) + + +class INCConfig(QuantizationConfig): + """Config class for FP8.""" + + def __init__( + self, + is_checkpoint_fp8_serialized: bool = False, + activation_scheme: str = "dynamic", + ) -> None: + self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized + if is_checkpoint_fp8_serialized: + logger.warning("Detected fp8 checkpoint. Please note that the " + "format is experimental and subject to change.") + if activation_scheme not in ACTIVATION_SCHEMES: + raise ValueError( + f"Unsupported activation scheme {activation_scheme}") + self.activation_scheme = activation_scheme + + @classmethod + def get_name(cls) -> str: + return "inc" + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.bfloat16] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "INCConfig": + quant_method = cls.get_from_keys(config, ["quant_method"]) + is_checkpoint_fp8_serialized = ("fp8" in quant_method) + activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) + return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized, + activation_scheme=activation_scheme) + + def get_quant_method(self, layer: torch.nn.Module, + prefix: str) -> Optional["INCLinearMethod"]: + if isinstance(layer, LinearBase): + return INCLinearMethod(self) + return None + + def get_scaled_act_names(self) -> List[str]: + return [] + + @classmethod + def get_min_capability(cls) -> int: + # The AWQ kernel only supports Turing or newer GPUs. + return 75 + + @staticmethod + def get_config_filenames() -> List[str]: + return [] + + +class INCLinearMethod(LinearMethodBase): + """Linear method for FP8. + Supports loading FP8 checkpoints with static weight scale and + dynamic/static activation scale. + Also supports loading quantized FP16/BF16 model checkpoints with dynamic + activation scaling. The weight scaling factor will be initialized after + the model weights are loaded. + Limitations: + 1. Only support per-tensor quantization due to torch._scaled_mm support. + 2. 
Only support float8_e4m3fn data type due to the limitation of + torch._scaled_mm (https://github.com/pytorch/pytorch/blob/2e48b39603411a41c5025efbe52f89560b827825/aten/src/ATen/native/cuda/Blas.cpp#L854-L856) + + Args: + quant_config: The quantization config. + """ + + def __init__(self, + quant_config: INCConfig, + separate_bias_add: bool = False): + self.separate_bias_add = separate_bias_add + self.quant_config = quant_config + + def create_weights(self, layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], input_size: int, + output_size: int, params_dtype: torch.dtype, + **extra_weight_attrs): + output_size_per_partition = sum(output_partition_sizes) + weight = Parameter(torch.empty(output_size_per_partition, + input_size_per_partition, + dtype=params_dtype), + requires_grad=False) + set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + layer.register_parameter("weight", weight) + set_weight_attrs(weight, extra_weight_attrs) + + def apply(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + weight = layer.weight + if self.separate_bias_add: + if bias is not None: + return F.linear(x, weight) + bias + return F.linear(x, weight) + return F.linear(x, weight, bias) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index bbe49655020da..06048d97088e1 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -37,7 +37,7 @@ supports_vision) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils import is_tpu +from vllm.utils import is_hpu, is_tpu logger = init_logger(__name__) @@ -48,14 +48,15 @@ def _get_quantization_config( """Get the quantization config.""" if model_config.quantization is not None: quant_config = get_quant_config(model_config, load_config) - capability = current_platform.get_device_capability() - capability = capability[0] * 10 + capability[1] - if capability < quant_config.get_min_capability(): - raise ValueError( - f"The quantization method {model_config.quantization} is not " - "supported for the current GPU. " - f"Minimum capability: {quant_config.get_min_capability()}. " - f"Current capability: {capability}.") + if not is_hpu(): + capability = current_platform.get_device_capability() + capability = capability[0] * 10 + capability[1] + if capability < quant_config.get_min_capability(): + raise ValueError( + f"The quantization method {model_config.quantization} " + "is not supported for the current GPU. " + f"Minimum capability: {quant_config.get_min_capability()}. 
" + f"Current capability: {capability}.") supported_dtypes = quant_config.get_supported_act_dtypes() if model_config.dtype not in supported_dtypes: raise ValueError( @@ -276,10 +277,11 @@ def load_model(self, *, model_config: ModelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module: with set_default_torch_dtype(model_config.dtype): - with torch.device(device_config.device): + with torch.device(self.load_config.device): model = _initialize_model(model_config, self.load_config, lora_config, multimodal_config, cache_config, scheduler_config) + logger.info("Loading weights on %s ...", self.load_config.device) model.load_weights( self._get_weights_iterator(model_config.model, model_config.revision, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 306d22e42ed1d..676a51ce67f96 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -48,6 +48,7 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors, SamplerOutput from vllm.utils import is_hip @@ -317,6 +318,9 @@ def forward( hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] + if current_platform.is_hpu(): + import habana_frameworks.torch as htorch + htorch.core.mark_step() for i in range(self.start_layer, self.end_layer): layer = self.layers[i] hidden_states, residual = layer( @@ -326,6 +330,8 @@ def forward( attn_metadata, residual, ) + if current_platform.is_hpu(): + htorch.core.mark_step() if not get_pp_group().is_last_rank: return IntermediateTensors({ diff --git a/vllm/utils.py b/vllm/utils.py index 8a1bc5de03eb7..fe84253feb172 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -39,6 +39,7 @@ "fp8": torch.uint8, "fp8_e4m3": torch.uint8, "fp8_e5m2": torch.uint8, + "fp8_inc": torch.float8_e4m3fn, } TORCH_DTYPE_TO_NUMPY_DTYPE = { diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 93be2f4c321fe..ec0b8c2369210 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -91,9 +91,11 @@ def _allocate_kv_cache( # null block in CpuGpuBlockAllocator requires at least that # block to be zeroed-out. # We zero-out everything for simplicity. 
+ dtype = torch.uint8 if self.dtype == torch.float8_e4m3fn else \ + self.dtype kv_cache.append( torch.zeros(kv_cache_shape, - dtype=self.dtype, + dtype=dtype, pin_memory=pin_memory, device=device)) return kv_cache diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index cf91c69069ed6..72aba42ae8553 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -182,8 +182,8 @@ def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, def forward(self, *args, **kwargs): kwargs = kwargs.copy() selected_token_indices = kwargs.pop('selected_token_indices') - if 'bypass_hpu_graphs' in kwargs: - kwargs.pop('bypass_hpu_graphs') # required for PT eager + if 'warmup_mode' in kwargs: + kwargs.pop('warmup_mode') input_ids = kwargs['input_ids'] kwargs['attn_metadata'] = self._set_attn_bias(kwargs['attn_metadata'], input_ids.size(0), @@ -413,6 +413,9 @@ def __init__( self._setup_buckets() def load_model(self) -> None: + import habana_frameworks.torch.core as htcore + if self.model_config.quantization == 'inc': + htcore.hpu_set_env() with HabanaMemoryProfiler() as m: with HabanaMemoryProfiler() as m_getmodel: self.model = get_model( @@ -429,6 +432,26 @@ def load_model(self) -> None: f"took {m_getmodel.get_summary_string()}") logger.info(msg) + if self.model_config.quantization == 'inc': + logger.info("Preparing model with INC..") + with HabanaMemoryProfiler() as m_inc: + from neural_compressor.torch.quantization import ( + FP8Config, convert, prepare) + config = FP8Config.from_json_file( + os.getenv("QUANT_CONFIG", "")) + if config.measure: + self.model = prepare(self.model, config) + elif config.quantize: + self.model = convert(self.model, config) + htcore.hpu_initialize(self.model, + mark_only_scales_as_const=True) + logger.info("Preparing model with INC took %s", + m_inc.get_summary_string()) + else: + self.model = self.model.to("hpu") + htcore.mark_step() + torch.hpu.synchronize() + # FIXME: Running with disable_tensor_cache=True causes # RuntimeErrors. This needs to be debugged with HabanaMemoryProfiler() as m_wrap: @@ -1051,7 +1074,7 @@ def warmup_scenario(self, batch_size, seq_len, is_prompt, torch.hpu.synchronize() for _ in range(times): inputs = self.prepare_model_input(seqs) - self.execute_model(inputs, kv_caches) + self.execute_model(inputs, kv_caches, warmup_mode=True) torch.hpu.synchronize() self.profiler.end() gc.collect() @@ -1362,6 +1385,10 @@ def prepare_model_input( is_prompt=is_prompt, virtual_engine=virtual_engine) + def finish_measurements(self): + from neural_compressor.torch.quantization import finalize_calibration + finalize_calibration(self.model.model) + @torch.inference_mode() def execute_model( self, @@ -1369,6 +1396,7 @@ def execute_model( kv_caches: List[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, + warmup_mode=False, ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: if num_steps > 1: raise ValueError( @@ -1402,6 +1430,11 @@ def execute_model( } if multi_modal_input is not None: execute_model_kwargs.update(multi_modal_input) + if htorch.utils.internal.is_lazy(): + execute_model_kwargs.update({ + "bypass_hpu_graphs": not use_graphs, + "warmup_mode": warmup_mode + }) htorch.core.mark_step() if self.is_driver_worker: @@ -1415,9 +1448,8 @@ def execute_model( with self.profiler.record_event('internal', model_event_name): hidden_states = self.model.forward( **execute_model_kwargs, - selected_token_indices=sampling_metadata. 
- selected_token_indices, - bypass_hpu_graphs=not use_graphs) + selected_token_indices=sampling_metadata.selected_token_indices + ) # Compute the logits. with self.profiler.record_event( @@ -1459,3 +1491,16 @@ def execute_model( is_prompt=is_prompt) self.profiler.record_counter(self.event_start, counters) return [output] + + def shutdown_inc(self): + print('inc shutdown') + if (model_config := getattr(self, "model_config", None)) and \ + getattr(model_config, "quantization", None) == 'inc': + print('inc shutdown start') + from neural_compressor.torch.quantization import ( + finalize_calibration) + finalize_calibration(self.model.model) + print('inc shutdown') + + def __del__(self): + self.shutdown_inc() diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index f3fdc4dcc63c6..87122c03d3c8f 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -91,6 +91,16 @@ def __init__( # Initialize gpu_cache as embedding models don't initialize kv_caches self.hpu_cache: Optional[List[List[torch.tensor]]] = None + def _set_env_vars(self): + local_rank = self.local_rank + if self.parallel_config.world_size == 1: + local_rank = -1 + import os + os.environ["LOCAL_RANK"] = str(local_rank) + os.environ["ID"] = str(local_rank) + os.environ["WORLD_SIZE"] = str(self.parallel_config.world_size) + os.environ["RANK"] = str(self.rank) + def init_device(self) -> None: if self.device_config.device.type == "hpu": self.device = torch.device("hpu") @@ -99,6 +109,8 @@ def init_device(self) -> None: raise RuntimeError( f"Not support device type: {self.device_config.device}") # Initialize the distributed environment. + if self.model_config.quantization == 'inc': + self._set_env_vars() init_worker_distributed_environment(self.parallel_config, self.rank, self.distributed_init_method, self.local_rank) @@ -211,6 +223,9 @@ def _warm_up_model(self) -> None: # the model initialization and profiling. set_random_seed(self.model_config.seed) + def finish_measurements(self): + self.model_runner.finish_measurements() + @property def do_metadata_broadcast(self) -> bool: return self.parallel_config.tensor_parallel_size > 1 @@ -288,6 +303,12 @@ def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: def list_prompt_adapters(self) -> Set[int]: raise NotImplementedError("LoRA is not implemented for HPU backend.") + def shutdown_inc(self): + self.model_runner.shutdown_inc() + + def __del__(self): + self.shutdown_inc() + @property def max_model_len(self) -> int: return self.model_config.max_model_len From 8185d760325a7699c5c07f7cd0e28d443a36051b Mon Sep 17 00:00:00 2001 From: Mohit Deopujari Date: Sun, 18 Aug 2024 23:30:38 -0700 Subject: [PATCH 128/341] [Doc][BugFix] Update setup instructions and reference links (#191) 1. Replaced the non-working setup instruction with the correct command. 2. Fixed broken links and updated references in documentation. --- README_GAUDI.md | 6 +++--- .../getting_started/gaudi-installation.rst | 17 ++++------------- docs/source/getting_started/quickstart.rst | 2 +- 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index 9ea30a2e43f69..91bcbe49405eb 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -62,16 +62,16 @@ following: $ git clone https://github.com/HabanaAI/vllm-fork.git $ cd vllm-fork $ git checkout habana_main -$ python setup.py develop +$ pip install -e . 
``` Supported Features ================== - [Offline batched - inference](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#offline-batched-inference) + inference](https://github.com/HabanaAI/vllm-fork/blob/habana_main/docs/source/getting_started/quickstart.rst#offline-batched-inference) - Online inference via [OpenAI-Compatible - Server](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server) + Server](https://github.com/HabanaAI/vllm-fork/blob/habana_main/docs/source/getting_started/quickstart.rst#openai-compatible-server) - HPU autodetection - no need to manually select device within vLLM - Paged KV cache with algorithms enabled for Intel Gaudi accelerators - Custom Intel Gaudi implementations of Paged Attention, KV cache ops, diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index ddbac022a8d9d..b3234d10b3115 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -30,7 +30,7 @@ To verify that the Intel Gaudi software was correctly installed, run: $ pip list | grep neural # verify that neural_compressor is installed Refer to `Intel Gaudi Software Stack -Verification `__ +Verification `__ for more details. Run Docker Image @@ -51,15 +51,6 @@ Use the following commands to run a Docker image: Build and Install vLLM --------------------------- -To build and install vLLM from source, run: - -.. code:: console - - $ git clone https://github.com/vllm-project/vllm.git - $ cd vllm - $ python setup.py develop - - Currently, the latest features and performance optimizations are developed in Gaudi's `vLLM-fork `__ and we periodically upstream them to vLLM main repo. To install latest `HabanaAI/vLLM-fork `__, run the following: .. code:: console @@ -67,16 +58,16 @@ Currently, the latest features and performance optimizations are developed in Ga $ git clone https://github.com/HabanaAI/vllm-fork.git $ cd vllm-fork $ git checkout habana_main - $ python setup.py develop + $ pip install -e . Supported Features ================== - `Offline batched - inference `__ + inference `__ - Online inference via `OpenAI-Compatible - Server `__ + Server `__ - HPU autodetection - no need to manually select device within vLLM - Paged KV cache with algorithms enabled for Intel Gaudi accelerators - Custom Intel Gaudi implementations of Paged Attention, KV cache ops, diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst index 89bdc247c5e8e..8cfde76adf5fa 100644 --- a/docs/source/getting_started/quickstart.rst +++ b/docs/source/getting_started/quickstart.rst @@ -9,7 +9,7 @@ This guide shows how to use vLLM to: * build an API server for a large language model; * start an OpenAI-compatible API server. -Be sure to complete the :ref:`installation instructions ` before continuing with this guide. +Be sure to complete the `Gaudi installation instructions `_ before continuing with this guide. .. 
note:: From f7dd91d88e6b9e68479af0817431949f665507a7 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Mon, 19 Aug 2024 00:46:21 -0700 Subject: [PATCH 129/341] split gptbigcode forward (#194) --- vllm/model_executor/models/gpt_bigcode.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index fc4e13bbb0e68..3ae3c8c8f712c 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -39,6 +39,7 @@ VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors, SamplerOutput from .interfaces import SupportsLoRA @@ -224,9 +225,14 @@ def forward( position_embeds = self.wpe(position_ids) hidden_states = inputs_embeds + position_embeds + if current_platform.is_hpu(): + import habana_frameworks.torch as htorch + htorch.core.mark_step() for i in range(len(self.h)): layer = self.h[i] hidden_states = layer(hidden_states, kv_caches[i], attn_metadata) + if current_platform.is_hpu(): + htorch.core.mark_step() hidden_states = self.ln_f(hidden_states) return hidden_states From 275e3250ba6ed8cc13b2d6e4928db73df420e64b Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Mon, 19 Aug 2024 11:43:41 -0700 Subject: [PATCH 130/341] Enable FusedSDPA for prompt attention with VLLM_PROMPT_USE_FUSEDSDPA (#168) --- vllm/attention/backends/habana_attn.py | 29 +++++++++---- vllm/hpu/ops.py | 58 ++++++++++++++++++++------ vllm/worker/habana_model_runner.py | 7 ++-- 3 files changed, 70 insertions(+), 24 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 7a867e79b203d..2259630fa10b7 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -2,6 +2,7 @@ # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### +import os from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Type @@ -166,6 +167,12 @@ def __init__( assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads + self.prefill_usefusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', + '0').lower() in ['1', 'true'] + if self.prefill_usefusedsdpa: + assert alibi_slopes is None, \ + 'Prefill with FusedSDPA not supported with alibi slopes!' + suppored_head_sizes = HabanaPagedAttention.get_supported_head_sizes() if head_size not in suppored_head_sizes: raise ValueError( @@ -223,15 +230,18 @@ def forward( if attn_metadata.is_prompt: # Prompt run. if kv_cache is None or attn_metadata.block_tables.numel() == 0: - # TODO: move this outside of model - assert attn_metadata.attn_bias is not None, \ - 'attn_bias must be set before calling model.forward!' - attn_bias = attn_metadata.attn_bias - if self.alibi_slopes is not None and \ - self.position_bias is not None: - attn_bias.add_(self.position_bias[:, :, - -attn_bias.size(2):, - -attn_bias.size(3):]) + if not self.prefill_usefusedsdpa: + # TODO: move this outside of model + assert attn_metadata.attn_bias is not None, \ + 'attn_bias must be set before calling model.forward!' 
+ attn_bias = attn_metadata.attn_bias + if self.alibi_slopes is not None and \ + self.position_bias is not None: + attn_bias.add_(self.position_bias[:, :, + -attn_bias.size(2):, + -attn_bias.size(3):]) + else: + attn_bias = None query_shape = (batch_size, seq_len, self.num_heads, self.head_size) @@ -247,6 +257,7 @@ def forward( matmul_qk_op=self.matmul_qk, softmax_op=self.softmax, matmul_av_op=self.matmul_av, + valid_seq_lengths=attn_metadata.seq_lens_tensor, ) output = out.reshape(batch_size, seq_len, hidden_size) else: diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 23f6964723d3f..2af5634a8d1a6 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -21,6 +21,13 @@ except ImportError: logger.warning("Could not import HPU FusedRMSNorm kernel. " "vLLM will use forward_native implementation of RMSNorm.") +HPUFusedSDPA = None +try: + from habana_frameworks.torch.hpex.kernels import FusedSDPA + HPUFusedSDPA = FusedSDPA +except ImportError: + logger.warning("Could not import HPU FusedSDPA kernel. " + "vLLM will use native implementation.") PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '1') == '1') @@ -126,6 +133,21 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): return final_hidden_states.view(-1, D) +#TODO: remove after fusedsdpa fix for query_head != kv_head +def repeat_kv(kv: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). + The kv go from (batch, num_key_value_heads, seqlen, head_dim) to + (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = kv.shape + if n_rep == 1: + return kv + kv = kv[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, + head_dim) + return kv.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + def prompt_attention( query: torch.Tensor, key: torch.Tensor, @@ -136,24 +158,36 @@ def prompt_attention( matmul_qk_op=torch.matmul, softmax_op=torch.softmax, matmul_av_op=torch.matmul, + valid_seq_lengths: Optional[torch.Tensor] = None, ) -> torch.Tensor: query = query.transpose(1, 2) key = key.transpose(1, 2) value = value.transpose(1, 2) query_heads = query.size(1) kv_heads = key.size(1) - if query_heads != kv_heads: - query = query.unflatten(1, (kv_heads, -1)) - key = key.unflatten(1, (kv_heads, 1)) - value = value.unflatten(1, (kv_heads, 1)) + if attn_bias is not None or HPUFusedSDPA is None: + if query_heads != kv_heads: + query = query.unflatten(1, (kv_heads, -1)) + key = key.unflatten(1, (kv_heads, 1)) + value = value.unflatten(1, (kv_heads, 1)) + if attn_bias is not None: + attn_bias = attn_bias.unsqueeze(2) + attn_weights = matmul_qk_op(query * scale, key.transpose(-1, -2)) if attn_bias is not None: - attn_bias = attn_bias.unsqueeze(2) - attn_weights = matmul_qk_op(query * scale, key.transpose(-1, -2)) - if attn_bias is not None: - attn_weights.add_(attn_bias) - attn_weights = softmax_op(attn_weights, dim=-1) - attn_weights = matmul_av_op(attn_weights, value) - if query_heads != kv_heads: - attn_weights = attn_weights.flatten(1, 2) + attn_weights.add_(attn_bias) + attn_weights = softmax_op(attn_weights, dim=-1) + attn_weights = matmul_av_op(attn_weights, value) + if query_heads != kv_heads: + attn_weights = attn_weights.flatten(1, 2) + else: + #TODO: remove after fusedsdpa fix for query_heads != kv_heads + if query_heads != kv_heads: + key = repeat_kv(key, int(query_heads // kv_heads)) + value = repeat_kv(value, int(query_heads // kv_heads)) + softmax_mode = 'fast' + recompute_mode = True + 
attn_weights = FusedSDPA.apply(query, key, value, None, 0.0, True, + scale, softmax_mode, recompute_mode, + valid_seq_lengths, 'right') attn_weights = attn_weights.transpose(1, 2) return attn_weights diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 72aba42ae8553..e52b61539b540 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -151,6 +151,9 @@ class HpuModelAdapter(): def __init__(self, model, enforce_eager): self.model = model + self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', + '0').lower() in ['1', 'true'] + if not htorch.utils.internal.is_lazy() and not enforce_eager: self.model = torch.compile(self.model, backend='hpu_backend', @@ -159,7 +162,7 @@ def __init__(self, model, enforce_eager): def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): prefill_metadata = attn_metadata - if prefill_metadata is None: + if prefill_metadata is None or self.prefill_use_fusedsdpa: return attn_metadata seq_lens_t = prefill_metadata.seq_lens_tensor @@ -599,7 +602,6 @@ def _prepare_prompt( # actual prompt lens context_lens.append(context_len) query_lens.append(seq_len - context_len) - input_tokens.append(prompt_tokens) # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first token in the sequence. @@ -672,7 +674,6 @@ def _prepare_prompt( max_prompt_len = max( find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), self.block_size) - input_tokens = make_tensor_with_pad(input_tokens, max_len=max_prompt_len, pad=0, From 55ea6589c7dfa27ca4f07271cf73166971f9f8fe Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Tue, 20 Aug 2024 18:35:38 +0530 Subject: [PATCH 131/341] Enable LoRA support for HPU (#170) This PR enables LoRA support in HPU. * Implemented custom BGMV for LoRA modules using index-select operator. 
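  A minimal, self-contained sketch of the index-select BGMV described in the
  point above (shapes and values are illustrative only; the actual kernels
  added by this PR are `dispatch_bgmv_linear` and `dispatch_bgmv_embedding`
  in `vllm/hpu/ops.py` further down):

  ```python
  import torch

  batch, hidden, rank, out_dim, num_loras = 4, 16, 8, 16, 3
  # One extra zero-valued slot at the end services "no-LoRA" requests.
  wa_t_all = torch.randn(num_loras + 1, 1, rank, hidden)
  wb_t_all = torch.randn(num_loras + 1, 1, out_dim, rank)
  wa_t_all[-1].zero_()
  wb_t_all[-1].zero_()

  x = torch.randn(batch, hidden)
  indices = torch.tensor([0, 2, -1, 1])   # -1 marks a request without LoRA
  indices = indices % (num_loras + 1)     # wrap-around: -1 -> last (zero) slot

  # Gather a per-sample A/B pair and apply the low-rank update as two
  # batched matmuls -- no Python loop over the batch.
  wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2)
  wb = torch.index_select(wb_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2)
  y = ((x.unsqueeze(1) @ wa) @ wb).squeeze(1)   # shape: (batch, out_dim)
  ```

  Because each sample selects its own adapter slice up front, the whole batch
  reduces to two batched matmuls, which is what avoids for-loops and graph
  breaks on HPU.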
* Support for both single and multi card scenarios has been tested --------- Co-authored-by: Himangshu Lahkar <49579433+hlahkar@users.noreply.github.com> Co-authored-by: Himangshu Lahkar --- examples/lora_inference_hpu.py | 47 ++++++ tests/conftest.py | 8 ++ tests/lora/conftest.py | 8 +- tests/lora/test_llama_hpu.py | 100 +++++++++++++ tests/lora/test_lora_hpu.py | 221 +++++++++++++++++++++++++++++ tests/lora/test_multilora_hpu.py | 130 +++++++++++++++++ tests/lora/utils.py | 11 +- vllm/executor/habana_executor.py | 27 ++-- vllm/hpu/ops.py | 75 ++++++++++ vllm/lora/layers.py | 31 +++- vllm/lora/models.py | 44 ++++-- vllm/utils.py | 6 + vllm/worker/habana_model_runner.py | 205 ++++++++++++++++++++------ vllm/worker/habana_worker.py | 29 ++-- 14 files changed, 848 insertions(+), 94 deletions(-) create mode 100644 examples/lora_inference_hpu.py create mode 100644 tests/lora/test_llama_hpu.py create mode 100644 tests/lora/test_lora_hpu.py create mode 100644 tests/lora/test_multilora_hpu.py diff --git a/examples/lora_inference_hpu.py b/examples/lora_inference_hpu.py new file mode 100644 index 0000000000000..b8154a29a82bb --- /dev/null +++ b/examples/lora_inference_hpu.py @@ -0,0 +1,47 @@ +from huggingface_hub import snapshot_download + +from vllm import LLM, SamplingParams +from vllm.lora.request import LoRARequest + +sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") + +llm = LLM(model="meta-llama/Llama-2-7b-hf", + enable_lora=True, + max_num_seqs=2, + dtype='bfloat16') + +sampling_params = SamplingParams(temperature=0, + max_tokens=1024, + stop=["[/assistant]"]) + +prompts = [ + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
[/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 +] + +expected_output = [ + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'Anchero Pantaleone' ", # noqa: E501 + " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 + " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 + " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 + " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 +] + +outputs = llm.generate(prompts, + sampling_params, + lora_request=LoRARequest("sql_adapter", 1, + sql_lora_path)) + +for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + match = expected_output[i] == generated_text + if not match: + print( + f"Comparison failed for request_id::{i}\n\t[PROMPT]{prompt!r}\n\t[GENERATED]{generated_text!r}\n\t[EXPECTED]{expected_output[i]!r}" # noqa: E501 + ) diff --git a/tests/conftest.py b/tests/conftest.py index 59510075b0063..cfb7cf56b519a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -590,9 +590,17 @@ def caplog_vllm(temporary_enable_log_propagate, caplog): yield caplog +def is_hpu(): + from importlib import util + return util.find_spec('habana_frameworks') is not None + + @pytest.fixture(scope="session") def num_gpus_available(): """Get number of GPUs without initializing the CUDA context in current process.""" + if is_hpu(): + return torch.hpu.device_count() + return cuda_device_count_stateless() diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 0bcae5b0c96dc..3e4c8be6dbaa3 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -48,13 +48,19 @@ class ContextInfo(TypedDict): }] +def is_hpu(): + from importlib import util + return util.find_spec('habana_frameworks') is not None + + def cleanup(): destroy_model_parallel() destroy_distributed_environment() with contextlib.suppress(AssertionError): torch.distributed.destroy_process_group() gc.collect() - torch.cuda.empty_cache() + if not is_hpu(): + torch.cuda.empty_cache() ray.shutdown() diff --git a/tests/lora/test_llama_hpu.py b/tests/lora/test_llama_hpu.py new file mode 100644 index 0000000000000..dfd551f2ca043 --- /dev/null +++ b/tests/lora/test_llama_hpu.py @@ -0,0 +1,100 @@ +from multiprocessing import Process +from typing import List + +from conftest import cleanup + +import vllm +from vllm.lora.request import LoRARequest + +MODEL_PATH = "meta-llama/Llama-2-7b-hf" + + +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: + prompts = [ + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When 
Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 + ] + sampling_params = vllm.SamplingParams(temperature=0, + max_tokens=256, + stop=["[/assistant]"]) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) + # Print the outputs. + generated_texts: List[str] = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + return generated_texts + + +def _test_llama_lora(sql_lora_files, tp_size): + llm = vllm.LLM(MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + dtype='float32', + tensor_parallel_size=tp_size) + + expected_no_lora_output = [ + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", # noqa: E501 + "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? 
[/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ", # noqa: E501 + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", # noqa: E501 + ] + expected_lora_output = [ + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 + " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 + " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 + " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 + " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 + ] + + print("lora adapter created") + assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output + + print("lora 1") + assert do_sample(llm, sql_lora_files, lora_id=1) == expected_lora_output + + print("no lora") + assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output + + print("lora 2") + assert do_sample(llm, sql_lora_files, lora_id=2) == expected_lora_output + + print("removing lora") + cleanup() + + +def test_llama_lora_1x(sql_lora_files): + p = Process(target=_test_llama_lora, args=(sql_lora_files, 1)) + p.start() + p.join() + assert p.exitcode == 0 + + +def test_llama_lora_2x(sql_lora_files): + # Work-around to resolve stalling issue in multi-card scenario + p = Process(target=_test_llama_lora, args=(sql_lora_files, 2)) + p.start() + p.join() + assert p.exitcode == 0 + + +def 
test_llama_lora_4x(sql_lora_files): + # Work-around to resolve stalling issue in multi-card scenario + p = Process(target=_test_llama_lora, args=(sql_lora_files, 4)) + p.start() + p.join() + assert p.exitcode == 0 diff --git a/tests/lora/test_lora_hpu.py b/tests/lora/test_lora_hpu.py new file mode 100644 index 0000000000000..ddbab66e166b3 --- /dev/null +++ b/tests/lora/test_lora_hpu.py @@ -0,0 +1,221 @@ +import pytest +import torch + +from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice + +from .utils import DummyLoRAManager + +TENSOR_SIZES = [128, 1024, 2048, 4096, 8192, 11008, 11008 // 2, 11008 // 4] +QKV_TENSOR_SIZES = [ + (8192, 1024, 1024), + (8192 // 8, 1024 // 8, 1024 // 8), + (4096, 4096, 4096), + (4096 // 2, 4096 // 2, 4096 // 2), +] +BATCH_SIZES = [8, 32, 256] +RANKS = [8] +DTYPES = [torch.bfloat16] +TOLERANCES = { + torch.float16: (5e-3, 5e-3), + torch.bfloat16: (3e-2, 2e-2), +} +MAX_LORAS = 8 + + +@pytest.mark.parametrize("m", TENSOR_SIZES) +@pytest.mark.parametrize("n", TENSOR_SIZES) +@pytest.mark.parametrize("k", BATCH_SIZES) +@pytest.mark.parametrize("rank", RANKS) +@pytest.mark.parametrize("dtype", DTYPES) +def test_apply_lora(m, n, k, rank, dtype) -> None: + manager = DummyLoRAManager() + + module_name = "module" + weight = torch.rand([m, n], device="hpu", dtype=dtype) + + manager.init_random_lora(module_name, weight, rank=rank) + lora = manager.get_module_lora(module_name) + + input = torch.rand(k, n, device="hpu", dtype=dtype) + expected = input @ lora.lora_a @ lora.lora_b * lora.scaling + + lora_a_stack = torch.zeros(MAX_LORAS + 1, + 1, + lora.lora_a.shape[1], + lora.lora_a.shape[0], + device="hpu", + dtype=dtype) + lora_b_stack = torch.zeros(MAX_LORAS + 1, + 1, + lora.lora_b.shape[1], + lora.lora_b.shape[0], + device="hpu", + dtype=dtype) + for i in range(MAX_LORAS): + lora_a_stack[i][0] = lora.lora_a.T + lora_b_stack[i][0] = (lora.lora_b * lora.scaling).T + + output = torch.zeros(k, m, device="hpu", dtype=dtype) + _apply_lora(input, lora_a_stack, lora_b_stack, + torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), + output) + rtol, atol = TOLERANCES[dtype] + assert torch.allclose(expected, output, rtol=rtol, atol=atol) + + output[:] = 0 + _apply_lora(input, lora_a_stack, lora_b_stack, + torch.full((len(input), ), -1, device="hpu"), output) + assert torch.allclose(torch.zeros_like(output), output) + + manager.reset_lora() + + +@pytest.mark.parametrize("m", TENSOR_SIZES) +@pytest.mark.parametrize("n", TENSOR_SIZES) +@pytest.mark.parametrize("k", BATCH_SIZES) +@pytest.mark.parametrize("rank", RANKS) +@pytest.mark.parametrize("dtype", DTYPES) +def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: + if m % 2 != 0: + pytest.skip("m must be divisible by 2") + if m // 2 not in TENSOR_SIZES: + pytest.skip("m//2 must be in TENSOR_SIZES") + + manager = DummyLoRAManager() + + module_name = "module" + weight = torch.rand([m // 2, n], device="hpu", dtype=dtype) + + manager.init_random_lora(module_name + "1", weight, rank=rank) + lora_1 = manager.get_module_lora(module_name + "1") + manager.init_random_lora(module_name + "2", weight, rank=rank) + lora_2 = manager.get_module_lora(module_name + "2") + + input = torch.rand(k, n, device="hpu", dtype=dtype) + expected = torch.cat([ + input @ lora_1.lora_a @ lora_1.lora_b * lora_1.scaling, + input @ lora_2.lora_a @ lora_2.lora_b * lora_2.scaling + ], + dim=1) + + lora_a_stacks = [ + torch.zeros(MAX_LORAS + 1, + 1, + lora_1.lora_a.shape[1], + lora_1.lora_a.shape[0], + device="hpu", + dtype=dtype) for i in 
range(2) + ] + lora_b_stacks = [ + torch.zeros(MAX_LORAS + 1, + 1, + lora_1.lora_b.shape[1], + lora_1.lora_b.shape[0], + device="hpu", + dtype=dtype) for i in range(2) + ] + for i in range(MAX_LORAS): + lora_a_stacks[0][i][0] = lora_1.lora_a.T + lora_b_stacks[0][i][0] = (lora_1.lora_b * lora_1.scaling).T + lora_a_stacks[1][i][0] = lora_2.lora_a.T + lora_b_stacks[1][i][0] = (lora_2.lora_b * lora_2.scaling).T + + output = torch.zeros(k, m, device="hpu", dtype=dtype) + _apply_lora_packed_nslice( + input, lora_a_stacks, lora_b_stacks, + torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), output, + (m // 2, m // 2)) + + rtol, atol = TOLERANCES[dtype] + assert torch.allclose(expected, output, rtol=rtol, atol=atol) + + output[:] = 0 + _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, + torch.full((len(input), ), -1, device="hpu"), + output, (m // 2, m // 2)) + assert torch.allclose(torch.zeros_like(output), output) + + manager.reset_lora() + + +@pytest.mark.parametrize("qkv", QKV_TENSOR_SIZES) +@pytest.mark.parametrize("n", TENSOR_SIZES) +@pytest.mark.parametrize("k", BATCH_SIZES) +@pytest.mark.parametrize("rank", RANKS) +@pytest.mark.parametrize("dtype", DTYPES) +def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: + manager = DummyLoRAManager() + + module_name = "module" + weight_q = torch.empty(qkv[0], n, device="hpu", dtype=dtype) + weight_kv = torch.empty(qkv[1], n, device="hpu", dtype=dtype) + + manager.init_random_lora(module_name + "q", weight_q, rank=rank) + lora_q = manager.get_module_lora(module_name + "q") + manager.init_random_lora(module_name + "k", weight_kv, rank=rank) + lora_k = manager.get_module_lora(module_name + "k") + manager.init_random_lora(module_name + "v", weight_kv, rank=rank) + lora_v = manager.get_module_lora(module_name + "v") + + input = torch.rand(k, n, device="hpu", dtype=dtype) + expected = torch.cat([ + input @ lora_q.lora_a @ lora_q.lora_b * lora_q.scaling, + input @ lora_k.lora_a @ lora_k.lora_b * lora_k.scaling, + input @ lora_v.lora_a @ lora_v.lora_b * lora_v.scaling + ], + dim=1) + + lora_a_stacks = [ + torch.zeros(MAX_LORAS + 1, + 1, + lora_q.lora_a.shape[1], + lora_q.lora_a.shape[0], + device="hpu", + dtype=dtype) + ] + [ + torch.zeros(MAX_LORAS + 1, + 1, + lora_k.lora_a.shape[1], + lora_k.lora_a.shape[0], + device="hpu", + dtype=dtype) for i in range(2) + ] + lora_b_stacks = [ + torch.zeros(MAX_LORAS + 1, + 1, + lora_q.lora_b.shape[1], + lora_q.lora_b.shape[0], + device="hpu", + dtype=dtype) + ] + [ + torch.zeros(MAX_LORAS + 1, + 1, + lora_k.lora_b.shape[1], + lora_k.lora_b.shape[0], + device="hpu", + dtype=dtype) for i in range(2) + ] + for i in range(MAX_LORAS): + lora_a_stacks[0][i][0] = lora_q.lora_a.T + lora_b_stacks[0][i][0] = (lora_q.lora_b * lora_q.scaling).T + lora_a_stacks[1][i][0] = lora_k.lora_a.T + lora_b_stacks[1][i][0] = (lora_k.lora_b * lora_k.scaling).T + lora_a_stacks[2][i][0] = lora_v.lora_a.T + lora_b_stacks[2][i][0] = (lora_v.lora_b * lora_v.scaling).T + + output = torch.zeros(k, sum(qkv), device="hpu", dtype=dtype) + _apply_lora_packed_nslice( + input, lora_a_stacks, lora_b_stacks, + torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), output, + (qkv[0], qkv[1], qkv[2])) + + rtol, atol = TOLERANCES[dtype] + assert torch.allclose(expected, output, rtol=rtol, atol=atol) + + output[:] = 0 + _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, + torch.full((len(input), ), -1, device="hpu"), + output, (qkv[0], qkv[1], qkv[2])) + assert torch.allclose(torch.zeros_like(output), output) + + 
manager.reset_lora() diff --git a/tests/lora/test_multilora_hpu.py b/tests/lora/test_multilora_hpu.py new file mode 100644 index 0000000000000..edca64fd5a2ae --- /dev/null +++ b/tests/lora/test_multilora_hpu.py @@ -0,0 +1,130 @@ +from multiprocessing import Process +from typing import List, Optional, Tuple + +from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams +from vllm.lora.request import LoRARequest + + +def create_test_prompts( + lora_path: str +) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]: + """Create a list of test prompts with their sampling parameters. + + 2 requests for base model, 4 requests for the LoRA. We define 2 + different LoRA adapters (using the same model for demo purposes). + """ + return [ + ("A robot may not injure a human being", + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128), None), + ("To be or not to be,", + SamplingParams(temperature=0.8, + top_k=5, + presence_penalty=0.2, + max_tokens=128), None), + ( + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128, + stop_token_ids=[32003]), + LoRARequest("sql-lora", 1, lora_path)), + ( + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 + SamplingParams(temperature=0, + max_tokens=128, + stop_token_ids=[32003]), + LoRARequest("sql-lora", 1, lora_path)), + ( + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128, + stop_token_ids=[32003]), + LoRARequest("sql-lora2", 2, lora_path)), + ( + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? 
[/user] [assistant]", # noqa: E501 + SamplingParams(temperature=0, + max_tokens=128, + stop_token_ids=[32003]), + LoRARequest("sql-lora", 1, lora_path)), + ] + + +def process_requests(engine: LLMEngine, + test_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]]): + """Continuously process a list of prompts and handle the outputs.""" + request_id = 0 + result = {} + + while test_prompts or engine.has_unfinished_requests(): + if test_prompts: + prompt, sampling_params, lora_request = test_prompts.pop(0) + engine.add_request(str(request_id), + prompt, + sampling_params, + lora_request=lora_request) + request_id += 1 + + request_outputs: List[RequestOutput] = engine.step() + + for request_output in request_outputs: + if request_output.finished: + result[ + request_output.request_id] = request_output.outputs[0].text + return result + + +expected_output = [ + " or, through inaction, allow a human being to come to harm.\nA robot must obey the orders given it by human beings except where such orders would conflict with the First Law.\nA robot must protect its own existence as long as such protection does not conflict with the First or Second Law.\nThe Three Laws of Robotics were created by Isaac Asimov in 1942. They are the foundation of robotics and artificial intelligence.\nThe Three Laws of Robotics are the foundation of robotics and artificial intelligence. They were created by Isaac Asimov in 194", # noqa: E501 + " that is the question.\nIt is the most famous line in all of Shakespeare's plays and one of the most famous in English literature. The question is not whether or not to be, but rather the question of who to be.\nIn Hamlet's case, the question is whether or not to be a good person. He is torn between the goodness of his father and the evil of his mother.\nThe question is a difficult one, and one that has been asked many times before. 
In Hamlet's case, the question is whether or not to be a good person, and he is torn between the", # noqa: E501 + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' " # noqa: E501 +] + + +def _test_llama_multilora(sql_lora_files, tp_size): + """Main function that sets up and runs the prompt processing.""" + engine_args = EngineArgs(model="meta-llama/Llama-2-7b-hf", + enable_lora=True, + max_loras=2, + max_lora_rank=8, + max_num_seqs=16, + dtype='float32', + tensor_parallel_size=tp_size) + engine = LLMEngine.from_engine_args(engine_args) + test_prompts = create_test_prompts(sql_lora_files) + results = process_requests(engine, test_prompts) + generated_texts = [results[key] for key in sorted(results)] + assert generated_texts == expected_output + + +def test_llama_multilora_1x(sql_lora_files): + # Work-around to resolve stalling issue in multi-card scenario + p = Process(target=_test_llama_multilora, args=(sql_lora_files, 1)) + p.start() + p.join() + assert p.exitcode == 0 + + +def test_llama_multilora_2x(sql_lora_files): + # Work-around to resolve stalling issue in multi-card scenario + p = Process(target=_test_llama_multilora, args=(sql_lora_files, 2)) + p.start() + p.join() + assert p.exitcode == 0 + + +def test_llama_multilora_4x(sql_lora_files): + # Work-around to resolve stalling issue in multi-card scenario + p = Process(target=_test_llama_multilora, args=(sql_lora_files, 4)) + p.start() + p.join() + assert p.exitcode == 0 diff --git a/tests/lora/utils.py b/tests/lora/utils.py index b73cf5bf55324..6ed985e72e6b3 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -3,6 +3,7 @@ import torch from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights +from vllm.utils import get_device class DummyLoRAManager: @@ -28,16 +29,16 @@ def init_random_lora(self, lora_alpha=1, lora_a=torch.rand([weight.shape[1], rank], dtype=weight.dtype, - device="cuda"), + device=get_device()), lora_b=torch.rand([rank, weight.shape[0]], dtype=weight.dtype, - device="cuda"), + device=get_device()), ) if generate_embeddings_tensor: lora.embeddings_tensor = torch.rand(5, generate_embeddings_tensor, dtype=weight.dtype, - device="cuda") + device=get_device()) self.set_module_lora(module_name, lora) return lora @@ -53,8 +54,8 @@ def init_lora(self, module_name, rank=rank, lora_alpha=1, - lora_a=torch.rand([input_dim, rank], device="cuda"), - lora_b=torch.rand([rank, output_dim], device="cuda"), + lora_a=torch.rand([input_dim, rank], device=get_device()), + lora_b=torch.rand([rank, output_dim], device=get_device()), embeddings_tensor=embeddings_tensor, ) self.set_module_lora(module_name, lora) diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index 80f8037a2d043..baeaec5afa371 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -154,29 +154,36 @@ def execute_model( return output def add_lora(self, lora_request: LoRARequest) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." 
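+        # LoRA IDs are 1-based; lora_int_id 0 is reserved for "no LoRA"
+        # requests, so it is rejected here before delegating to the worker.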
+ return self.driver_worker.add_lora(lora_request) def remove_lora(self, lora_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") - - def list_loras(self) -> Set[int]: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + assert lora_id > 0, "lora_id must be greater than 0." + return self.driver_worker.remove_lora(lora_id) def pin_lora(self, lora_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + assert lora_id > 0, "lora_id must be greater than 0." + return self.driver_worker.pin_lora(lora_id) + + def list_loras(self) -> Set[int]: + return self.driver_worker.list_loras() def add_prompt_adapter( self, prompt_adapter_request: PromptAdapterRequest) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def list_prompt_adapters(self) -> Set[int]: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def check_health(self) -> None: # GPUExecutor will always be healthy as long as diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 2af5634a8d1a6..662c53486b4ca 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -191,3 +191,78 @@ def prompt_attention( valid_seq_lengths, 'right') attn_weights = attn_weights.transpose(1, 2) return attn_weights + + +def dispatch_bgmv_linear( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + indices: torch.LongTensor, + layer_idx: int, + scale: float, +): + """ + `wa_t_all` and `wb_t_all` contains all LoRA A and LoRA B weight matrices + stacked into single tensors, assuming same rank. HPU handles no-LoRA + requests using zero valued A and B tensors. These zero valued tensors are + appended at the end of `wa_t_all` and `wb_t_all` during initialization. For + custom BGMV, the corresponding `wa` and `wb` for each batch is created + based on the lora_index of each sample. + + For example: + `wa_t_all` is tensor of shape (num_loras, num_layers, lora_rank, + hidden_dim), where `wa_t_all[-1]` is zero valued tensor which handles + no-LoRA case. The `wa` tensor for a batch of size batch_Size will have + a shape of (batch_size, num_layers, hidden_dim, lora_rank) + + This method avoids for-loop as well as graph breaks. 
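+
+    Concretely, for `x` of shape (batch_size, hidden_dim) the code below
+    gathers a per-sample `wa`/`wb` pair with index_select and computes
+    y += ((x.unsqueeze(1) @ wa) @ wb).squeeze(1) * scale, i.e. the low-rank
+    update is applied to the whole batch as two batched matmuls.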
+ """ + assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' + max_loras = wa_t_all.size(0) + # Wrap-around for negative indices + indices = indices % max_loras + wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) + wb = torch.index_select(wb_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) + + x = x.unsqueeze(1) + out = x @ wa + out = out @ wb + out = out.squeeze(1) + y += out * scale + + +def dispatch_bgmv_embedding( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + indices: torch.LongTensor, + layer_idx: int, + scale: float, +): + """ + `wa_t_all` contains all LoRA A weight matrices stacked into a single tensor + assuming same rank. HPU handles no-LoRA requests using zero valued A + tensor. This zero valued tensor is appended at the end of `wa_t_all` during + initialization. For custom BGMV, the corresponding wa for each batch is + created based on the lora_index of the sample. + + For example: + `wa_t_all` is tensor of shape (num_loras, num_layers, lora_rank, + hidden_dim), where `wa_t_all[-1]` is zero valued tensor which handles + no-LoRA case. The wa tensor for a batch of size batch_Size will have a + shape of (batch_size, num_layers, lora_rank, hidden_dim) + + + This method avoids for-loop as well as graph breaks. + """ + assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' + max_loras = wa_t_all.size(0) + # Wrap-around for negative indices + indices = indices % max_loras + wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) + + x = x.unsqueeze(1) + out = x @ wa + out = out.squeeze(1) + y += out * scale \ No newline at end of file diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 87de285a373a2..4a45f3fda88f1 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -27,6 +27,10 @@ LinearScalingRotaryEmbedding, RotaryEmbedding) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) +from vllm.utils import is_hpu + +if is_hpu(): + from vllm.hpu.ops import dispatch_bgmv_embedding, dispatch_bgmv_linear if TYPE_CHECKING: pass @@ -89,7 +93,11 @@ def _apply_lora( x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) indices = indices.view(-1) - add_lora(output, x, lora_a_stacked, lora_b_stacked, indices, 0, 1.0) + if is_hpu(): + dispatch_bgmv_linear(output, x, lora_a_stacked, lora_b_stacked, + indices, 0, 1.0) + else: + add_lora(output, x, lora_a_stacked, lora_b_stacked, indices, 0, 1.0) return output.view_as(org_output) @@ -127,9 +135,15 @@ def _apply_lora_packed_nslice( indices = indices.view(-1) offset_left = 0 for slice_idx in range(len(output_slices)): - add_lora_slice(output, x, lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], indices, 0, 1.0, offset_left, - output_slices[slice_idx]) + if is_hpu(): + dispatch_bgmv_linear( + output[:, offset_left:offset_left + output_slices[slice_idx]], + x, lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], + indices, 0, 1.0) + else: + add_lora_slice(output, x, lora_a_stacked[slice_idx], + lora_b_stacked[slice_idx], indices, 0, 1.0, + offset_left, output_slices[slice_idx]) offset_left += output_slices[slice_idx] return output.view_as(org_output) @@ -330,8 +344,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings = full_lora_a_embeddings.view( full_lora_a_embeddings.shape[0] * full_lora_a_embeddings.shape[1], -1) - bgmv(full_output, full_lora_a_embeddings, self.lora_b_stacked, - self.indices[:self.indices_len[0]], 0, 1.0) + if is_hpu(): + 
dispatch_bgmv_embedding(full_output, full_lora_a_embeddings, + self.lora_b_stacked, + self.indices[:self.indices_len[0]], 0, 1.0) + else: + bgmv(full_output, full_lora_a_embeddings, self.lora_b_stacked, + self.indices[:self.indices_len[0]], 0, 1.0) return full_output.view_as(full_output_org) @classmethod diff --git a/vllm/lora/models.py b/vllm/lora/models.py index e1ede7d4d710a..30d2fd9502977 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -24,7 +24,7 @@ from vllm.lora.utils import (from_layer, from_layer_logits_processor, parse_fine_tuned_lora_name, replace_submodule) from vllm.model_executor.models.interfaces import SupportsLoRA -from vllm.utils import is_pin_memory_available +from vllm.utils import get_device, is_hpu, is_pin_memory_available logger = init_logger(__name__) @@ -93,7 +93,7 @@ def convert_mapping( long_lora_offsets: Optional[torch.Tensor] = None if long_lora_context: long_lora_offsets = torch.zeros(len(index_mapping_indices), - device="cuda", + device=get_device(), dtype=torch.long) prompt_mapping: List[int] = [ lora_index_to_id.index(x) if x > 0 else -1 @@ -118,9 +118,9 @@ def convert_mapping( if long_lora_context: assert long_lora_offsets is not None indices_list.append(long_lora_offsets) - indices = torch.tensor(indices_list, dtype=torch.long, device="cuda") + indices = torch.tensor(indices_list, dtype=torch.long, device=get_device()) prompt_mapping_tensor = torch.tensor(prompt_mapping, - device="cuda", + device=get_device(), dtype=torch.long) embeddings_indices = torch.stack([ indices[2] * extra_vocab_size, @@ -131,10 +131,10 @@ def convert_mapping( sampler_indices = prompt_mapping_tensor sampler_indices_padded = sampler_indices.clone() sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 - sampler_indices_padded = ( - torch.arange( - 0, len(sampler_indices_padded), device="cuda", dtype=torch.long) + - (sampler_indices_padded * len(sampler_indices_padded))) + sampler_indices_padded = (torch.arange( + 0, len(sampler_indices_padded), device=get_device(), dtype=torch.long) + + (sampler_indices_padded * + len(sampler_indices_padded))) long_lora_indices = None long_lora_indices_len: Optional[int] = None if long_lora_context: @@ -424,20 +424,20 @@ def __init__( self.long_lora_context: Optional[LongContextLoRAContext] = None self.base_indices = torch.empty(self.max_num_batched_tokens, dtype=torch.long, - device="cuda") + device=get_device()) self.sampler_indices = torch.empty(self.max_num_batched_tokens, dtype=torch.long, - device="cuda") + device=get_device()) self.sampler_indices_padded = torch.empty(self.max_num_batched_tokens, dtype=torch.long, - device="cuda") + device=get_device()) self.embeddings_indices = torch.empty(2, self.max_num_batched_tokens, dtype=torch.long, - device="cuda") + device=get_device()) self.long_lora_indices = torch.empty(self.max_num_batched_tokens, dtype=torch.long, - device="cuda") + device=get_device()) # Scaling factor -> offset to the sin_cos_cache to it. # Used for long context lora. self.scaling_factor_to_offset: Dict[float, int] = {} @@ -465,11 +465,25 @@ def __init__( @property def capacity(self) -> int: - return self.lora_config.max_cpu_loras + if is_hpu(): + # HPU handles no LoRA requests using zero valued A and B tensors. + # These zero valued tensors are appended at the end of A and B, + # making total number of loras to be lora_config.max_cpu_loras + 1. 
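+            # (The index wrap-around in dispatch_bgmv_* in vllm/hpu/ops.py
+            # relies on this: a lora index of -1 maps to that trailing
+            # zero-valued slot.)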
+ # This demands the total number of max_cpu_loras to be + # lora_config.max_cpu_loras + 1 + return self.lora_config.max_cpu_loras + 1 + else: + return self.lora_config.max_cpu_loras @property def lora_slots(self) -> int: - return self.lora_config.max_loras + if is_hpu(): + # HPU handles no LoRA requests using zero valued A and B tensors. + # These zero valued tensors are appended at the end of A and B, + # making total number of loras to be lora_config.max_cpu_loras + 1. + return self.lora_config.max_loras + 1 + else: + return self.lora_config.max_loras @property def adapter_slots(self) -> int: diff --git a/vllm/utils.py b/vllm/utils.py index fe84253feb172..fa6e132dd3522 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -970,6 +970,12 @@ def cuda_device_count_stateless() -> int: return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES) +def get_device() -> str: + if is_hpu(): + return "hpu" + return "cuda" + + def error_on_invalid_device_count_status(): cache_entries = 0 with contextlib.suppress(Exception): diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index e52b61539b540..d129bb5cbc0ca 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -435,6 +435,23 @@ def load_model(self) -> None: f"took {m_getmodel.get_summary_string()}") logger.info(msg) + if self.lora_config: + assert hasattr(self.model, "supported_lora_modules" + ) and self.model.supported_lora_modules, ( + "Model does not support LoRA") + assert hasattr(self.model, "embedding_modules" + ), "Model does not have embedding_modules" + assert hasattr( + self.model, "embedding_padding_modules" + ), "Model does not have embedding_padding_modules" + self.lora_manager = LRUCacheWorkerLoRAManager( + self.scheduler_config.max_num_seqs, + self.scheduler_config.max_num_batched_tokens, + self.vocab_size, self.lora_config, self.device, + self.model.embedding_modules, + self.model.embedding_padding_modules) + self.model = self.lora_manager.create_lora_manager(self.model) + if self.model_config.quantization == 'inc': logger.info("Preparing model with INC..") with HabanaMemoryProfiler() as m_inc: @@ -467,35 +484,26 @@ def load_model(self) -> None: msg = f"Loading model weights took in total {m.get_summary_string()}" logger.info(msg) - if self.lora_config: - assert hasattr(self.model, "supported_lora_modules" - ) and self.model.supported_lora_modules, ( - "Model does not support LoRA") - assert hasattr( - self.model, - "embedding_modules"), "Model does not have embedding_modules" - assert hasattr(self.model, "embedding_padding_modules" - ), "Model does not have embedding_padding_modules" - self.lora_manager = LRUCacheWorkerLoRAManager( - self.scheduler_config.max_num_seqs, - self.scheduler_config.max_num_batched_tokens, self.vocab_size, - self.lora_config, self.device, self.model.embedding_modules, - self.model.embedding_padding_modules) - self.model = self.lora_manager.create_lora_manager(self.model) - def _use_graphs(self, batch_size, seq_len, is_prompt): if self.enforce_eager: return False return (batch_size, seq_len, is_prompt) in self.graphed_buckets + def _is_valid_bucket(self, bucket): + return bucket[0] * bucket[1] <= self.max_num_batched_tokens + def _setup_buckets(self) -> None: + max_bucket_cfg = 64 + if self.lora_config and \ + max_bucket_cfg > self.max_num_batched_tokens // self.block_size: + max_bucket_cfg = self.max_num_batched_tokens // self.block_size self.prompt_bs_bucket_cfg = read_bucket_settings('prompt', 'bs', min=1, step=32, max=min( 
self.max_num_seqs, - 64)) + max_bucket_cfg)) self.decode_bs_bucket_cfg = read_bucket_settings('decode', 'bs', min=1, @@ -520,6 +528,12 @@ def _setup_buckets(self) -> None: self.prompt_buckets = warmup_buckets(self.prompt_bs_bucket_cfg, self.prompt_seq_bucket_cfg) + if self.lora_config: + self.prompt_buckets[:] = [ + bucket for bucket in self.prompt_buckets + if self._is_valid_bucket(bucket) + ] + msg = (f"Generated {len(self.prompt_buckets)} " f"prompt buckets: {list(sorted(self.prompt_buckets))}") logger.info(msg) @@ -530,6 +544,11 @@ def _setup_buckets(self) -> None: logger.info(msg) self.decode_buckets = warmup_buckets(self.decode_bs_bucket_cfg, self.decode_seq_bucket_cfg) + if self.lora_config: + self.decode_buckets[:] = [ + bucket for bucket in self.decode_buckets + if self._is_valid_bucket(bucket) + ] msg = (f"Generated {len(self.decode_buckets)} decode buckets: " f"{list(sorted(self.decode_buckets))}") logger.info(msg) @@ -606,16 +625,6 @@ def _prepare_prompt( # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first token in the sequence. input_positions.append(list(range(context_len, seq_len))) - lora_id = seq_group_metadata.lora_int_id - - if lora_id > 0: - lora_requests.add(seq_group_metadata.lora_request) - - lora_index_mapping += [lora_id] * (seq_len - context_len) - lora_prompt_mapping.append( - [lora_id] * - (seq_len - context_len - if seq_group_metadata.sampling_params.prompt_logprobs else 1)) if seq_group_metadata.multi_modal_data: multi_modal_input_list.append( @@ -674,6 +683,20 @@ def _prepare_prompt( max_prompt_len = max( find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), self.block_size) + + for seq_group_metadata, context_len in zip(seq_group_metadata_list, + context_lens): + lora_id = seq_group_metadata.lora_int_id + + if lora_id > 0: + lora_requests.add(seq_group_metadata.lora_request) + + lora_index_mapping += [lora_id] * (max_prompt_len - context_len) + lora_prompt_mapping.extend( + [lora_id] * + (max_prompt_len - context_len + if seq_group_metadata.sampling_params.prompt_logprobs else 1)) + input_tokens = make_tensor_with_pad(input_tokens, max_len=max_prompt_len, pad=0, @@ -1027,7 +1050,11 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: ]) return attention_metadata - def create_dummy_seq_group_metadata(self, group_id, seq_len, is_prompt): + def create_dummy_seq_group_metadata(self, + group_id, + seq_len, + is_prompt, + lora_request=None): sampling_params = SamplingParams(temperature=0) num_blocks = math.ceil(seq_len / self.block_size) if is_prompt: @@ -1042,34 +1069,78 @@ def create_dummy_seq_group_metadata(self, group_id, seq_len, is_prompt): output_token_ids = [1] * output_len seq_data = SequenceData(prompt_token_ids) seq_data.output_token_ids = output_token_ids - return SequenceGroupMetadata( - request_id=str(group_id), - is_prompt=(output_len == 0), - seq_data={group_id: seq_data}, - sampling_params=sampling_params, - block_tables=block_tables, - ) + return SequenceGroupMetadata(request_id=str(group_id), + is_prompt=(output_len == 0), + seq_data={group_id: seq_data}, + sampling_params=sampling_params, + block_tables=block_tables, + lora_request=lora_request) def profile_run(self) -> None: num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers max_batch_size = self.prompt_bs_bucket_cfg[-1] max_seq_len = self.prompt_seq_bucket_cfg[-1] - - self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches) - - def warmup_scenario(self, batch_size, seq_len, 
is_prompt, - kv_caches) -> None: + if self.lora_config: + max_seq_len = self.max_num_batched_tokens // max_batch_size + + self.warmup_scenario(max_batch_size, + max_seq_len, + True, + kv_caches, + is_profile_run=True) + + def warmup_scenario(self, + batch_size, + seq_len, + is_prompt, + kv_caches, + is_profile_run=False) -> None: use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) scenario_name = ("warmup_" f"{'prompt' if is_prompt else 'decode'}_" f"bs{batch_size}_" f"seq{seq_len}_" f"graphs{'T' if use_graphs else 'F'}") + max_num_seqs = self.scheduler_config.max_num_seqs + # This represents the maximum number of different requests + # that will have unique loras, an therefore the max amount of memory + # consumption create dummy lora request copies from the lora request + # passed in, which contains a lora from the lora warmup path. + dummy_lora_requests: List[LoRARequest] = [] + dummy_lora_requests_per_seq: List[LoRARequest] = [] + if self.lora_config and is_profile_run: + assert self.lora_manager is not None + with self.lora_manager.dummy_lora_cache(): + for idx in range(self.lora_config.max_loras): + lora_id = idx + 1 + dummy_lora_request = LoRARequest( + lora_name=f"warmup_{lora_id}", + lora_int_id=lora_id, + lora_local_path="/not/a/real/path", + ) + self.lora_manager.add_dummy_lora(dummy_lora_request, + rank=LORA_WARMUP_RANK) + dummy_lora_requests.append(dummy_lora_request) + dummy_lora_requests_per_seq = [ + dummy_lora_requests[idx % len(dummy_lora_requests)] + for idx in range(max_num_seqs) + ] self.profiler.start('internal', scenario_name) times = 3 if use_graphs else 1 + if self.lora_config and not is_profile_run: + lora_mapping = LoRAMapping( + [0] * batch_size * seq_len, + [0] * batch_size * seq_len, + ) + self.set_active_loras(set(), lora_mapping) seqs = [ - self.create_dummy_seq_group_metadata(i, seq_len, is_prompt) + self.create_dummy_seq_group_metadata( + i, + seq_len, + is_prompt, + lora_request=dummy_lora_requests_per_seq[i] + if dummy_lora_requests_per_seq else None) for i in range(batch_size) ] torch.hpu.synchronize() @@ -1080,6 +1151,37 @@ def warmup_scenario(self, batch_size, seq_len, is_prompt, self.profiler.end() gc.collect() + def remove_all_loras(self): + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + self.lora_manager.remove_all_adapters() + + def set_active_loras(self, lora_requests: Set[LoRARequest], + lora_mapping: LoRAMapping) -> None: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + self.lora_manager.set_active_adapters(lora_requests, lora_mapping) + + def add_lora(self, lora_request: LoRARequest) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.add_adapter(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.remove_adapter(lora_id) + + def pin_lora(self, lora_id: int) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.pin_adapter(lora_id) + + def list_loras(self) -> Set[int]: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.list_adapters() + def log_warmup(self, phase, i, max_i, batch_size, seq_len): free_mem = format_bytes( HabanaMemoryProfiler.current_free_device_memory()) @@ -1403,9 +1505,11 @@ def execute_model( raise ValueError( "num_steps > 1 is not supported in HabanaModelRunner") - # NOTE(kzawora): Need to restore 
this after adding LoRA - # if self.lora_config: - # self.set_active_loras(lora_requests, lora_mapping) + if self.lora_config: + assert model_input.lora_requests is not None + assert model_input.lora_mapping is not None + self.set_active_loras(model_input.lora_requests, + model_input.lora_mapping) input_tokens = model_input.input_tokens input_positions = model_input.input_positions attn_metadata = model_input.attn_metadata @@ -1452,6 +1556,19 @@ def execute_model( selected_token_indices=sampling_metadata.selected_token_indices ) + if self.lora_config: + from vllm.lora.layers import VocabParallelEmbeddingWithLoRA + property = vars(self.model.model) + model = list(property['_modules'].values())[0] + property = vars(model) + modules = list(property['_modules'].values()) + for module in modules: + if isinstance(module, VocabParallelEmbeddingWithLoRA): + for i in range(0, 4): + module.indices_len[ + i] = sampling_metadata.selected_token_indices.numel( + ) + # Compute the logits. with self.profiler.record_event( 'internal', ('compute_logits_' diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 87122c03d3c8f..9d083915041fe 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -174,9 +174,8 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: num_hpu_blocks = max(num_hpu_blocks, 0) num_cpu_blocks = max(num_cpu_blocks, 0) - # NOTE(kzawora): Restore this once LoRA support is added - # if self.model_runner.lora_manager: - # self.model_runner.remove_all_loras() + if self.model_runner.lora_manager: + self.model_runner.remove_all_loras() gc.collect() return num_hpu_blocks, num_cpu_blocks @@ -279,29 +278,33 @@ def execute_worker(self, worker_input: WorkerInput) -> None: self.cache_engine[virtual_engine].copy(worker_input.blocks_to_copy) def add_lora(self, lora_request: LoRARequest) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + return self.model_runner.add_lora(lora_request) def remove_lora(self, lora_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") - - def list_loras(self) -> Set[int]: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + return self.model_runner.remove_lora(lora_id) def pin_lora(self, lora_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + return self.model_runner.pin_lora(lora_id) + + def list_loras(self) -> Set[int]: + return self.model_runner.list_loras() def add_prompt_adapter( self, prompt_adapter_request: PromptAdapterRequest) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def list_prompt_adapters(self) -> Set[int]: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def shutdown_inc(self): self.model_runner.shutdown_inc() From 1f1e98199cc570baa3f406e3cfff0e4b95ec14d8 Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Wed, 21 
Aug 2024 09:33:09 +0300 Subject: [PATCH 132/341] Handle compile-mode unwrap bug for indices length fix in LoRA --- vllm/worker/habana_model_runner.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index d129bb5cbc0ca..7f7f15bea86fa 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1441,6 +1441,15 @@ def get_counter_dict(self, cache_config, duration, seq_len, return counters +def unwrap_model(model): + if isinstance(model, torch._dynamo.eval_frame.OptimizedModule): + return unwrap_model(model._orig_mod) + else: + model = list(vars(model)['_modules'].values())[0] + modules = list(vars(model)['_modules'].values()) + return modules + + class HabanaModelRunner( HabanaModelRunnerBase[ModelInputForHPUWithSamplingMetadata]): """ @@ -1558,13 +1567,10 @@ def execute_model( if self.lora_config: from vllm.lora.layers import VocabParallelEmbeddingWithLoRA - property = vars(self.model.model) - model = list(property['_modules'].values())[0] - property = vars(model) - modules = list(property['_modules'].values()) + modules = unwrap_model(self.model.model) for module in modules: if isinstance(module, VocabParallelEmbeddingWithLoRA): - for i in range(0, 4): + for i in range(0, len(module.indices_len)): module.indices_len[ i] = sampling_metadata.selected_token_indices.numel( ) From db02be889957ab94897ccc8d95181d8bb422f92a Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Thu, 22 Aug 2024 16:15:26 +0000 Subject: [PATCH 133/341] Add docker hpu for serving Signed-off-by: Chendi.Xue --- Dockerfile.hpu | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 Dockerfile.hpu diff --git a/Dockerfile.hpu b/Dockerfile.hpu new file mode 100644 index 0000000000000..b9acec2b85be4 --- /dev/null +++ b/Dockerfile.hpu @@ -0,0 +1,18 @@ +FROM vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest + +COPY ./ /workspace/vllm + +WORKDIR /workspace/vllm + +RUN pip install -v -r requirements-hpu.txt + +ENV no_proxy=localhost,127.0.0.1 +ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true + +RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install + +WORKDIR /workspace/ + +RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks + +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] \ No newline at end of file From aefd336798248d519ddc4cc5662c9aa03a9dbfad Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 27 Aug 2024 14:42:57 +0200 Subject: [PATCH 134/341] Ensure buckets do not exceed the batch token limit (#206) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR ensures we don't capture buckets that are above the specified token budget (as set by `max_num_batched_tokens` argument) Example for token budget of 2048 (`--max-num-batched-tokens 2048`): ``` $ python vllm_test.py --max-num-batched-tokens 2048 WARNING 08-27 14:48:55 _custom_ops.py:14] Failed to import from vllm._C with ModuleNotFoundError("No module named 'vllm._C'") /usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py:366: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead warnings.warn( No chat template is set for this tokenizer, falling back to a default class-level template. This is very error-prone, because models are often trained with templates different from the class default! 
Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which point any code depending on them will stop working. We recommend setting a valid chat template before then to ensure that this model continues working without issues. INFO 08-27 14:48:56 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='facebook/opt-125m', speculative_config=None, tokenizer='facebook/opt-125m', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, weights_load_device=hpu, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=hpu, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=facebook/opt-125m, use_v2_block_manager=False, enable_prefix_caching=False) generation_config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████| 137/137 [00:00<00:00, 1.91MB/s] INFO 08-27 14:48:57 profiler.py:62] Profiler enabled for: vllm-instance-d356a015eeb349f7a4650e00bf6ce976 WARNING 08-27 14:48:57 utils.py:566] Pin memory is not supported on HPU. INFO 08-27 14:48:57 selector.py:85] Using HabanaAttention backend. INFO 08-27 14:48:57 habana_model_runner.py:532] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 64], seq:[128, 128, 1024] INFO 08-27 14:48:57 habana_model_runner.py:545] Generated 23 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (8, 128), (8, 256), (16, 128)] INFO 08-27 14:48:57 habana_model_runner.py:550] Decode bucket config (min, step, max_warmup) bs:[1, 128, 256], seq:[128, 128, 2048] INFO 08-27 14:48:57 habana_model_runner.py:561] Generated 31 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (8, 128), (8, 256), (16, 128)] ============================= HABANA PT BRIDGE CONFIGURATION =========================== PT_HPU_LAZY_MODE = 1 PT_RECIPE_CACHE_PATH = PT_CACHE_FOLDER_DELETE = 0 PT_HPU_RECIPE_CACHE_CONFIG = PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807 PT_HPU_LAZY_ACC_PAR_MODE = 1 PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0 ---------------------------: System Configuration :--------------------------- Num CPU Cores : 160 CPU RAM : 1056398260 KB ------------------------------------------------------------------------------ INFO 08-27 14:49:00 selector.py:85] Using HabanaAttention backend. INFO 08-27 14:49:00 loader.py:284] Loading weights on hpu ... 
INFO 08-27 14:49:00 weight_utils.py:224] Using model weights format ['*.bin'] pytorch_model.bin: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 251M/251M [00:06<00:00, 35.9MB/s] Loading pt checkpoint shards: 0% Completed | 0/1 [00:00 None: f"seq:{self.prompt_seq_bucket_cfg}") logger.info(msg) self.prompt_buckets = warmup_buckets(self.prompt_bs_bucket_cfg, - self.prompt_seq_bucket_cfg) + self.prompt_seq_bucket_cfg, + self.max_num_batched_tokens) if self.lora_config: self.prompt_buckets[:] = [ @@ -543,7 +550,8 @@ def _setup_buckets(self) -> None: f"seq:{self.decode_seq_bucket_cfg}") logger.info(msg) self.decode_buckets = warmup_buckets(self.decode_bs_bucket_cfg, - self.decode_seq_bucket_cfg) + self.decode_seq_bucket_cfg, + self.max_num_batched_tokens) if self.lora_config: self.decode_buckets[:] = [ bucket for bucket in self.decode_buckets From 2ab316db5f9f5f2944cbac68132769411e4833de Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Mon, 19 Aug 2024 05:43:28 +0000 Subject: [PATCH 135/341] Initial commit --- tests/samplers/test_sampler.py | 61 +++++++++++++- vllm/model_executor/layers/sampler.py | 112 +++++++++++++++++++++++++- 2 files changed, 170 insertions(+), 3 deletions(-) mode change 100644 => 100755 vllm/model_executor/layers/sampler.py diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 9572588ce6e53..9d0ecb820548e 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -7,7 +7,7 @@ import torch from transformers import GenerationConfig, GenerationMixin -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import ApplyToppTopkScalar, Sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_random_seed from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata @@ -700,3 +700,62 @@ def test_sampling_params(sampling_params: List[SamplingParams]): assert tokens1[0] == tokens2[1] assert tokens1[1] == tokens2[0] + + +def test_topk_topk_scalar(): + obj1 = ApplyToppTopkScalar(2) + assert ApplyToppTopkScalar._padded_k == 0 + x = torch.tensor([[9, 9, 8, 8, 8, 8, 7, 7, 7.0], + [10, 10, 9, 9, 9, 8, 5, 5, 5]]) + + retval1 = obj1(x, p=0.9, k=5) + ninf = -float("inf") + expected1 = torch.tensor([[9., 9., 8., 8., 8., 8., ninf, ninf, ninf], + [10., 10., 9., 9., 9., ninf, ninf, ninf, ninf]]) + assert torch.all(retval1 == expected1).item() + assert ApplyToppTopkScalar._padded_k == 9 + + obj2 = ApplyToppTopkScalar(2) + assert obj2._padded_k == 9 + + x = torch.tensor([[2, 2, 9, 9, 2, 2, 1, 1, 1.0], + [10, 9, 9, 5, 9, 9, 5, 9, 10]]) + retval2 = obj2(x, p=0.9, k=5) + expected2 = torch.tensor( + [[ninf, ninf, 9., 9., ninf, ninf, ninf, ninf, ninf], + [10., ninf, 9., ninf, 9., 9., ninf, 9., 10.]]) + assert torch.all(retval2 == expected2).item() + assert obj2._padded_k == 9 + + retval3 = obj2(x, p=1.0, k=5) + expected3 = torch.tensor([[2., 2., 9., 9., 2., 2., ninf, ninf, ninf], + [10., 9., 9., ninf, 9., 9., ninf, 9., 10.]]) + + assert torch.all(retval3 == expected3).item() + + # this should not be done in general, doing it here for testing purposes + ApplyToppTopkScalar._padded_k = 0 + x = torch.tensor([[1, 1, 1, 9, 8, 1, 1, 1, 1.0], + [2, 1, 2, 2, 1, 1, 1, 1, 1]]) + obj3 = ApplyToppTopkScalar(2) + retval4 = obj3(x, p=0.9, k=2) + expected4 = torch.tensor( + [[ninf, ninf, ninf, 9., 8., ninf, ninf, ninf, ninf], + [2., ninf, 2., 2., ninf, ninf, ninf, ninf, ninf]]) + assert 
torch.all(retval4 == expected4).item() + assert obj3._padded_k == 4 + y = torch.tensor([[8, 8, 8, 9, 8, 1, 1, 1, 1.0], + [2, 1, 2, 2, 1, 1, 1, 1, 1]]) + retval5 = obj3(y, p=0.9, k=2) + assert obj3._padded_k == 8 + expected5 = torch.tensor([[8., 8., 8., 9., 8., ninf, ninf, ninf, ninf], + [2., ninf, 2., 2., ninf, ninf, ninf, ninf, + ninf]]) + assert torch.all(retval5 == expected5).item() + y = torch.tensor([[8, 8, 8, 9, 8, 8, 1, 1, 1.0], + [2, 1, 2, 2, 3, 1, 1, 1, 1]]) + retval6 = obj3(y, p=0.9, k=2) + expected6 = torch.tensor([[8., 8., 8., 9., 8., 8., ninf, ninf, ninf], + [2., ninf, 2., 2., 3., ninf, ninf, ninf, ninf]]) + assert torch.all(retval6 == expected6).item() + assert obj3._padded_k == 8 diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py old mode 100644 new mode 100755 index 6632b1c434582..6cb8971534cd3 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -1,5 +1,6 @@ """A layer that samples the next tokens from the model's outputs.""" import itertools +import math from math import inf from typing import Dict, List, Optional, Tuple @@ -77,6 +78,13 @@ def _init_sampling_tensors( self._do_penalties = do_penalties self._do_top_p_top_k = do_top_p_top_k self._do_min_p = do_min_p + self._top_p_scalar = sampling_tensors.top_ps[0].item() + self._top_k_scalar = sampling_tensors.top_ks[0].item() + scalar_p = torch.all(sampling_tensors.top_ps == self._top_p_scalar) + scalar_k = torch.all(sampling_tensors.top_ks == self._top_k_scalar) + self._scalar_p_and_k = (scalar_p and scalar_k).item() + if self._scalar_p_and_k and self._do_top_p_top_k: + self._apply_top_k_top_p_opt = ApplyToppTopkScalar(5) def forward( self, @@ -122,8 +130,13 @@ def forward( logits.div_(sampling_tensors.temperatures.unsqueeze(dim=1)) if do_top_p_top_k: - logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, - sampling_tensors.top_ks) + if self._scalar_p_and_k: + logits = self._apply_top_k_top_p_opt(logits, + self._top_p_scalar, + self._top_k_scalar) + else: + logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, + sampling_tensors.top_ks) if do_min_p: logits = _apply_min_p(logits, sampling_tensors.min_ps) @@ -198,6 +211,101 @@ def _get_bin_counts_and_mask( return bin_counts, mask +class ApplyToppTopkScalar(): + """ + The original implementation of _apply_top_k_top_p is more general + as it uses vector topp, topk + However in a lot of cases, topp and topk is same for all batch elements + For such "scalar" topp, topk cases, we can use this class + + The main optimizations in this class is: + Use topk instead of sort, which is much faster especially for small k. 
+ However just using topk might not suffice in cases as shown below + Consider a tensor: 9 9 8 8 8 8 7 7 7 + Topk, with k=5, on this yields 9 9 8 8 8 + The value "8" is on the boundary, hence the last "8" gets snipped off + However the original implementation accepts all the "8"s, + so it should output: + 9 9 8 8 8 8 (6 values, even though k=5) + To ensure these semantics, we perform topk with _padded_k elements + If we find more boundary elements left over, + then we keep incrementing _padded_k + and in future calls use the expanded value of __padded_k + + The increments to _padded_k should be done + with value > 1 to prevent excessive recompilations + due to dynamic shapes (the output shape of the topk) + + The main logic of this is in __call__ + This is a class instead of a function, just to keep track of + the monotonic non-decreasing state _padded_k + """ + _padded_k = 0 + + def __init__(self, increment: int): + self._increment = increment + + def __call__(self, logits: torch.Tensor, p: float, k: int): + if k > ApplyToppTopkScalar._padded_k: + ApplyToppTopkScalar._padded_k = min(k + self._increment, + logits.shape[1]) + + vals, idx = torch.topk(logits, k=ApplyToppTopkScalar._padded_k, \ + dim=1, sorted=True) + + # this "if" checks if we have bucketed so much that + # we have padded k upto shape of logits + if ApplyToppTopkScalar._padded_k != logits.shape[1]: + smallest_of_top_k = vals[:, k - 1] + num_duplicates_of_smallest_of_topk = torch.sum( + logits == smallest_of_top_k.unsqueeze(1), 1) + max_num_duplicates_of_smallest_of_topk = torch.max( + num_duplicates_of_smallest_of_topk).item() + + # there are n repeats for a border + # (border meaning the smallest value of the top k). + # we do not know if only 1 or 2 or (n-1) + # of them lie outside the kth border, + # so we choose to conservatively increase by n-1 + # when num_duplicates > _padded_k - k + if max_num_duplicates_of_smallest_of_topk - 1 > ( + ApplyToppTopkScalar._padded_k - k): + incr = int( + math.ceil((max_num_duplicates_of_smallest_of_topk - 1) / + self._increment) * self._increment) + # this while loop should be traversed at most twice, + # because we dont increment by self._increment and retry + # instead we compute incr in one go + ApplyToppTopkScalar._padded_k = min( + ApplyToppTopkScalar._padded_k + incr, logits.shape[1]) + + # recompute topk with expanded padded_k + vals, idx = torch.topk(logits, \ + k=ApplyToppTopkScalar._padded_k, \ + dim=1, sorted=True) + + idx = torch.fliplr(idx) + vals = torch.fliplr(vals) + + top_k_smallest_val_idx = vals.size(1) - k + top_k_mask = vals[:, top_k_smallest_val_idx].unsqueeze(1) + top_k_mask = vals < top_k_mask + vals.masked_fill_(top_k_mask, -float("inf")) + + probs_sort = vals.softmax(dim=-1) + probs_sum = probs_sort.cumsum(dim=-1) + top_p_mask = probs_sum <= (1 - p) + top_p_mask[:, -1] = False + vals.masked_fill_(top_p_mask, -float("inf")) + + new_logits = torch.full(logits.shape, + -float("inf"), + device=logits.device) + new_logits.scatter_(1, idx, vals.to(new_logits.dtype)) + + return new_logits + + def _apply_min_tokens_penalty( logits: torch.Tensor, sampling_metadata: SamplingMetadata, From 9abadba502916eeb0432c6a8c300e09d0c3a5a48 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 28 Aug 2024 11:39:33 +0200 Subject: [PATCH 136/341] Make max_num_batched_tokens behavior more verbose, add legacy mode (#208) Addressing issues from https://github.com/HabanaAI/vllm-fork/pull/207 Now, filtering behavior is more verbose, handling common errors and displaying numbers of 
omitted buckets due to token budget (in debug log level, buckets are printed): ``` INFO 08-27 20:57:27 profiler.py:62] Profiler enabled for: vllm-instance-1ab4f6c4d726480d8825044cf74e9af1 WARNING 08-27 20:57:27 utils.py:566] Pin memory is not supported on HPU. INFO 08-27 20:57:27 selector.py:85] Using HabanaAttention backend. INFO 08-27 20:57:27 habana_model_runner.py:563] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 64], seq:[128, 128, 1024] INFO 08-27 20:57:27 habana_model_runner.py:576] Generated 23 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (8, 128), (8, 256), (16, 128)] INFO 08-27 20:57:27 habana_model_runner.py:581] Omitted 33 prompt buckets due to exceeded token budget (max_num_batched_tokens=2048) INFO 08-27 20:57:27 habana_model_runner.py:589] Decode bucket config (min, step, max_warmup) bs:[1, 128, 256], seq:[128, 128, 2048] INFO 08-27 20:57:27 habana_model_runner.py:600] Generated 31 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (8, 128), (8, 256), (16, 128)] INFO 08-27 20:57:27 habana_model_runner.py:605] Omitted 113 decode buckets due to exceeded token budget (max_num_batched_tokens=2048) ``` Legacy mode was also added, which throws a nasty error message whenever token budget is set too low, but then it omits filtering and works as it did previously (ran with ``VLLM_DECODE_BS_BUCKET_MIN=128 VLLM_DECODE_SEQ_BUCKET_MIN=1024 python vllm_test.py --max-num-batched-tokens=2048``): ``` INFO 08-27 21:01:02 profiler.py:62] Profiler enabled for: vllm-instance-51f60d3978d347e992436f1dc0aa4702 WARNING 08-27 21:01:02 utils.py:566] Pin memory is not supported on HPU. INFO 08-27 21:01:02 selector.py:85] Using HabanaAttention backend. INFO 08-27 21:01:02 habana_model_runner.py:563] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 64], seq:[128, 128, 1024] INFO 08-27 21:01:02 habana_model_runner.py:576] Generated 23 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (8, 128), (8, 256), (16, 128)] INFO 08-27 21:01:02 habana_model_runner.py:581] Omitted 33 prompt buckets due to exceeded token budget (max_num_batched_tokens=2048) INFO 08-27 21:01:02 habana_model_runner.py:589] Decode bucket config (min, step, max_warmup) bs:[128, 128, 256], seq:[1024, 128, 2048] ERROR 08-27 21:01:02 habana_model_runner.py:128] The current bucketing configuration (min, step, max_warmup): bs:[128, 128, 256], seq:[1024, 128, 2048] cannot be used with specified max_num_batched_tokens (2048), as the smallest bucket (16384) would exceed token budget. Please increase max_num_batched_tokens or decrease bucket minimum Ignoring max_num_batched_tokens at risk of out-of-memory errors. 
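For reference, the filtering behavior described above can be reproduced with a small standalone sketch. The bs/seq lists below are illustrative stand-ins for what `warmup_range()` expands the `(min, step, max_warmup)` configs into, chosen to match the prompt config bs:[1, 32, 64], seq:[128, 128, 1024] from these logs; the filter and sort order mirror the patch, but this is not the vLLM code itself:
```
import itertools

# Stand-ins for the expanded bucket ranges (illustrative).
bs_range = [1, 2, 4, 8, 16, 32, 64]
seq_range = [128, 256, 384, 512, 640, 768, 896, 1024]
max_num_batched_tokens = 2048

buckets = list(itertools.product(bs_range, seq_range))
captured = sorted(
    (b for b in buckets if b[0] * b[1] <= max_num_batched_tokens),
    key=lambda b: (b[0] * b[1], b[1], b[0]))
omitted = sorted(b for b in buckets if b[0] * b[1] > max_num_batched_tokens)
print(f"Generated {len(captured)} prompt buckets, omitted {len(omitted)} "
      f"(max_num_batched_tokens={max_num_batched_tokens})")
# -> Generated 23 prompt buckets, omitted 33 (max_num_batched_tokens=2048)
```
The counts reproduce the "Generated 23 prompt buckets" / "Omitted 33 prompt buckets" lines shown in the logs above.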
INFO 08-27 21:01:02 habana_model_runner.py:600] Generated 32 decode buckets: [(128, 128), (128, 256), (128, 384), (128, 512), (128, 640), (128, 768), (128, 896), (128, 1024), (128, 1152), (128, 1280), (128, 1408), (128, 1536), (128, 1664), (128, 1792), (128, 1920), (128, 2048), (256, 128), (256, 256), (256, 384), (256, 512), (256, 640), (256, 768), (256, 896), (256, 1024), (256, 1152), (256, 1280), (256, 1408), (256, 1536), (256, 1664), (256, 1792), (256, 1920), (256, 2048)] INFO 08-27 21:01:02 habana_model_runner.py:605] Omitted 0 decode buckets due to exceeded token budget (max_num_batched_tokens=2048) ``` --- vllm/worker/habana_model_runner.py | 70 +++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 12 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 62a9e814a5ac4..6627ba1ea5643 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -96,14 +96,44 @@ def warmup_range(config: Tuple[int, int, int]): def warmup_buckets(bs_bucket_config, seq_bucket_config, max_num_batched_tokens): - buckets = itertools.product(warmup_range(bs_bucket_config), - warmup_range(seq_bucket_config)) + buckets = list( + itertools.product(warmup_range(bs_bucket_config), + warmup_range(seq_bucket_config))) + if len(buckets) == 0: + msg = ("No buckets could be captured with following config " + f"(min, step, max_warmup): " + f"bs:{bs_bucket_config}, " + f"seq:{seq_bucket_config}") + raise ValueError(msg) + # Remove buckets exceeding batch token budget - filtered_buckets = filter( - lambda bucket: bucket[0] * bucket[1] <= max_num_batched_tokens, - buckets) - return list( + filtered_buckets = list( + filter(lambda bucket: bucket[0] * bucket[1] <= max_num_batched_tokens, + buckets)) + + if len(filtered_buckets) == 0: + # legacy case - we can handle this if we ignore max_num_batched_tokens + min_bucket_bs, min_bucket_seq = min(buckets, + key=lambda b: (b[0] * b[1])) + min_reqd_budget = min_bucket_bs * min_bucket_seq + msg = ( + "The current bucketing configuration " + f"(min, step, max_warmup): " + f"bs:{bs_bucket_config}, " + f"seq:{seq_bucket_config} cannot be used with specified " + f"max_num_batched_tokens ({max_num_batched_tokens}), as the " + f"smallest bucket ({min_reqd_budget}) would exceed token budget. 
" + "Please increase max_num_batched_tokens or decrease bucket minimum " + "Ignoring max_num_batched_tokens at risk of out-of-memory errors.") + logger.error(msg) + return list(sorted(buckets, key=lambda b: + (b[0] * b[1], b[1], b[0]))), [] + + captured_buckets = list( sorted(filtered_buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))) + omitted_buckets = list( + sorted([x for x in buckets if x not in filtered_buckets])) + return captured_buckets, omitted_buckets def next_pow2(value: int): @@ -531,9 +561,9 @@ def _setup_buckets(self) -> None: f"bs:{self.prompt_bs_bucket_cfg}, " f"seq:{self.prompt_seq_bucket_cfg}") logger.info(msg) - self.prompt_buckets = warmup_buckets(self.prompt_bs_bucket_cfg, - self.prompt_seq_bucket_cfg, - self.max_num_batched_tokens) + self.prompt_buckets, prompt_omitted_buckets = warmup_buckets( + self.prompt_bs_bucket_cfg, self.prompt_seq_bucket_cfg, + self.max_num_batched_tokens) if self.lora_config: self.prompt_buckets[:] = [ @@ -545,13 +575,21 @@ def _setup_buckets(self) -> None: f"prompt buckets: {list(sorted(self.prompt_buckets))}") logger.info(msg) + msg = (f"Omitted {len(prompt_omitted_buckets)} " + "prompt buckets due to exceeded token budget " + f"(max_num_batched_tokens={self.max_num_batched_tokens})") + logger.info(msg) + + msg = f"Omitted prompt buckets: {list(sorted(prompt_omitted_buckets))}" + logger.debug(msg) + msg = ("Decode bucket config (min, step, max_warmup) " f"bs:{self.decode_bs_bucket_cfg}, " f"seq:{self.decode_seq_bucket_cfg}") logger.info(msg) - self.decode_buckets = warmup_buckets(self.decode_bs_bucket_cfg, - self.decode_seq_bucket_cfg, - self.max_num_batched_tokens) + self.decode_buckets, decode_omitted_buckets = warmup_buckets( + self.decode_bs_bucket_cfg, self.decode_seq_bucket_cfg, + self.max_num_batched_tokens) if self.lora_config: self.decode_buckets[:] = [ bucket for bucket in self.decode_buckets @@ -561,6 +599,14 @@ def _setup_buckets(self) -> None: f"{list(sorted(self.decode_buckets))}") logger.info(msg) + msg = (f"Omitted {len(decode_omitted_buckets)} " + "decode buckets due to exceeded token budget " + f"(max_num_batched_tokens={self.max_num_batched_tokens})") + logger.info(msg) + + msg = f"Omitted decode buckets: {list(sorted(decode_omitted_buckets))}" + logger.debug(msg) + def _prepare_prompt( self, seq_group_metadata_list: List[SequenceGroupMetadata], From 972f3bc8f0a1a11ab84a0edc59bc9e009e29d003 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Thu, 29 Aug 2024 00:42:19 +0000 Subject: [PATCH 137/341] remove arctic gpu hardcode Signed-off-by: Chendi.Xue --- vllm/model_executor/models/arctic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 49e57a847e847..6d92e7597eabf 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -131,14 +131,14 @@ def __init__(self, torch.empty(self.num_experts, 2 * self.intermediate_size, self.hidden_size, - device="cuda", - dtype=self.params_dtype)) + dtype=self.params_dtype), + , requires_grad=False) self.w2s = nn.Parameter( torch.empty(self.num_experts, self.hidden_size, self.intermediate_size, - device="cuda", - dtype=self.params_dtype)) + dtype=self.params_dtype), + requires_grad=False) set_weight_attrs(self.ws, { "weight_loader": self.weight_loader, }) From 778d7e64dcaf2728e9688b1c8d18bed600dab243 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Thu, 29 Aug 2024 00:42:50 +0000 Subject: [PATCH 138/341] remove dbrx gpu hardcode Signed-off-by: 
Chendi.Xue --- vllm/model_executor/models/dbrx.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index d758333b22388..463003d0bba7b 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -86,17 +86,15 @@ def __init__( self.num_total_experts, 2 * self.intermediate_size, self.d_model, - device="cuda", dtype=self.params_dtype, - )) + ), requires_grad=False) self.w2s = nn.Parameter( torch.empty( self.num_total_experts, self.d_model, self.intermediate_size, - device="cuda", dtype=self.params_dtype, - )) + ), requires_grad=False) set_weight_attrs( self.ws, From 17cd6251924ef66246eeca224bb2cb09da23217b Mon Sep 17 00:00:00 2001 From: Vivek Goel Date: Thu, 29 Aug 2024 11:23:05 +0530 Subject: [PATCH 139/341] Update paddings computed to adjust selected_token_indices (#210) Fixes assert seen when "prompt_logprobs is not None" and BS > 1. Assert was due to shape of paddings being added to matching sampling_metadata.selected_token_indices shape for the case where prompt_logprobs is configured. --- vllm/worker/habana_model_runner.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 6627ba1ea5643..a975dba6f5136 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1012,8 +1012,13 @@ def prepare_input_tensors( paddings = [max_len - s for s in seq_lens] paddings = [0] + paddings[:-1] paddings = list(itertools.accumulate(paddings)) + paddings_prompt_logprobs = [] + for i, seq_group_metadata in enumerate(seq_group_metadata_list): + if seq_group_metadata.sampling_params.prompt_logprobs is not None \ + and seq_group_metadata.is_prompt: + paddings_prompt_logprobs += ([paddings[i]] * seq_lens[i]) paddings = torch.tensor( - paddings, + paddings_prompt_logprobs if paddings_prompt_logprobs else paddings, dtype=sampling_metadata.selected_token_indices.dtype, device=sampling_metadata.selected_token_indices.device) sampling_metadata.selected_token_indices.add_(paddings) From f3f1f93b6af654771c20b943d556167f9765a8a8 Mon Sep 17 00:00:00 2001 From: Agata Dobrzyniewicz Date: Fri, 30 Aug 2024 10:35:53 +0300 Subject: [PATCH 140/341] Port not warmed-up configurations log warnings --- vllm/worker/habana_model_runner.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index a975dba6f5136..133706c18aed6 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -448,6 +448,7 @@ def __init__( # Profiler stats self.profiler_counter_helper = HabanaProfilerCounterHelper() + self.seen_configs = set() self._mem_margin: Optional[int] = None self._setup_buckets() @@ -1560,6 +1561,14 @@ def finish_measurements(self): from neural_compressor.torch.quantization import finalize_calibration finalize_calibration(self.model.model) + def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode): + cfg = (batch_size, seq_len, is_prompt) + seen = cfg in self.seen_configs + self.seen_configs.add(cfg) + if not seen and not warmup_mode: + phase = 'prompt' if is_prompt else 'decode' + logger.warning(f'Configuration: ({phase}, {batch_size}, {seq_len}) was not warmed-up!') + @torch.inference_mode() def execute_model( self, @@ -1594,6 +1603,7 @@ def execute_model( batch_size = input_tokens.size(0) seq_len = self._seq_len(attn_metadata) 
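To make the padding adjustment from #210 above concrete, here is a small worked example with made-up lengths. Each prompt is padded to the bucketed max length, so every later sequence's flattened token indices shift by the accumulated padding of the sequences before it; when prompt_logprobs is requested, that per-sequence shift is repeated once per prompt token, which is what paddings_prompt_logprobs handles. This is only an illustration of the arithmetic, not the runner code:
```
import itertools
import torch

seq_lens = [3, 5, 2]   # real prompt lengths (illustrative)
max_len = 5            # bucketed prompt length

paddings = [max_len - s for s in seq_lens]        # [2, 0, 3]
paddings = [0] + paddings[:-1]                    # shift right: [0, 2, 0]
paddings = list(itertools.accumulate(paddings))   # cumulative:  [0, 2, 2]

# Last-token index of each sequence in the unpadded, flattened layout.
selected_token_indices = torch.tensor([2, 7, 9])
selected_token_indices += torch.tensor(paddings)
print(selected_token_indices)  # tensor([ 2,  9, 11]) -> positions in the padded layout
```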
use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) + self._check_config(batch_size, seq_len, is_prompt, warmup_mode) execute_model_kwargs = { "input_ids": input_tokens, "positions": input_positions, @@ -1605,8 +1615,7 @@ def execute_model( execute_model_kwargs.update(multi_modal_input) if htorch.utils.internal.is_lazy(): execute_model_kwargs.update({ - "bypass_hpu_graphs": not use_graphs, - "warmup_mode": warmup_mode + "bypass_hpu_graphs": not use_graphs }) htorch.core.mark_step() From fd38e5d2fa7a6fb6f8c11dfb5bf8ee801b90451b Mon Sep 17 00:00:00 2001 From: Agata Dobrzyniewicz Date: Fri, 30 Aug 2024 11:47:58 +0300 Subject: [PATCH 141/341] Formating for log warnings --- vllm/worker/habana_model_runner.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 133706c18aed6..0100076aec8e2 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -448,7 +448,7 @@ def __init__( # Profiler stats self.profiler_counter_helper = HabanaProfilerCounterHelper() - self.seen_configs = set() + self.seen_configs: set = set() self._mem_margin: Optional[int] = None self._setup_buckets() @@ -1567,7 +1567,8 @@ def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode): self.seen_configs.add(cfg) if not seen and not warmup_mode: phase = 'prompt' if is_prompt else 'decode' - logger.warning(f'Configuration: ({phase}, {batch_size}, {seq_len}) was not warmed-up!') + logger.warning('Configuration: (', phase, ', ', batch_size, ', ', + seq_len, ') was not warmed-up!') @torch.inference_mode() def execute_model( @@ -1614,9 +1615,7 @@ def execute_model( if multi_modal_input is not None: execute_model_kwargs.update(multi_modal_input) if htorch.utils.internal.is_lazy(): - execute_model_kwargs.update({ - "bypass_hpu_graphs": not use_graphs - }) + execute_model_kwargs.update({"bypass_hpu_graphs": not use_graphs}) htorch.core.mark_step() if self.is_driver_worker: From a032ea2781583756f1fca8bdaa6284fa2693b841 Mon Sep 17 00:00:00 2001 From: Liran Bachar Date: Sun, 1 Sep 2024 12:23:16 +0300 Subject: [PATCH 142/341] support loading autofp8 checkpoint fix gaudi2 weight range to +=240 avoid cuda code in hpu path replace _scaled_mm with hpu op --- vllm/_custom_ops/__init__.py | 75 +++++ .../_cuda_ops.py} | 0 vllm/_custom_ops/_hpu_ops.py | 317 ++++++++++++++++++ vllm/{ => _custom_ops}/_ipex_ops.py | 0 .../compressed_tensors/compressed_tensors.py | 5 +- .../schemes/compressed_tensors_w8a8_fp8.py | 2 +- .../model_executor/layers/quantization/fp8.py | 19 +- .../layers/quantization/utils/w8a8_utils.py | 41 ++- vllm/model_executor/models/llama.py | 7 + vllm/utils.py | 58 +--- vllm/worker/habana_model_runner.py | 3 +- vllm/worker/habana_worker.py | 3 +- 12 files changed, 458 insertions(+), 72 deletions(-) create mode 100644 vllm/_custom_ops/__init__.py rename vllm/{_custom_ops.py => _custom_ops/_cuda_ops.py} (100%) create mode 100644 vllm/_custom_ops/_hpu_ops.py rename vllm/{ => _custom_ops}/_ipex_ops.py (100%) diff --git a/vllm/_custom_ops/__init__.py b/vllm/_custom_ops/__init__.py new file mode 100644 index 0000000000000..2411a1465c187 --- /dev/null +++ b/vllm/_custom_ops/__init__.py @@ -0,0 +1,75 @@ + +from functools import lru_cache + +@lru_cache(maxsize=None) +def is_hip() -> bool: + return torch.version.hip is not None + + +@lru_cache(maxsize=None) +def is_cpu() -> bool: + from importlib.metadata import PackageNotFoundError, version + try: + return "cpu" in version("vllm") + except 
PackageNotFoundError: + return False + + +@lru_cache(maxsize=None) +def is_openvino() -> bool: + from importlib.metadata import PackageNotFoundError, version + try: + return "openvino" in version("vllm") + except PackageNotFoundError: + return False + + +@lru_cache(maxsize=None) +def is_neuron() -> bool: + try: + import transformers_neuronx + except ImportError: + transformers_neuronx = None + return transformers_neuronx is not None + + +@lru_cache(maxsize=None) +def is_hpu() -> bool: + from importlib import util + return util.find_spec('habana_frameworks') is not None + + +@lru_cache(maxsize=None) +def is_tpu() -> bool: + try: + import libtpu + except ImportError: + libtpu = None + return libtpu is not None + + +@lru_cache(maxsize=None) +def is_xpu() -> bool: + from importlib.metadata import version + is_xpu_flag = "xpu" in version("vllm") + # vllm is not build with xpu + if not is_xpu_flag: + return False + try: + import intel_extension_for_pytorch as ipex # noqa: F401 + _import_ipex = True + except ImportError as e: + logger.warning("Import Error for IPEX: %s", e.msg) + _import_ipex = False + # ipex dependency is not ready + if not _import_ipex: + logger.warning("not found ipex lib") + return False + return hasattr(torch, "xpu") and torch.xpu.is_available() + +if is_xpu(): + from ._ipex_ops import * +elif is_hpu(): + from ._hpu_ops import * +else: + from ._cuda_ops import * \ No newline at end of file diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops/_cuda_ops.py similarity index 100% rename from vllm/_custom_ops.py rename to vllm/_custom_ops/_cuda_ops.py diff --git a/vllm/_custom_ops/_hpu_ops.py b/vllm/_custom_ops/_hpu_ops.py new file mode 100644 index 0000000000000..d553540f9e25a --- /dev/null +++ b/vllm/_custom_ops/_hpu_ops.py @@ -0,0 +1,317 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. 
+############################################################################### +import os +from typing import Optional, Tuple + +import habana_frameworks.torch as htorch +import torch +import torch.nn.functional as F + +import vllm.hpu.utils as hpu_utils + +PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '1') == '1') + + +def silu_and_mul(output, input): + d = input.shape[-1] // 2 + silu = torch.nn.SiLU().to(input.device) + x, y = torch.split(input, d, dim=-1) + output.copy_(silu(x) * y) + + +def fetch_from_cache(cache, blocks, permutations): + return [ + cache.index_select(0, blocks[:, i]).permute(permutations) + for i in range(blocks.size(1)) + ] + + +def paged_attention_v1(query, + key_cache, + value_cache, + head_mapping, + scale, + block_tables, + context_lens, + block_size, + alibi_slopes=None, + kv_cache_dtype=None, + qk_matmul_op=torch.matmul, + softmax_op=torch.softmax, + av_matmul_op=torch.matmul, + k_cache_cls=None, + v_cache_cls=None) -> None: + seq_len = block_tables.size(1) + batch_size, query_heads, _ = query.shape + _, _, kv_heads, _ = key_cache.shape + min_inf = torch.finfo(query.dtype).min + mask = (torch.arange(0, + seq_len * block_size, + dtype=torch.int32, + device=key_cache.device).view(1, -1).expand( + batch_size, -1).ge(context_lens.view(-1, 1)).view( + batch_size, 1, 1, -1)) + query.mul_(scale) + query = query.unsqueeze(-2) + fetch_keys = fetch_from_cache if k_cache_cls is None else k_cache_cls.fetch_from_cache + keys = fetch_keys(key_cache, block_tables, (0, 2, 3, 1)) + if query_heads != kv_heads: + query = query.unflatten(1, (kv_heads, -1)) + keys = [k.unflatten(1, (kv_heads, 1)) for k in keys] + mask = mask.unsqueeze(2) + + attn_weights = [qk_matmul_op(query, k) for k in keys] + attn_weights = torch.cat(attn_weights, dim=-1) + if alibi_slopes is not None: + attn_weights.add_(alibi_slopes[:, :, -attn_weights.size(2):, + -attn_weights.size(3):]) + attn_weights = softmax_op(attn_weights.masked_fill(mask, min_inf), dim=-1) + + fetch_values = fetch_from_cache if v_cache_cls is None else k_cache_cls.fetch_from_cache + values = fetch_values(value_cache, block_tables, (0, 2, 1, 3)) + if PA_SPLIT_VALUE: + attn_weights = attn_weights.split(block_size, dim=-1) + else: + values = [torch.cat(values, dim=-2)] + attn_weights = [attn_weights] + if query_heads != kv_heads: + values = [v.unflatten(1, (kv_heads, 1)) for v in values] + attn_weights = [av_matmul_op(a, v) for a, v in zip(attn_weights, values)] + if query_heads != kv_heads: + attn_weights = [a.flatten(1, 2) for a in attn_weights] + attn_weights = sum(attn_weights) + return attn_weights.squeeze(-2) + + +def silu_and_mul_wrapper(x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + silu_and_mul(out, x) + return out + + +def static_fused_moe(hidden_states, w1, w2, score, topk): + B, D = hidden_states.shape + num_experts = w1.shape[0] + routing_weights = F.softmax(score, dim=1, dtype=torch.float32) + routing_weights, selected_experts = torch.topk(routing_weights, + topk, + dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + routing_weights = routing_weights.to(hidden_states.dtype) + final_hidden_states = torch.zeros((1, B, D), + dtype=hidden_states.dtype, + device=hidden_states.device) + padded_weights = torch.zeros((B, num_experts), + dtype=hidden_states.dtype, + device=hidden_states.device) + padded_weights.scatter_(-1, selected_experts, routing_weights) + padded_weights = 
padded_weights.reshape(-1, B, w1.shape[0]) + padded_weights = padded_weights.permute(2, 0, 1).unsqueeze(-1) + + htorch.core.mark_step() + + for expert_idx in range(num_experts): + padded_weight = padded_weights[expert_idx] + current_state_static = hidden_states.reshape(-1, D) + w_output = silu_and_mul_wrapper( + torch.matmul(current_state_static, w1[expert_idx].transpose(0, 1))) + w_output = torch.matmul(w_output, w2[expert_idx].transpose(0, 1)) + current_hidden_states_static = w_output * padded_weight + final_hidden_states += current_hidden_states_static + htorch.core.mark_step() + + return final_hidden_states.view(-1, D) + + +def prompt_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_bias: Optional[torch.Tensor] = None, + p: float = 0.0, + scale: Optional[float] = None, + qk_matmul_op = torch.matmul, + softmax_op = torch.softmax, + av_matmul_op = torch.matmul, +) -> torch.Tensor: + query = query.transpose(1, 2) + key = key.transpose(1, 2) + value = value.transpose(1, 2) + query_heads = query.size(1) + kv_heads = key.size(1) + if query_heads != kv_heads: + query = query.unflatten(1, (kv_heads, -1)) + key = key.unflatten(1, (kv_heads, 1)) + value = value.unflatten(1, (kv_heads, 1)) + attn_bias = attn_bias.unsqueeze(2) + attn_weights = qk_matmul_op(query * scale, key.transpose(-1, -2)) + if attn_bias is not None: + attn_weights.add_(attn_bias) + attn_weights = softmax_op(attn_weights, dim=-1) + attn_weights = av_matmul_op(attn_weights, value) + if query_heads != kv_heads: + attn_weights = attn_weights.flatten(1, 2) + attn_weights = attn_weights.transpose(1, 2) + return attn_weights + + + + +def reshape_and_cache(key, + value, + key_cache, + value_cache, + slot_mapping, + dtype, + is_prompt=False): + num_blocks = key_cache.size(0) + block_size = key_cache.size(1) + slot_mapping = slot_mapping.flatten() + indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + offsets = torch.fmod(slot_mapping, block_size) + num_slots_requested = slot_mapping.size(0) + num_slots_available = num_blocks * block_size + # NOTE(kzawora): HPU PT bridge crashes with + # RuntimeError: Invalid inputs for scatter_nd_onnx + # on index_put when num_slots_requested > num_slots_available. + # This case might occur when we have little kv cache blocks and + # lots of padding, or are doing warmup. + # This loop is a workaround for this issue. Please remove it + # once key_cache.index_put_(indices, offsets), key) works. + num_kv_cache_passes = torch.div(num_slots_requested, + num_slots_available).ceil().int().item() + for i in range(num_kv_cache_passes): + start_idx = i * num_slots_available + end_idx = (i + 1) * num_slots_available + key_cache.index_put_( + (indices[start_idx:end_idx], offsets[start_idx:end_idx]), + key[start_idx:end_idx]) + value_cache.index_put_( + (indices[start_idx:end_idx], offsets[start_idx:end_idx]), + value[start_idx:end_idx]) + + +def prepare_to_cache(cache, slot_mapping): + num_blocks = cache.size(0) + block_size = cache.size(1) + slot_mapping = slot_mapping.flatten() + indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + offsets = torch.fmod(slot_mapping, block_size) + num_slots_requested = slot_mapping.size(0) + num_slots_available = num_blocks * block_size + # NOTE(kzawora): HPU PT bridge crashes with + # RuntimeError: Invalid inputs for scatter_nd_onnx + # on index_put when num_slots_requested > num_slots_available. + # This case might occur when we have little kv cache blocks and + # lots of padding, or are doing warmup. 
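The block-index/offset arithmetic used by the cache-write paths above is easiest to see with concrete numbers; the slot ids below are made up for illustration:
```
import torch

block_size = 4
slot_mapping = torch.tensor([0, 1, 5, 13])  # flat slot ids assigned to new tokens

indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
offsets = torch.fmod(slot_mapping, block_size)
print(indices)  # tensor([0, 0, 1, 3]) -> which KV-cache block each token lands in
print(offsets)  # tensor([0, 1, 1, 1]) -> position inside that block
# cache.index_put_((indices, offsets), values) then scatters each new key/value
# row into cache[indices[i], offsets[i]].
```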
+ # This loop is a workaround for this issue. Please remove it + # once key_cache.index_put_(indices, offsets), key) works. + num_kv_cache_passes = torch.div(num_slots_requested, + num_slots_available).ceil().int().item() + + return num_kv_cache_passes, num_slots_available, indices, offsets + + +def insert_or_update_cache(input, cache, num_kv_cache_passes, num_slots_available, block_indices, block_offsets): + for i in range(num_kv_cache_passes): + start_idx = i * num_slots_available + end_idx = (i + 1) * num_slots_available + cache.index_put_( + (block_indices[start_idx:end_idx], block_offsets[start_idx:end_idx]), + input[start_idx:end_idx]) + + +def swap_blocks(src, dst, block_mapping): + index_src = torch.zeros((1, ), dtype=torch.int32, device=src.device) + index_dst = torch.zeros((1, ), dtype=torch.int32, device=dst.device) + for src_idx, dst_idx in block_mapping.items(): + index_src[0] = src_idx + index_dst[0] = dst_idx + dst.index_put_([index_dst], src.index_select(0, index_src)) + if dst.device.type == 'hpu': + htorch.core.mark_step() + torch.hpu.synchronize() + + +def copy_blocks(key_caches, value_caches, block_mapping): + index_src = torch.zeros((1, ), + dtype=torch.int32, + device=key_caches[0].device) + index_dst = torch.zeros((1, ), + dtype=torch.int32, + device=key_caches[0].device) + for src, dsts in block_mapping.items(): + index_src[0] = src + for dst in dsts: + index_dst[0] = dst + for key_cache in key_caches: + key_cache.index_copy_(0, index_dst, + key_cache.index_select(0, index_src)) + for value_cache in value_caches: + value_cache.index_copy_(0, index_dst, + value_cache.index_select(0, index_src)) + if key_caches[0].device.type == 'hpu': + htorch.core.mark_step() + + +# fp8 +def scaled_fp8_quant( + input: torch.Tensor, + scale: Optional[torch.Tensor] = None, + batch_dim_padding: Optional[int] = None, + scale_ub: Optional[torch.Tensor] = None, + use_per_token_if_dynamic: bool = False, +) -> Tuple[torch.Tensor, torch.Tensor]: + + """ + Quantize input tensor to FP8 and return quantized tensor and scale. + + This function supports both static and dynamic quantization: If you + provide the scale, it will use static scaling and if you omit it, + the scale will be determined dynamically. The function also allows + optional padding of the output tensor for downstream kernels that + will benefit from padding. + + Args: + input: The input tensor to be quantized to FP8 + scale: Optional scaling factor for the FP8 quantization + scale_ub: Optional upper bound for scaling factor in dynamic + per token case + batch_dim_padding: If specified, pad the first dimension + of the output to at least this value. + use_per_token_if_dynamic: Whether to do per_tensor or per_token + in the dynamic quantization case. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and + scaling factor. 
+ """ + if batch_dim_padding: + shape = (max(batch_dim_padding, input.shape[0]), *input.shape[1:]) + output = torch.empty(shape, + device=input.device, + dtype=torch.float8_e4m3fn) + else: + output = torch.empty_like(input, dtype=torch.float8_e4m3fn) + if scale is None: + raise "dynamic scaled_fp8_quant not implemented for HPU" + #TODO: calculate scale to match gaudi2 240 range instead of 448 + if use_per_token_if_dynamic: + scale = torch.empty((input.numel() // input.shape[-1], 1), + device=input.device, + dtype=torch.float32) + torch.ops._C.dynamic_per_token_scaled_fp8_quant( + output, input, scale, scale_ub) + else: + scale = torch.zeros(1, device=input.device, dtype=torch.float32) + torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale) + else: + output = torch.ops.hpu.cast_to_fp8_v2(input, 1/scale, False, False, dtype=torch.float8_e4m3fn)[0] + + return output, scale diff --git a/vllm/_ipex_ops.py b/vllm/_custom_ops/_ipex_ops.py similarity index 100% rename from vllm/_ipex_ops.py rename to vllm/_custom_ops/_ipex_ops.py diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 39d00bd5733ff..badb29af1f5f6 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -233,7 +233,7 @@ def _get_scheme_from_parts( if is_activation_quantization_format(self.quant_format): if self._is_fp8_w8a8(weight_quant, input_quant): is_fp8_w8a8_supported = self._check_scheme_supported( - CompressedTensorsW8A8Fp8.get_min_capability(), error=False) + CompressedTensorsW8A8Fp8.get_min_capability(), error=False) if torch.cuda.is_available() else True if is_fp8_w8a8_supported: return CompressedTensorsW8A8Fp8( strategy=weight_quant.strategy, @@ -306,7 +306,8 @@ def get_scheme( # Raise error if device does not support the scheme # (e.g. 
fp8 needs ada lovelace) - self._check_scheme_supported(scheme.get_min_capability()) + if torch.cuda.is_available(): + self._check_scheme_supported(scheme.get_min_capability()) return scheme diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index cc9d71db140c2..631774994b5c0 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -21,7 +21,7 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): def __init__(self, strategy: str, is_static_input_scheme: bool): self.strategy = strategy self.is_static_input_scheme = is_static_input_scheme - self.cutlass_fp8_supported = cutlass_fp8_supported() + self.cutlass_fp8_supported = cutlass_fp8_supported() if torch.cuda.is_available() else False @classmethod def get_min_capability(cls) -> int: diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index c829cb836ee4c..8e2ed041adf0b 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -112,13 +112,18 @@ class Fp8LinearMethod(LinearMethodBase): def __init__(self, quant_config: Fp8Config): self.quant_config = quant_config - self.cutlass_fp8_supported = cutlass_fp8_supported() - - # For GPUs that lack FP8 hardware support, we can leverage the Marlin - # kernel for fast weight-only FP8 quantization - capability = current_platform.get_device_capability() - capability = capability[0] * 10 + capability[1] - self.use_marlin = capability < 89 + + if torch.cuda.is_available(): + self.cutlass_fp8_supported = cutlass_fp8_supported() + + # For GPUs that lack FP8 hardware support, we can leverage the Marlin + # kernel for fast weight-only FP8 quantization + capability = current_platform.get_device_capability() + capability = capability[0] * 10 + capability[1] + self.use_marlin = capability < 89 + else: + self.cutlass_fp8_supported = False + self.use_marlin = False def create_weights( self, diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 20100c76bd690..de5cd810b2a94 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -6,6 +6,8 @@ from vllm import _custom_ops as ops from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform +if current_platform.is_hpu(): + import habana_frameworks.torch.utils.experimental as htexp def cutlass_fp8_supported() -> bool: @@ -18,8 +20,17 @@ def cutlass_fp8_supported() -> bool: def per_tensor_dequantize( tensor: torch.Tensor, inv_scale: Union[float, torch.Tensor]) -> torch.Tensor: - fake_qweight = tensor.to(torch.float16) + dtype = torch.float16 + device = tensor.device + if current_platform.is_hpu(): + dtype = torch.bfloat16 + if htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi2: + #dequant on cpu to avoid nan on gaudi2 + tensor = tensor.to('cpu') + + fake_qweight = tensor.to(dtype).to(device) dq_weight = fake_qweight * inv_scale + return dq_weight @@ -76,6 +87,9 @@ def requantize_with_max_scale( logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]: # Max scale to be used for requanitzation. 
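The requantize_with_max_scale change just below bumps the maximum weight scale by 448/240 on Gaudi2. As far as the patch indicates, the reason is that FP8-E4M3 checkpoints produced for the OCP format assume a maximum representable magnitude of 448, while Gaudi2's FP8-E4M3 variant saturates at 240, so the scale must grow by that ratio for requantized weights to stay in range. A back-of-the-envelope check with an illustrative scale value:
```
ocp_fp8_max = 448.0     # max |value| of OCP FP8 E4M3
gaudi2_fp8_max = 240.0  # max |value| of Gaudi2's FP8 E4M3 flavor
max_w_scale = 0.01      # per-tensor weight scale from a checkpoint (illustrative)

adjusted = max_w_scale * (ocp_fp8_max / gaudi2_fp8_max)
# A weight that dequantizes to 448 * 0.01 = 4.48 under the original scale
# requantizes to roughly 240 under the adjusted scale, right at Gaudi2's limit.
print(adjusted)         # ~0.01867
print(4.48 / adjusted)  # ~240.0
```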
max_w_scale = weight_scale.max() + if current_platform.is_hpu() and htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi2: + max_w_scale = max_w_scale * (448/240) + # QKV / MLP is fused in the on disk checkpoint if any of the # weight scales are still set to the default since we initialize @@ -147,12 +161,25 @@ def apply_fp8_linear( if per_tensor_weights and per_tensor_activations: # Fused GEMM_DQ - output, _ = torch._scaled_mm(qinput, - weight, - out_dtype=input.dtype, - scale_a=x_scale, - scale_b=weight_scale, - bias=bias) + if current_platform.is_hpu(): + #hpu does not support torch._scaled_mm (SW-197036) + output = torch.ops.hpu.fp8_gemm_v2(qinput, + False, + weight, + False, + None, + input.dtype, + x_scale, + weight_scale, + None, + False) + else: + output, _ = torch._scaled_mm(qinput, + weight, + out_dtype=input.dtype, + scale_a=x_scale, + scale_b=weight_scale, + bias=bias) return torch.narrow(output, 0, 0, input.shape[0]) else: diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 676a51ce67f96..f02609aa9ff3b 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -54,6 +54,9 @@ from .interfaces import SupportsLoRA from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers +from vllm.platforms import current_platform +if current_platform.is_hpu(): + import habana_frameworks.torch.core as htcore class LlamaMLP(nn.Module): @@ -518,6 +521,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): default_weight_loader) weight_loader(param, loaded_weight) + #Avoid OOM due to large graph when loading weights + if current_platform.is_hpu(): + htcore.mark_step() + # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). 
Thus, handled exceptions should # make sure to leave KV cache scale factors in a known good (dummy) state diff --git a/vllm/utils.py b/vllm/utils.py index fa6e132dd3522..661d5d62e069b 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -176,69 +176,25 @@ def clear(self): def is_hip() -> bool: - return torch.version.hip is not None + return ops.is_hip() - -@lru_cache(maxsize=None) def is_cpu() -> bool: - from importlib.metadata import PackageNotFoundError, version - try: - return "cpu" in version("vllm") - except PackageNotFoundError: - return False - + return ops.is_cpu() -@lru_cache(maxsize=None) def is_openvino() -> bool: - from importlib.metadata import PackageNotFoundError, version - try: - return "openvino" in version("vllm") - except PackageNotFoundError: - return False - + return ops.is_openvino() -@lru_cache(maxsize=None) def is_neuron() -> bool: - try: - import transformers_neuronx - except ImportError: - transformers_neuronx = None - return transformers_neuronx is not None + return ops.is_neuron() - -@lru_cache(maxsize=None) def is_hpu() -> bool: - from importlib import util - return util.find_spec('habana_frameworks') is not None - + return ops.is_hpu() -@lru_cache(maxsize=None) def is_tpu() -> bool: - try: - import libtpu - except ImportError: - libtpu = None - return libtpu is not None + return ops.is_tpu() - -@lru_cache(maxsize=None) def is_xpu() -> bool: - from importlib.metadata import version - is_xpu_flag = "xpu" in version("vllm") - # vllm is not build with xpu - if not is_xpu_flag: - return False - try: - import intel_extension_for_pytorch as ipex # noqa: F401 - _import_ipex = True - except ImportError as e: - logger.warning("Import Error for IPEX: %s", e.msg) - _import_ipex = False - # ipex dependency is not ready - if not _import_ipex: - logger.warning("not found ipex lib") - return False - return hasattr(torch, "xpu") and torch.xpu.is_available() + return ops.is_xpu() @lru_cache(maxsize=None) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index a975dba6f5136..a2c7a96757faa 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -453,8 +453,7 @@ def __init__( def load_model(self) -> None: import habana_frameworks.torch.core as htcore - if self.model_config.quantization == 'inc': - htcore.hpu_set_env() + htcore.hpu_set_env() with HabanaMemoryProfiler() as m: with HabanaMemoryProfiler() as m_getmodel: self.model = get_model( diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 9d083915041fe..bf285c93cdd47 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -109,8 +109,7 @@ def init_device(self) -> None: raise RuntimeError( f"Not support device type: {self.device_config.device}") # Initialize the distributed environment. - if self.model_config.quantization == 'inc': - self._set_env_vars() + self._set_env_vars() init_worker_distributed_environment(self.parallel_config, self.rank, self.distributed_init_method, self.local_rank) From 221eb5600f7523c957ebad318e54d908af6c8332 Mon Sep 17 00:00:00 2001 From: Liran Bachar Date: Sun, 1 Sep 2024 13:57:58 +0300 Subject: [PATCH 143/341] Revert "support loading autofp8 checkpoint" This reverts commit a032ea2781583756f1fca8bdaa6284fa2693b841. 
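For context on what this revert removes: the `_custom_ops` package introduced above selected a backend ops module at import time from runtime platform checks. A minimal, self-contained sketch of that selection logic follows; it prints the choice instead of importing, since the backend modules are only placeholders here:
```
from functools import lru_cache
from importlib import util


@lru_cache(maxsize=None)
def is_hpu() -> bool:
    # Same detection the patch uses: HPU support is assumed present
    # when the habana_frameworks package can be found.
    return util.find_spec("habana_frameworks") is not None


# In vllm/_custom_ops/__init__.py this decided between
# `from ._hpu_ops import *` and `from ._cuda_ops import *`.
backend = "_hpu_ops" if is_hpu() else "_cuda_ops"
print(f"selected custom-ops backend: {backend}")
```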
--- .../_cuda_ops.py => _custom_ops.py} | 0 vllm/_custom_ops/__init__.py | 75 ----- vllm/_custom_ops/_hpu_ops.py | 317 ------------------ vllm/{_custom_ops => }/_ipex_ops.py | 0 .../compressed_tensors/compressed_tensors.py | 5 +- .../schemes/compressed_tensors_w8a8_fp8.py | 2 +- .../model_executor/layers/quantization/fp8.py | 19 +- .../layers/quantization/utils/w8a8_utils.py | 41 +-- vllm/model_executor/models/llama.py | 7 - vllm/utils.py | 58 +++- vllm/worker/habana_model_runner.py | 3 +- vllm/worker/habana_worker.py | 3 +- 12 files changed, 72 insertions(+), 458 deletions(-) rename vllm/{_custom_ops/_cuda_ops.py => _custom_ops.py} (100%) delete mode 100644 vllm/_custom_ops/__init__.py delete mode 100644 vllm/_custom_ops/_hpu_ops.py rename vllm/{_custom_ops => }/_ipex_ops.py (100%) diff --git a/vllm/_custom_ops/_cuda_ops.py b/vllm/_custom_ops.py similarity index 100% rename from vllm/_custom_ops/_cuda_ops.py rename to vllm/_custom_ops.py diff --git a/vllm/_custom_ops/__init__.py b/vllm/_custom_ops/__init__.py deleted file mode 100644 index 2411a1465c187..0000000000000 --- a/vllm/_custom_ops/__init__.py +++ /dev/null @@ -1,75 +0,0 @@ - -from functools import lru_cache - -@lru_cache(maxsize=None) -def is_hip() -> bool: - return torch.version.hip is not None - - -@lru_cache(maxsize=None) -def is_cpu() -> bool: - from importlib.metadata import PackageNotFoundError, version - try: - return "cpu" in version("vllm") - except PackageNotFoundError: - return False - - -@lru_cache(maxsize=None) -def is_openvino() -> bool: - from importlib.metadata import PackageNotFoundError, version - try: - return "openvino" in version("vllm") - except PackageNotFoundError: - return False - - -@lru_cache(maxsize=None) -def is_neuron() -> bool: - try: - import transformers_neuronx - except ImportError: - transformers_neuronx = None - return transformers_neuronx is not None - - -@lru_cache(maxsize=None) -def is_hpu() -> bool: - from importlib import util - return util.find_spec('habana_frameworks') is not None - - -@lru_cache(maxsize=None) -def is_tpu() -> bool: - try: - import libtpu - except ImportError: - libtpu = None - return libtpu is not None - - -@lru_cache(maxsize=None) -def is_xpu() -> bool: - from importlib.metadata import version - is_xpu_flag = "xpu" in version("vllm") - # vllm is not build with xpu - if not is_xpu_flag: - return False - try: - import intel_extension_for_pytorch as ipex # noqa: F401 - _import_ipex = True - except ImportError as e: - logger.warning("Import Error for IPEX: %s", e.msg) - _import_ipex = False - # ipex dependency is not ready - if not _import_ipex: - logger.warning("not found ipex lib") - return False - return hasattr(torch, "xpu") and torch.xpu.is_available() - -if is_xpu(): - from ._ipex_ops import * -elif is_hpu(): - from ._hpu_ops import * -else: - from ._cuda_ops import * \ No newline at end of file diff --git a/vllm/_custom_ops/_hpu_ops.py b/vllm/_custom_ops/_hpu_ops.py deleted file mode 100644 index d553540f9e25a..0000000000000 --- a/vllm/_custom_ops/_hpu_ops.py +++ /dev/null @@ -1,317 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. 
-############################################################################### -import os -from typing import Optional, Tuple - -import habana_frameworks.torch as htorch -import torch -import torch.nn.functional as F - -import vllm.hpu.utils as hpu_utils - -PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '1') == '1') - - -def silu_and_mul(output, input): - d = input.shape[-1] // 2 - silu = torch.nn.SiLU().to(input.device) - x, y = torch.split(input, d, dim=-1) - output.copy_(silu(x) * y) - - -def fetch_from_cache(cache, blocks, permutations): - return [ - cache.index_select(0, blocks[:, i]).permute(permutations) - for i in range(blocks.size(1)) - ] - - -def paged_attention_v1(query, - key_cache, - value_cache, - head_mapping, - scale, - block_tables, - context_lens, - block_size, - alibi_slopes=None, - kv_cache_dtype=None, - qk_matmul_op=torch.matmul, - softmax_op=torch.softmax, - av_matmul_op=torch.matmul, - k_cache_cls=None, - v_cache_cls=None) -> None: - seq_len = block_tables.size(1) - batch_size, query_heads, _ = query.shape - _, _, kv_heads, _ = key_cache.shape - min_inf = torch.finfo(query.dtype).min - mask = (torch.arange(0, - seq_len * block_size, - dtype=torch.int32, - device=key_cache.device).view(1, -1).expand( - batch_size, -1).ge(context_lens.view(-1, 1)).view( - batch_size, 1, 1, -1)) - query.mul_(scale) - query = query.unsqueeze(-2) - fetch_keys = fetch_from_cache if k_cache_cls is None else k_cache_cls.fetch_from_cache - keys = fetch_keys(key_cache, block_tables, (0, 2, 3, 1)) - if query_heads != kv_heads: - query = query.unflatten(1, (kv_heads, -1)) - keys = [k.unflatten(1, (kv_heads, 1)) for k in keys] - mask = mask.unsqueeze(2) - - attn_weights = [qk_matmul_op(query, k) for k in keys] - attn_weights = torch.cat(attn_weights, dim=-1) - if alibi_slopes is not None: - attn_weights.add_(alibi_slopes[:, :, -attn_weights.size(2):, - -attn_weights.size(3):]) - attn_weights = softmax_op(attn_weights.masked_fill(mask, min_inf), dim=-1) - - fetch_values = fetch_from_cache if v_cache_cls is None else k_cache_cls.fetch_from_cache - values = fetch_values(value_cache, block_tables, (0, 2, 1, 3)) - if PA_SPLIT_VALUE: - attn_weights = attn_weights.split(block_size, dim=-1) - else: - values = [torch.cat(values, dim=-2)] - attn_weights = [attn_weights] - if query_heads != kv_heads: - values = [v.unflatten(1, (kv_heads, 1)) for v in values] - attn_weights = [av_matmul_op(a, v) for a, v in zip(attn_weights, values)] - if query_heads != kv_heads: - attn_weights = [a.flatten(1, 2) for a in attn_weights] - attn_weights = sum(attn_weights) - return attn_weights.squeeze(-2) - - -def silu_and_mul_wrapper(x: torch.Tensor) -> torch.Tensor: - d = x.shape[-1] // 2 - output_shape = (x.shape[:-1] + (d, )) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - silu_and_mul(out, x) - return out - - -def static_fused_moe(hidden_states, w1, w2, score, topk): - B, D = hidden_states.shape - num_experts = w1.shape[0] - routing_weights = F.softmax(score, dim=1, dtype=torch.float32) - routing_weights, selected_experts = torch.topk(routing_weights, - topk, - dim=-1) - routing_weights /= routing_weights.sum(dim=-1, keepdim=True) - routing_weights = routing_weights.to(hidden_states.dtype) - final_hidden_states = torch.zeros((1, B, D), - dtype=hidden_states.dtype, - device=hidden_states.device) - padded_weights = torch.zeros((B, num_experts), - dtype=hidden_states.dtype, - device=hidden_states.device) - padded_weights.scatter_(-1, selected_experts, routing_weights) - padded_weights = 
padded_weights.reshape(-1, B, w1.shape[0]) - padded_weights = padded_weights.permute(2, 0, 1).unsqueeze(-1) - - htorch.core.mark_step() - - for expert_idx in range(num_experts): - padded_weight = padded_weights[expert_idx] - current_state_static = hidden_states.reshape(-1, D) - w_output = silu_and_mul_wrapper( - torch.matmul(current_state_static, w1[expert_idx].transpose(0, 1))) - w_output = torch.matmul(w_output, w2[expert_idx].transpose(0, 1)) - current_hidden_states_static = w_output * padded_weight - final_hidden_states += current_hidden_states_static - htorch.core.mark_step() - - return final_hidden_states.view(-1, D) - - -def prompt_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attn_bias: Optional[torch.Tensor] = None, - p: float = 0.0, - scale: Optional[float] = None, - qk_matmul_op = torch.matmul, - softmax_op = torch.softmax, - av_matmul_op = torch.matmul, -) -> torch.Tensor: - query = query.transpose(1, 2) - key = key.transpose(1, 2) - value = value.transpose(1, 2) - query_heads = query.size(1) - kv_heads = key.size(1) - if query_heads != kv_heads: - query = query.unflatten(1, (kv_heads, -1)) - key = key.unflatten(1, (kv_heads, 1)) - value = value.unflatten(1, (kv_heads, 1)) - attn_bias = attn_bias.unsqueeze(2) - attn_weights = qk_matmul_op(query * scale, key.transpose(-1, -2)) - if attn_bias is not None: - attn_weights.add_(attn_bias) - attn_weights = softmax_op(attn_weights, dim=-1) - attn_weights = av_matmul_op(attn_weights, value) - if query_heads != kv_heads: - attn_weights = attn_weights.flatten(1, 2) - attn_weights = attn_weights.transpose(1, 2) - return attn_weights - - - - -def reshape_and_cache(key, - value, - key_cache, - value_cache, - slot_mapping, - dtype, - is_prompt=False): - num_blocks = key_cache.size(0) - block_size = key_cache.size(1) - slot_mapping = slot_mapping.flatten() - indices = torch.div(slot_mapping, block_size, rounding_mode="floor") - offsets = torch.fmod(slot_mapping, block_size) - num_slots_requested = slot_mapping.size(0) - num_slots_available = num_blocks * block_size - # NOTE(kzawora): HPU PT bridge crashes with - # RuntimeError: Invalid inputs for scatter_nd_onnx - # on index_put when num_slots_requested > num_slots_available. - # This case might occur when we have little kv cache blocks and - # lots of padding, or are doing warmup. - # This loop is a workaround for this issue. Please remove it - # once key_cache.index_put_(indices, offsets), key) works. - num_kv_cache_passes = torch.div(num_slots_requested, - num_slots_available).ceil().int().item() - for i in range(num_kv_cache_passes): - start_idx = i * num_slots_available - end_idx = (i + 1) * num_slots_available - key_cache.index_put_( - (indices[start_idx:end_idx], offsets[start_idx:end_idx]), - key[start_idx:end_idx]) - value_cache.index_put_( - (indices[start_idx:end_idx], offsets[start_idx:end_idx]), - value[start_idx:end_idx]) - - -def prepare_to_cache(cache, slot_mapping): - num_blocks = cache.size(0) - block_size = cache.size(1) - slot_mapping = slot_mapping.flatten() - indices = torch.div(slot_mapping, block_size, rounding_mode="floor") - offsets = torch.fmod(slot_mapping, block_size) - num_slots_requested = slot_mapping.size(0) - num_slots_available = num_blocks * block_size - # NOTE(kzawora): HPU PT bridge crashes with - # RuntimeError: Invalid inputs for scatter_nd_onnx - # on index_put when num_slots_requested > num_slots_available. - # This case might occur when we have little kv cache blocks and - # lots of padding, or are doing warmup. 
- # This loop is a workaround for this issue. Please remove it - # once key_cache.index_put_(indices, offsets), key) works. - num_kv_cache_passes = torch.div(num_slots_requested, - num_slots_available).ceil().int().item() - - return num_kv_cache_passes, num_slots_available, indices, offsets - - -def insert_or_update_cache(input, cache, num_kv_cache_passes, num_slots_available, block_indices, block_offsets): - for i in range(num_kv_cache_passes): - start_idx = i * num_slots_available - end_idx = (i + 1) * num_slots_available - cache.index_put_( - (block_indices[start_idx:end_idx], block_offsets[start_idx:end_idx]), - input[start_idx:end_idx]) - - -def swap_blocks(src, dst, block_mapping): - index_src = torch.zeros((1, ), dtype=torch.int32, device=src.device) - index_dst = torch.zeros((1, ), dtype=torch.int32, device=dst.device) - for src_idx, dst_idx in block_mapping.items(): - index_src[0] = src_idx - index_dst[0] = dst_idx - dst.index_put_([index_dst], src.index_select(0, index_src)) - if dst.device.type == 'hpu': - htorch.core.mark_step() - torch.hpu.synchronize() - - -def copy_blocks(key_caches, value_caches, block_mapping): - index_src = torch.zeros((1, ), - dtype=torch.int32, - device=key_caches[0].device) - index_dst = torch.zeros((1, ), - dtype=torch.int32, - device=key_caches[0].device) - for src, dsts in block_mapping.items(): - index_src[0] = src - for dst in dsts: - index_dst[0] = dst - for key_cache in key_caches: - key_cache.index_copy_(0, index_dst, - key_cache.index_select(0, index_src)) - for value_cache in value_caches: - value_cache.index_copy_(0, index_dst, - value_cache.index_select(0, index_src)) - if key_caches[0].device.type == 'hpu': - htorch.core.mark_step() - - -# fp8 -def scaled_fp8_quant( - input: torch.Tensor, - scale: Optional[torch.Tensor] = None, - batch_dim_padding: Optional[int] = None, - scale_ub: Optional[torch.Tensor] = None, - use_per_token_if_dynamic: bool = False, -) -> Tuple[torch.Tensor, torch.Tensor]: - - """ - Quantize input tensor to FP8 and return quantized tensor and scale. - - This function supports both static and dynamic quantization: If you - provide the scale, it will use static scaling and if you omit it, - the scale will be determined dynamically. The function also allows - optional padding of the output tensor for downstream kernels that - will benefit from padding. - - Args: - input: The input tensor to be quantized to FP8 - scale: Optional scaling factor for the FP8 quantization - scale_ub: Optional upper bound for scaling factor in dynamic - per token case - batch_dim_padding: If specified, pad the first dimension - of the output to at least this value. - use_per_token_if_dynamic: Whether to do per_tensor or per_token - in the dynamic quantization case. - - Returns: - Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and - scaling factor. 
- """ - if batch_dim_padding: - shape = (max(batch_dim_padding, input.shape[0]), *input.shape[1:]) - output = torch.empty(shape, - device=input.device, - dtype=torch.float8_e4m3fn) - else: - output = torch.empty_like(input, dtype=torch.float8_e4m3fn) - if scale is None: - raise "dynamic scaled_fp8_quant not implemented for HPU" - #TODO: calculate scale to match gaudi2 240 range instead of 448 - if use_per_token_if_dynamic: - scale = torch.empty((input.numel() // input.shape[-1], 1), - device=input.device, - dtype=torch.float32) - torch.ops._C.dynamic_per_token_scaled_fp8_quant( - output, input, scale, scale_ub) - else: - scale = torch.zeros(1, device=input.device, dtype=torch.float32) - torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale) - else: - output = torch.ops.hpu.cast_to_fp8_v2(input, 1/scale, False, False, dtype=torch.float8_e4m3fn)[0] - - return output, scale diff --git a/vllm/_custom_ops/_ipex_ops.py b/vllm/_ipex_ops.py similarity index 100% rename from vllm/_custom_ops/_ipex_ops.py rename to vllm/_ipex_ops.py diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index badb29af1f5f6..39d00bd5733ff 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -233,7 +233,7 @@ def _get_scheme_from_parts( if is_activation_quantization_format(self.quant_format): if self._is_fp8_w8a8(weight_quant, input_quant): is_fp8_w8a8_supported = self._check_scheme_supported( - CompressedTensorsW8A8Fp8.get_min_capability(), error=False) if torch.cuda.is_available() else True + CompressedTensorsW8A8Fp8.get_min_capability(), error=False) if is_fp8_w8a8_supported: return CompressedTensorsW8A8Fp8( strategy=weight_quant.strategy, @@ -306,8 +306,7 @@ def get_scheme( # Raise error if device does not support the scheme # (e.g. 
fp8 needs ada lovelace) - if torch.cuda.is_available(): - self._check_scheme_supported(scheme.get_min_capability()) + self._check_scheme_supported(scheme.get_min_capability()) return scheme diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 631774994b5c0..cc9d71db140c2 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -21,7 +21,7 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): def __init__(self, strategy: str, is_static_input_scheme: bool): self.strategy = strategy self.is_static_input_scheme = is_static_input_scheme - self.cutlass_fp8_supported = cutlass_fp8_supported() if torch.cuda.is_available() else False + self.cutlass_fp8_supported = cutlass_fp8_supported() @classmethod def get_min_capability(cls) -> int: diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 8e2ed041adf0b..c829cb836ee4c 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -112,18 +112,13 @@ class Fp8LinearMethod(LinearMethodBase): def __init__(self, quant_config: Fp8Config): self.quant_config = quant_config - - if torch.cuda.is_available(): - self.cutlass_fp8_supported = cutlass_fp8_supported() - - # For GPUs that lack FP8 hardware support, we can leverage the Marlin - # kernel for fast weight-only FP8 quantization - capability = current_platform.get_device_capability() - capability = capability[0] * 10 + capability[1] - self.use_marlin = capability < 89 - else: - self.cutlass_fp8_supported = False - self.use_marlin = False + self.cutlass_fp8_supported = cutlass_fp8_supported() + + # For GPUs that lack FP8 hardware support, we can leverage the Marlin + # kernel for fast weight-only FP8 quantization + capability = current_platform.get_device_capability() + capability = capability[0] * 10 + capability[1] + self.use_marlin = capability < 89 def create_weights( self, diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index de5cd810b2a94..20100c76bd690 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -6,8 +6,6 @@ from vllm import _custom_ops as ops from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -if current_platform.is_hpu(): - import habana_frameworks.torch.utils.experimental as htexp def cutlass_fp8_supported() -> bool: @@ -20,17 +18,8 @@ def cutlass_fp8_supported() -> bool: def per_tensor_dequantize( tensor: torch.Tensor, inv_scale: Union[float, torch.Tensor]) -> torch.Tensor: - dtype = torch.float16 - device = tensor.device - if current_platform.is_hpu(): - dtype = torch.bfloat16 - if htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi2: - #dequant on cpu to avoid nan on gaudi2 - tensor = tensor.to('cpu') - - fake_qweight = tensor.to(dtype).to(device) + fake_qweight = tensor.to(torch.float16) dq_weight = fake_qweight * inv_scale - return dq_weight @@ -87,9 +76,6 @@ def requantize_with_max_scale( logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]: # Max scale to be used for requanitzation. 
max_w_scale = weight_scale.max() - if current_platform.is_hpu() and htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi2: - max_w_scale = max_w_scale * (448/240) - # QKV / MLP is fused in the on disk checkpoint if any of the # weight scales are still set to the default since we initialize @@ -161,25 +147,12 @@ def apply_fp8_linear( if per_tensor_weights and per_tensor_activations: # Fused GEMM_DQ - if current_platform.is_hpu(): - #hpu does not support torch._scaled_mm (SW-197036) - output = torch.ops.hpu.fp8_gemm_v2(qinput, - False, - weight, - False, - None, - input.dtype, - x_scale, - weight_scale, - None, - False) - else: - output, _ = torch._scaled_mm(qinput, - weight, - out_dtype=input.dtype, - scale_a=x_scale, - scale_b=weight_scale, - bias=bias) + output, _ = torch._scaled_mm(qinput, + weight, + out_dtype=input.dtype, + scale_a=x_scale, + scale_b=weight_scale, + bias=bias) return torch.narrow(output, 0, 0, input.shape[0]) else: diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index f02609aa9ff3b..676a51ce67f96 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -54,9 +54,6 @@ from .interfaces import SupportsLoRA from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers -from vllm.platforms import current_platform -if current_platform.is_hpu(): - import habana_frameworks.torch.core as htcore class LlamaMLP(nn.Module): @@ -521,10 +518,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): default_weight_loader) weight_loader(param, loaded_weight) - #Avoid OOM due to large graph when loading weights - if current_platform.is_hpu(): - htcore.mark_step() - # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). 
Thus, handled exceptions should # make sure to leave KV cache scale factors in a known good (dummy) state diff --git a/vllm/utils.py b/vllm/utils.py index 661d5d62e069b..fa6e132dd3522 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -176,25 +176,69 @@ def clear(self): def is_hip() -> bool: - return ops.is_hip() + return torch.version.hip is not None + +@lru_cache(maxsize=None) def is_cpu() -> bool: - return ops.is_cpu() + from importlib.metadata import PackageNotFoundError, version + try: + return "cpu" in version("vllm") + except PackageNotFoundError: + return False + +@lru_cache(maxsize=None) def is_openvino() -> bool: - return ops.is_openvino() + from importlib.metadata import PackageNotFoundError, version + try: + return "openvino" in version("vllm") + except PackageNotFoundError: + return False + +@lru_cache(maxsize=None) def is_neuron() -> bool: - return ops.is_neuron() + try: + import transformers_neuronx + except ImportError: + transformers_neuronx = None + return transformers_neuronx is not None + +@lru_cache(maxsize=None) def is_hpu() -> bool: - return ops.is_hpu() + from importlib import util + return util.find_spec('habana_frameworks') is not None + +@lru_cache(maxsize=None) def is_tpu() -> bool: - return ops.is_tpu() + try: + import libtpu + except ImportError: + libtpu = None + return libtpu is not None + +@lru_cache(maxsize=None) def is_xpu() -> bool: - return ops.is_xpu() + from importlib.metadata import version + is_xpu_flag = "xpu" in version("vllm") + # vllm is not build with xpu + if not is_xpu_flag: + return False + try: + import intel_extension_for_pytorch as ipex # noqa: F401 + _import_ipex = True + except ImportError as e: + logger.warning("Import Error for IPEX: %s", e.msg) + _import_ipex = False + # ipex dependency is not ready + if not _import_ipex: + logger.warning("not found ipex lib") + return False + return hasattr(torch, "xpu") and torch.xpu.is_available() @lru_cache(maxsize=None) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index a2c7a96757faa..a975dba6f5136 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -453,7 +453,8 @@ def __init__( def load_model(self) -> None: import habana_frameworks.torch.core as htcore - htcore.hpu_set_env() + if self.model_config.quantization == 'inc': + htcore.hpu_set_env() with HabanaMemoryProfiler() as m: with HabanaMemoryProfiler() as m_getmodel: self.model = get_model( diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index bf285c93cdd47..9d083915041fe 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -109,7 +109,8 @@ def init_device(self) -> None: raise RuntimeError( f"Not support device type: {self.device_config.device}") # Initialize the distributed environment. 
- self._set_env_vars() + if self.model_config.quantization == 'inc': + self._set_env_vars() init_worker_distributed_environment(self.parallel_config, self.rank, self.distributed_init_method, self.local_rank) From c899aef31c064523daa5c38746d203dc148518cc Mon Sep 17 00:00:00 2001 From: Agata Dobrzyniewicz Date: Mon, 2 Sep 2024 12:54:54 +0300 Subject: [PATCH 144/341] warmup_mode kward restore --- vllm/worker/habana_model_runner.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 0100076aec8e2..241980f32f097 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1615,7 +1615,10 @@ def execute_model( if multi_modal_input is not None: execute_model_kwargs.update(multi_modal_input) if htorch.utils.internal.is_lazy(): - execute_model_kwargs.update({"bypass_hpu_graphs": not use_graphs}) + execute_model_kwargs.update({ + "bypass_hpu_graphs": not use_graphs, + "warmup_mode": warmup_mode + }) htorch.core.mark_step() if self.is_driver_worker: From 4eedfb91c8ef33a601b9e203a7ad8048d854222f Mon Sep 17 00:00:00 2001 From: Agata Dobrzyniewicz Date: Mon, 2 Sep 2024 14:24:59 +0300 Subject: [PATCH 145/341] change format --- vllm/worker/habana_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 241980f32f097..dec1b65858eb4 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1567,8 +1567,8 @@ def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode): self.seen_configs.add(cfg) if not seen and not warmup_mode: phase = 'prompt' if is_prompt else 'decode' - logger.warning('Configuration: (', phase, ', ', batch_size, ', ', - seq_len, ') was not warmed-up!') + logger.warning("Configuration: (%s, %s, %s) was not warmed-up!", + phase, batch_size, seq_len) @torch.inference_mode() def execute_model( From 1dccf88380cbbb1c73e033009b2485f39cf6fde1 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Wed, 21 Aug 2024 13:43:53 +0300 Subject: [PATCH 146/341] POC for bgmv --- vllm/decode.py | 3 +++ vllm/hpu/ops.py | 30 ++++++++++++++++++++++++------ vllm/worker/habana_model_runner.py | 3 +++ 3 files changed, 30 insertions(+), 6 deletions(-) create mode 100644 vllm/decode.py diff --git a/vllm/decode.py b/vllm/decode.py new file mode 100644 index 0000000000000..bab140559e321 --- /dev/null +++ b/vllm/decode.py @@ -0,0 +1,3 @@ +def init(): + global is_decode + is_decode = False \ No newline at end of file diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 662c53486b4ca..67f6bef2ab9a4 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -12,6 +12,7 @@ import torch.nn.functional as F from vllm.logger import init_logger +import vllm.decode as decode logger = init_logger(__name__) HPUFusedRMSNorm = None @@ -222,13 +223,30 @@ def dispatch_bgmv_linear( max_loras = wa_t_all.size(0) # Wrap-around for negative indices indices = indices % max_loras - wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) - wb = torch.index_select(wb_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) + if decode.is_decode: + wa = wa_t_all[:, 0, :, :].transpose(0, 2) + wb = wb_t_all[:, 0, :, :].transpose(1, 2) + wa_shape = wa.shape + wb_shape = wb.shape + wa = wa.reshape(wa_shape[0], wa_shape[1] * wa_shape[2]) + wb = wb.reshape(wb_shape[0] * wb_shape[1], wb_shape[2]) + out = x @ wa + mask = torch.zeros(out.shape[0], out.shape[1], 
dtype=out.dtype) + for i in range(out.shape[0]): + if indices[i] < 0: + continue + start_pos = indices[i] * wa_shape[1] + mask[i, start_pos : start_pos : start_pos + wa_shape[1]] = 1 + out = out * mask.to('hpu') + out = out@wb + else: + wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) + wb = torch.index_select(wb_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) - x = x.unsqueeze(1) - out = x @ wa - out = out @ wb - out = out.squeeze(1) + x = x.unsqueeze(1) + out = x @ wa + out = out @ wb + out = out.squeeze(1) y += out * scale diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index dec1b65858eb4..d81aea05caecd 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -33,6 +33,7 @@ SequenceGroupMetadata) from vllm.utils import (HabanaMemoryProfiler, format_bytes, is_pin_memory_available, make_tensor_with_pad) +import vllm.decode as decode from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, _add_attn_metadata_broadcastable_dict, @@ -627,6 +628,7 @@ def _prepare_prompt( if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() + decode.is_decode = False for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt @@ -834,6 +836,7 @@ def _prepare_decode( if len(seq_group_metadata_list) == 0: return PrepareDecodeMetadata.empty() + decode.is_decode = True for seq_group_metadata in seq_group_metadata_list: assert not seq_group_metadata.is_prompt From c8e49552730c30d5f07d085694b72e597edef089 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Mon, 26 Aug 2024 15:15:52 +0300 Subject: [PATCH 147/341] Prompt mask implementation --- vllm/decode.py | 4 ++-- vllm/hpu/ops.py | 18 +++++---------- vllm/worker/habana_model_runner.py | 36 +++++++++++++++++++++++++++--- 3 files changed, 41 insertions(+), 17 deletions(-) diff --git a/vllm/decode.py b/vllm/decode.py index bab140559e321..1cf8ea1cdbe11 100644 --- a/vllm/decode.py +++ b/vllm/decode.py @@ -1,3 +1,3 @@ def init(): - global is_decode - is_decode = False \ No newline at end of file + global mask + mask = None \ No newline at end of file diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 67f6bef2ab9a4..ed3fab733bc59 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -223,21 +223,15 @@ def dispatch_bgmv_linear( max_loras = wa_t_all.size(0) # Wrap-around for negative indices indices = indices % max_loras - if decode.is_decode: - wa = wa_t_all[:, 0, :, :].transpose(0, 2) - wb = wb_t_all[:, 0, :, :].transpose(1, 2) + if decode.mask is not None: + wa = wa_t_all[:, 0, :, :] + wb = wb_t_all[:, 0, :, :].transpose(0, 1) wa_shape = wa.shape wb_shape = wb.shape - wa = wa.reshape(wa_shape[0], wa_shape[1] * wa_shape[2]) - wb = wb.reshape(wb_shape[0] * wb_shape[1], wb_shape[2]) + wa = wa.reshape(wa_shape[0] * wa_shape[1], wa_shape[2]).transpose(0, 1) + wb = wb.reshape(wb_shape[0], wb_shape[1] * wb_shape[2]).transpose(0, 1) out = x @ wa - mask = torch.zeros(out.shape[0], out.shape[1], dtype=out.dtype) - for i in range(out.shape[0]): - if indices[i] < 0: - continue - start_pos = indices[i] * wa_shape[1] - mask[i, start_pos : start_pos : start_pos + wa_shape[1]] = 1 - out = out * mask.to('hpu') + out = out * decode.mask out = out@wb else: wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index d81aea05caecd..e78cf86a54ad1 100644 --- a/vllm/worker/habana_model_runner.py +++ 
b/vllm/worker/habana_model_runner.py @@ -628,7 +628,7 @@ def _prepare_prompt( if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() - decode.is_decode = False + decode.mask = None for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt @@ -741,18 +741,33 @@ def _prepare_prompt( find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), self.block_size) + if self.lora_config: + decode.mask = torch.zeros(len(seq_group_metadata_list) * max_prompt_len, + (self.lora_config.max_loras + 1) * self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + ones = torch.ones(max_prompt_len, self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) + counter = 0 for seq_group_metadata, context_len in zip(seq_group_metadata_list, context_lens): lora_id = seq_group_metadata.lora_int_id if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) + start_row = counter * max_prompt_len + end_row = start_row + max_prompt_len + start_col = (lora_id - 1) * self.lora_config.max_lora_rank + end_col = start_col + self.lora_config.max_lora_rank + decode.mask[start_row:end_row, start_col:end_col] = ones + counter = counter + 1 lora_index_mapping += [lora_id] * (max_prompt_len - context_len) lora_prompt_mapping.extend( [lora_id] * (max_prompt_len - context_len if seq_group_metadata.sampling_params.prompt_logprobs else 1)) + + if decode.mask is not None: + decode.mask = decode.mask.to('hpu') input_tokens = make_tensor_with_pad(input_tokens, max_len=max_prompt_len, @@ -836,7 +851,14 @@ def _prepare_decode( if len(seq_group_metadata_list) == 0: return PrepareDecodeMetadata.empty() - decode.is_decode = True + decode.mask = None + + if self.lora_config: + decode.mask = torch.zeros(len(seq_group_metadata_list), + (self.lora_config.max_loras + 1) * self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + ones = torch.ones(1, self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) + counter = 0 for seq_group_metadata in seq_group_metadata_list: assert not seq_group_metadata.is_prompt @@ -847,6 +869,10 @@ def _prepare_decode( if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) + start_pos = (lora_id - 1) * self.lora_config.max_lora_rank + end_pos = start_pos + self.lora_config.max_lora_rank + decode.mask[counter, start_pos:end_pos] = ones + counter = counter + 1 for seq_id in seq_ids: seq_data = seq_group_metadata.seq_data[seq_id] @@ -875,6 +901,8 @@ def _prepare_decode( block_table = block_table[-sliding_window_blocks:] block_tables.append(block_table) + if decode.mask is not None: + decode.mask = decode.mask.to('hpu') input_tokens = torch.tensor(input_tokens, dtype=torch.long, device=self.device) @@ -1152,6 +1180,7 @@ def profile_run(self) -> None: True, kv_caches, is_profile_run=True) + return def warmup_scenario(self, batch_size, @@ -1206,7 +1235,7 @@ def warmup_scenario(self, if dummy_lora_requests_per_seq else None) for i in range(batch_size) ] - torch.hpu.synchronize() + #torch.hpu.synchronize() for _ in range(times): inputs = self.prepare_model_input(seqs) self.execute_model(inputs, kv_caches, warmup_mode=True) @@ -1647,6 +1676,7 @@ def execute_model( module.indices_len[ i] = sampling_metadata.selected_token_indices.numel( ) + decode.mask = None # Compute the logits. 
with self.profiler.record_event( From 9df71977a62da0f5c0bd475c96112d32d2b7afbd Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Wed, 28 Aug 2024 11:09:40 +0300 Subject: [PATCH 148/341] Multi Lora Fix --- vllm/hpu/ops.py | 3 ++- vllm/worker/habana_model_runner.py | 5 ++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index ed3fab733bc59..9e17339b0e574 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -222,7 +222,6 @@ def dispatch_bgmv_linear( assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' max_loras = wa_t_all.size(0) # Wrap-around for negative indices - indices = indices % max_loras if decode.mask is not None: wa = wa_t_all[:, 0, :, :] wb = wb_t_all[:, 0, :, :].transpose(0, 1) @@ -231,9 +230,11 @@ def dispatch_bgmv_linear( wa = wa.reshape(wa_shape[0] * wa_shape[1], wa_shape[2]).transpose(0, 1) wb = wb.reshape(wb_shape[0], wb_shape[1] * wb_shape[2]).transpose(0, 1) out = x @ wa + assert(out.shape == decode.mask.shape) out = out * decode.mask out = out@wb else: + indices = indices % max_loras wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) wb = torch.index_select(wb_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index e78cf86a54ad1..152d7f3000572 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -758,7 +758,7 @@ def _prepare_prompt( start_col = (lora_id - 1) * self.lora_config.max_lora_rank end_col = start_col + self.lora_config.max_lora_rank decode.mask[start_row:end_row, start_col:end_col] = ones - counter = counter + 1 + counter = counter + 1 lora_index_mapping += [lora_id] * (max_prompt_len - context_len) lora_prompt_mapping.extend( @@ -872,7 +872,7 @@ def _prepare_decode( start_pos = (lora_id - 1) * self.lora_config.max_lora_rank end_pos = start_pos + self.lora_config.max_lora_rank decode.mask[counter, start_pos:end_pos] = ones - counter = counter + 1 + counter = counter + 1 for seq_id in seq_ids: seq_data = seq_group_metadata.seq_data[seq_id] @@ -1180,7 +1180,6 @@ def profile_run(self) -> None: True, kv_caches, is_profile_run=True) - return def warmup_scenario(self, batch_size, From 234ffdc637eac9707c1afce7766f8d2445dc5289 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Wed, 28 Aug 2024 13:51:41 +0300 Subject: [PATCH 149/341] HPU Graph Fix for decode mask --- vllm/worker/habana_model_runner.py | 47 +++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 152d7f3000572..9d7881768a0d0 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -230,10 +230,11 @@ def forward(self, *args, **kwargs): input_ids.size(1), input_ids.device, torch.bfloat16) + decode.mask = kwargs.pop('mask') hidden_states = self.model(*args, **kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) hidden_states = hidden_states.index_select(0, selected_token_indices) - return hidden_states + return hidden_states, decode.mask def compute_logits(self, *args, **kwargs): return self.model.compute_logits(*args, **kwargs) @@ -253,6 +254,7 @@ class PreparePromptMetadata(NamedTuple): lora_requests: Set[LoRARequest] multi_modal_input: Optional[torch.Tensor] slot_mapping: List[List[int]] + mask: Optional[torch.Tensor] @classmethod def empty(cls): @@ -267,6 +269,7 @@ def empty(cls): lora_requests=set(), 
multi_modal_input=None, slot_mapping=[], + mask = None ) @@ -278,6 +281,7 @@ class PrepareDecodeMetadata(NamedTuple): lora_prompt_mapping: List[List[int]] lora_requests: Set[LoRARequest] slot_mapping: List[List[int]] + mask: Optional[torch.Tensor] @classmethod def empty(cls): @@ -289,6 +293,7 @@ def empty(cls): lora_prompt_mapping=[], lora_requests=set(), slot_mapping=[], + mask=None, ) @@ -324,6 +329,7 @@ class ModelInputForHPU(ModelRunnerInputBase): real_batch_size: Optional[int] = None batch_size_padded: Optional[int] = None virtual_engine: int = 0 + mask: Optional[torch.Tensor] = None def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { @@ -334,7 +340,8 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "multi_modal_kwargs": self.multi_modal_kwargs, "real_batch_size": self.real_batch_size, "batch_size_padded": self.batch_size_padded, - "virtual_engine": self.virtual_engine + "virtual_engine": self.virtual_engine, + "mask": mask } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) return tensor_dict @@ -368,6 +375,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "lora_requests": self.lora_requests, "lora_mapping": self.lora_mapping, "multi_modal_kwargs": self.multi_modal_kwargs, + "mask": self.mask } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) _add_sampling_metadata_broadcastable_dict(tensor_dict, @@ -628,7 +636,7 @@ def _prepare_prompt( if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() - decode.mask = None + mask = None for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt @@ -742,7 +750,7 @@ def _prepare_prompt( self.block_size) if self.lora_config: - decode.mask = torch.zeros(len(seq_group_metadata_list) * max_prompt_len, + mask = torch.zeros(len(seq_group_metadata_list) * max_prompt_len, (self.lora_config.max_loras + 1) * self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) ones = torch.ones(max_prompt_len, self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) @@ -757,7 +765,7 @@ def _prepare_prompt( end_row = start_row + max_prompt_len start_col = (lora_id - 1) * self.lora_config.max_lora_rank end_col = start_col + self.lora_config.max_lora_rank - decode.mask[start_row:end_row, start_col:end_col] = ones + mask[start_row:end_row, start_col:end_col] = ones counter = counter + 1 lora_index_mapping += [lora_id] * (max_prompt_len - context_len) @@ -766,8 +774,8 @@ def _prepare_prompt( (max_prompt_len - context_len if seq_group_metadata.sampling_params.prompt_logprobs else 1)) - if decode.mask is not None: - decode.mask = decode.mask.to('hpu') + if mask is not None: + mask = mask.to('hpu') input_tokens = make_tensor_with_pad(input_tokens, max_len=max_prompt_len, @@ -834,6 +842,7 @@ def _prepare_prompt( lora_requests=lora_requests, multi_modal_input=multi_modal_input, slot_mapping=slot_mapping, + mask=mask, ) def _prepare_decode( @@ -851,10 +860,10 @@ def _prepare_decode( if len(seq_group_metadata_list) == 0: return PrepareDecodeMetadata.empty() - decode.mask = None + mask = None if self.lora_config: - decode.mask = torch.zeros(len(seq_group_metadata_list), + mask = torch.zeros(len(seq_group_metadata_list), (self.lora_config.max_loras + 1) * self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) ones = torch.ones(1, self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) @@ -871,7 +880,7 @@ def _prepare_decode( lora_requests.add(seq_group_metadata.lora_request) start_pos = (lora_id - 1) * 
self.lora_config.max_lora_rank end_pos = start_pos + self.lora_config.max_lora_rank - decode.mask[counter, start_pos:end_pos] = ones + mask[counter, start_pos:end_pos] = ones counter = counter + 1 for seq_id in seq_ids: @@ -901,8 +910,8 @@ def _prepare_decode( block_table = block_table[-sliding_window_blocks:] block_tables.append(block_table) - if decode.mask is not None: - decode.mask = decode.mask.to('hpu') + if mask is not None: + mask = mask.to('hpu') input_tokens = torch.tensor(input_tokens, dtype=torch.long, device=self.device) @@ -948,6 +957,7 @@ def _prepare_decode( lora_prompt_mapping=lora_prompt_mapping, lora_requests=lora_requests, slot_mapping=slot_mapping, + mask=mask, ) def prepare_input_tensors( @@ -1002,6 +1012,7 @@ def prepare_input_tensors( lora_requests, multi_modal_input, slot_mapping, + mask, ) = self._prepare_prompt(prefill_reqs) ( decode_input_tokens, @@ -1011,6 +1022,7 @@ def prepare_input_tensors( decode_lora_prompt_mapping, decode_lora_requests, decode_slot_mapping, + decode_mask, ) = self._prepare_decode(decode_reqs) sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, seq_lens, query_lens, @@ -1037,6 +1049,7 @@ def prepare_input_tensors( lora_index_mapping = decode_lora_index_mapping lora_prompt_mapping = decode_lora_prompt_mapping lora_requests = decode_lora_requests + mask = decode_mask # FIXME: We need to adjust selected_token_indices to accommodate # for padding @@ -1106,7 +1119,8 @@ def prepare_input_tensors( lora_mapping=lora_mapping, multi_modal_kwargs=multi_modal_input, real_batch_size=real_batch_size, - batch_size_padded=batch_size_padded), sampling_metadata + batch_size_padded=batch_size_padded, + mask=mask), sampling_metadata def _seq_len(self, attn_metadata): if attn_metadata.num_prefills != 0: @@ -1651,6 +1665,11 @@ def execute_model( "warmup_mode": warmup_mode }) + if model_input.mask is not None: + execute_model_kwargs.update({ + "mask": model_input.mask + }) + htorch.core.mark_step() if self.is_driver_worker: model_event_name = ("model_" @@ -1661,7 +1680,7 @@ def execute_model( else: model_event_name = 'model_executable' with self.profiler.record_event('internal', model_event_name): - hidden_states = self.model.forward( + hidden_states, _ = self.model.forward( **execute_model_kwargs, selected_token_indices=sampling_metadata.selected_token_indices ) From 0a15fb85b963b6104464dbcb54a13c14bce60fdd Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Wed, 28 Aug 2024 14:16:07 +0300 Subject: [PATCH 150/341] Fix crash for LoRA disabled --- vllm/hpu/ops.py | 12 +++-- vllm/worker/habana_model_runner.py | 79 +++++++++++++++--------------- 2 files changed, 46 insertions(+), 45 deletions(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 9e17339b0e574..67e3737f9d8d8 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -230,13 +230,15 @@ def dispatch_bgmv_linear( wa = wa.reshape(wa_shape[0] * wa_shape[1], wa_shape[2]).transpose(0, 1) wb = wb.reshape(wb_shape[0], wb_shape[1] * wb_shape[2]).transpose(0, 1) out = x @ wa - assert(out.shape == decode.mask.shape) + assert (out.shape == decode.mask.shape) out = out * decode.mask - out = out@wb + out = out @ wb else: indices = indices % max_loras - wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) - wb = torch.index_select(wb_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) + wa = torch.index_select(wa_t_all, 0, + indices)[:, 0, :, :].transpose(-1, -2) + wb = torch.index_select(wb_t_all, 0, + indices)[:, 0, :, :].transpose(-1, -2) x = x.unsqueeze(1) out = x @ wa @@ 
-278,4 +280,4 @@ def dispatch_bgmv_embedding( x = x.unsqueeze(1) out = x @ wa out = out.squeeze(1) - y += out * scale \ No newline at end of file + y += out * scale diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 9d7881768a0d0..07aeaffbcab82 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -258,19 +258,17 @@ class PreparePromptMetadata(NamedTuple): @classmethod def empty(cls): - return PreparePromptMetadata( - input_tokens=[], - input_positions=[], - attn_metadata=None, - seq_lens=[], - query_lens=[], - lora_index_mapping=[], - lora_prompt_mapping=[], - lora_requests=set(), - multi_modal_input=None, - slot_mapping=[], - mask = None - ) + return PreparePromptMetadata(input_tokens=[], + input_positions=[], + attn_metadata=None, + seq_lens=[], + query_lens=[], + lora_index_mapping=[], + lora_prompt_mapping=[], + lora_requests=set(), + multi_modal_input=None, + slot_mapping=[], + mask=None) class PrepareDecodeMetadata(NamedTuple): @@ -637,6 +635,7 @@ def _prepare_prompt( if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() mask = None + counter = 0 for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt @@ -751,10 +750,12 @@ def _prepare_prompt( if self.lora_config: mask = torch.zeros(len(seq_group_metadata_list) * max_prompt_len, - (self.lora_config.max_loras + 1) * self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) - ones = torch.ones(max_prompt_len, self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) - counter = 0 + (self.lora_config.max_loras + 1) * + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + ones = torch.ones(max_prompt_len, + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) for seq_group_metadata, context_len in zip(seq_group_metadata_list, context_lens): lora_id = seq_group_metadata.lora_int_id @@ -773,7 +774,7 @@ def _prepare_prompt( [lora_id] * (max_prompt_len - context_len if seq_group_metadata.sampling_params.prompt_logprobs else 1)) - + if mask is not None: mask = mask.to('hpu') @@ -861,13 +862,16 @@ def _prepare_decode( if len(seq_group_metadata_list) == 0: return PrepareDecodeMetadata.empty() mask = None + counter = 0 if self.lora_config: mask = torch.zeros(len(seq_group_metadata_list), - (self.lora_config.max_loras + 1) * self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) - ones = torch.ones(1, self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) - counter = 0 + (self.lora_config.max_loras + 1) * + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + ones = torch.ones(1, + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) for seq_group_metadata in seq_group_metadata_list: assert not seq_group_metadata.is_prompt @@ -1109,18 +1113,17 @@ def prepare_input_tensors( attn_metadata = prefill_attn_metadata if \ prefill_attn_metadata is not None else decode_attn_metadata - return self._model_input_cls( - input_tokens=input_tokens, - seq_lens=seq_lens, - query_lens=query_lens, - input_positions=input_positions, - attn_metadata=attn_metadata, - lora_requests=lora_requests, - lora_mapping=lora_mapping, - multi_modal_kwargs=multi_modal_input, - real_batch_size=real_batch_size, - batch_size_padded=batch_size_padded, - mask=mask), sampling_metadata + return self._model_input_cls(input_tokens=input_tokens, + seq_lens=seq_lens, + query_lens=query_lens, + input_positions=input_positions, + 
attn_metadata=attn_metadata, + lora_requests=lora_requests, + lora_mapping=lora_mapping, + multi_modal_kwargs=multi_modal_input, + real_batch_size=real_batch_size, + batch_size_padded=batch_size_padded, + mask=mask), sampling_metadata def _seq_len(self, attn_metadata): if attn_metadata.num_prefills != 0: @@ -1655,7 +1658,8 @@ def execute_model( "positions": input_positions, "kv_caches": kv_caches, "attn_metadata": self.trim_attn_metadata(attn_metadata), - "intermediate_tensors": intermediate_tensors + "intermediate_tensors": intermediate_tensors, + "mask": model_input.mask } if multi_modal_input is not None: execute_model_kwargs.update(multi_modal_input) @@ -1665,11 +1669,6 @@ def execute_model( "warmup_mode": warmup_mode }) - if model_input.mask is not None: - execute_model_kwargs.update({ - "mask": model_input.mask - }) - htorch.core.mark_step() if self.is_driver_worker: model_event_name = ("model_" From 038e36b2af58e4ca54e38cd544c04419b90de49c Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Mon, 2 Sep 2024 13:26:24 +0300 Subject: [PATCH 151/341] Remove Global variable --- vllm/decode.py | 3 -- vllm/hpu/ops.py | 53 ++++++++++++++++++++---------- vllm/worker/habana_model_runner.py | 20 +++++------ 3 files changed, 46 insertions(+), 30 deletions(-) delete mode 100644 vllm/decode.py diff --git a/vllm/decode.py b/vllm/decode.py deleted file mode 100644 index 1cf8ea1cdbe11..0000000000000 --- a/vllm/decode.py +++ /dev/null @@ -1,3 +0,0 @@ -def init(): - global mask - mask = None \ No newline at end of file diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 67e3737f9d8d8..bbbb46c32a378 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -12,7 +12,6 @@ import torch.nn.functional as F from vllm.logger import init_logger -import vllm.decode as decode logger = init_logger(__name__) HPUFusedRMSNorm = None @@ -194,6 +193,18 @@ def prompt_attention( return attn_weights +class LoraMask: + lora_mask = None + + @staticmethod + def setLoraMask(mask): + LoraMask.lora_mask = mask + + @staticmethod + def getLoraMask(): + return LoraMask.lora_mask + + def dispatch_bgmv_linear( y: torch.Tensor, x: torch.Tensor, @@ -207,33 +218,41 @@ def dispatch_bgmv_linear( `wa_t_all` and `wb_t_all` contains all LoRA A and LoRA B weight matrices stacked into single tensors, assuming same rank. HPU handles no-LoRA requests using zero valued A and B tensors. These zero valued tensors are - appended at the end of `wa_t_all` and `wb_t_all` during initialization. For - custom BGMV, the corresponding `wa` and `wb` for each batch is created - based on the lora_index of each sample. - - For example: - `wa_t_all` is tensor of shape (num_loras, num_layers, lora_rank, - hidden_dim), where `wa_t_all[-1]` is zero valued tensor which handles - no-LoRA case. The `wa` tensor for a batch of size batch_Size will have - a shape of (batch_size, num_layers, hidden_dim, lora_rank) - - This method avoids for-loop as well as graph breaks. + appended at the end of `wa_t_all` and `wb_t_all` during initialization. """ + assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' max_loras = wa_t_all.size(0) # Wrap-around for negative indices - if decode.mask is not None: + mask = LoraMask.getLoraMask() + if mask is not None: + """ + We reshape w_a_t_all to [hidden_dim, num_layers * lora_rank] + and w_b_t_all to [num_layers * lora_rank, hidden_dim]. 
We also + have a loraMask of shape [batch_size, num_layers * lora_rank] + """ wa = wa_t_all[:, 0, :, :] - wb = wb_t_all[:, 0, :, :].transpose(0, 1) + wb = wb_t_all[:, 0, :, :].transpose(1, 2) wa_shape = wa.shape wb_shape = wb.shape wa = wa.reshape(wa_shape[0] * wa_shape[1], wa_shape[2]).transpose(0, 1) - wb = wb.reshape(wb_shape[0], wb_shape[1] * wb_shape[2]).transpose(0, 1) + wb = wb.reshape(wb_shape[0] * wb_shape[1], wb_shape[2]) out = x @ wa - assert (out.shape == decode.mask.shape) - out = out * decode.mask + assert (out.shape == mask.shape) + out = out * mask out = out @ wb else: + """For custom BGMV, the corresponding `wa` and `wb` for each batch is + created based on the lora_index of each sample. + + For example: + `wa_t_all` is tensor of shape (num_loras, num_layers, lora_rank, + hidden_dim), where `wa_t_all[-1]` is zero valued tensor which handles + no-LoRA case. The `wa` tensor for a batch of size batch_Size will have + a shape of (batch_size, num_layers, hidden_dim, lora_rank) + + This method avoids for-loop as well as graph breaks. + """ indices = indices % max_loras wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 07aeaffbcab82..98e65220edded 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -22,6 +22,7 @@ ModelConfig, MultiModalConfig, ParallelConfig, SchedulerConfig) from vllm.distributed.parallel_state import get_world_group +from vllm.hpu.ops import LoraMask as LoraMask from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest @@ -33,7 +34,6 @@ SequenceGroupMetadata) from vllm.utils import (HabanaMemoryProfiler, format_bytes, is_pin_memory_available, make_tensor_with_pad) -import vllm.decode as decode from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, _add_attn_metadata_broadcastable_dict, @@ -230,11 +230,11 @@ def forward(self, *args, **kwargs): input_ids.size(1), input_ids.device, torch.bfloat16) - decode.mask = kwargs.pop('mask') + LoraMask.setLoraMask(kwargs.pop('mask')) hidden_states = self.model(*args, **kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) hidden_states = hidden_states.index_select(0, selected_token_indices) - return hidden_states, decode.mask + return hidden_states def compute_logits(self, *args, **kwargs): return self.model.compute_logits(*args, **kwargs) @@ -339,7 +339,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "real_batch_size": self.real_batch_size, "batch_size_padded": self.batch_size_padded, "virtual_engine": self.virtual_engine, - "mask": mask + "mask": self.mask } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) return tensor_dict @@ -634,8 +634,6 @@ def _prepare_prompt( if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() - mask = None - counter = 0 for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt @@ -748,6 +746,8 @@ def _prepare_prompt( find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), self.block_size) + mask: torch.Tensor = None + counter = 0 if self.lora_config: mask = torch.zeros(len(seq_group_metadata_list) * max_prompt_len, (self.lora_config.max_loras + 1) * @@ -861,7 +861,7 @@ def _prepare_decode( if len(seq_group_metadata_list) == 0: return PrepareDecodeMetadata.empty() - mask = None + mask: torch.Tensor = None counter = 0 if self.lora_config: @@ -1251,7 
+1251,7 @@ def warmup_scenario(self, if dummy_lora_requests_per_seq else None) for i in range(batch_size) ] - #torch.hpu.synchronize() + torch.hpu.synchronize() for _ in range(times): inputs = self.prepare_model_input(seqs) self.execute_model(inputs, kv_caches, warmup_mode=True) @@ -1679,7 +1679,7 @@ def execute_model( else: model_event_name = 'model_executable' with self.profiler.record_event('internal', model_event_name): - hidden_states, _ = self.model.forward( + hidden_states = self.model.forward( **execute_model_kwargs, selected_token_indices=sampling_metadata.selected_token_indices ) @@ -1693,7 +1693,7 @@ def execute_model( module.indices_len[ i] = sampling_metadata.selected_token_indices.numel( ) - decode.mask = None + LoraMask.setLoraMask(None) # Compute the logits. with self.profiler.record_event( From ab369e3734ace66fbf94623c55869e9642431b06 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Mon, 2 Sep 2024 13:50:16 +0300 Subject: [PATCH 152/341] Remove limitation on max_num_batched_tokens when using LoRA --- vllm/config.py | 2 +- vllm/lora/layers.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index 6acb70ad047b2..7aa3977a497ea 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1326,7 +1326,7 @@ def verify_with_model_config(self, model_config: ModelConfig): model_config.quantization) def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): - if scheduler_config.max_num_batched_tokens > 65528: + if not is_hpu() and scheduler_config.max_num_batched_tokens > 65528: raise ValueError( "Due to limitations of the custom LoRA CUDA kernel, " "max_num_batched_tokens must be <= 65528 when " diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 4a45f3fda88f1..aa01e9fb77af2 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -327,6 +327,17 @@ def set_mapping( def forward(self, x: torch.Tensor) -> torch.Tensor: added_tokens_mask = x > self.base_layer.org_vocab_size - 1 embedding_len = self.indices_len[3] + # NOTE(vgoel): These asserts can be skipped when upstreaming. + # Can be removed from vllm-fork also once lora functionality + # on Gaudi stabilizes. 
+ if is_hpu(): + emb_len = embedding_len + x_shape = x.shape + ind_shape = self.embeddings_indices[1].shape + assert embedding_len == x.shape[0] * x.shape[1], \ + f"Extra Info: {emb_len}, {x_shape}, {ind_shape}" + assert embedding_len <= self.embeddings_indices[1].shape[0], \ + f"Extra Info: {emb_len}, {x.shape}, {ind_shape}" indices = self.embeddings_indices[1][:embedding_len].view_as(x) full_lora_a_embeddings = F.embedding( x + indices, From b4f6a295c3e663872a8dadb0caca0fd21995e105 Mon Sep 17 00:00:00 2001 From: Jan Kaniecki Date: Tue, 3 Sep 2024 13:24:12 +0200 Subject: [PATCH 153/341] Remove mark step from static MoE loop (#231) Removes unnecessary mark step from MoE OP loop to speed up computation --- vllm/hpu/ops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 662c53486b4ca..f3ffe7edbc49d 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -128,7 +128,6 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): w_output = silu_and_mul(w_output) w_output = torch.matmul(w_output, w2[expert_idx].transpose(0, 1)) final_hidden_states += w_output * padded_weights[expert_idx] - htorch.core.mark_step() return final_hidden_states.view(-1, D) From 733524ae2fe163b69335aab95a493acf451b0ddb Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Tue, 3 Sep 2024 15:26:27 +0000 Subject: [PATCH 154/341] Add newline at EOF Signed-off-by: Chendi.Xue --- Dockerfile.hpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.hpu b/Dockerfile.hpu index b9acec2b85be4..ab714cdac4670 100644 --- a/Dockerfile.hpu +++ b/Dockerfile.hpu @@ -15,4 +15,4 @@ WORKDIR /workspace/ RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks -ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] \ No newline at end of file +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] From fb98cad144e9654abcc698c4b56d793d1d56cce7 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Tue, 3 Sep 2024 16:30:17 +0000 Subject: [PATCH 155/341] Remove requires_grad=False Signed-off-by: Chendi.Xue --- vllm/model_executor/models/arctic.py | 6 ++---- vllm/model_executor/models/dbrx.py | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 6d92e7597eabf..603579d41946e 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -131,14 +131,12 @@ def __init__(self, torch.empty(self.num_experts, 2 * self.intermediate_size, self.hidden_size, - dtype=self.params_dtype), - , requires_grad=False) + dtype=self.params_dtype)) self.w2s = nn.Parameter( torch.empty(self.num_experts, self.hidden_size, self.intermediate_size, - dtype=self.params_dtype), - requires_grad=False) + dtype=self.params_dtype)) set_weight_attrs(self.ws, { "weight_loader": self.weight_loader, }) diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 463003d0bba7b..e3a45b26d909b 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -86,15 +86,13 @@ def __init__( self.num_total_experts, 2 * self.intermediate_size, self.d_model, - dtype=self.params_dtype, - ), requires_grad=False) + dtype=self.params_dtype)) self.w2s = nn.Parameter( torch.empty( self.num_total_experts, self.d_model, self.intermediate_size, - dtype=self.params_dtype, - ), requires_grad=False) + dtype=self.params_dtype)) set_weight_attrs( self.ws, From 
49ffde681d48263f6b1181604bfe5c56049c6f45 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Wed, 4 Sep 2024 06:59:38 +0300 Subject: [PATCH 156/341] Change mask to lora_mask --- vllm/worker/habana_model_runner.py | 63 +++++++++++++++--------------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 98e65220edded..4b65a7ef46721 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -230,7 +230,7 @@ def forward(self, *args, **kwargs): input_ids.size(1), input_ids.device, torch.bfloat16) - LoraMask.setLoraMask(kwargs.pop('mask')) + LoraMask.setLoraMask(kwargs.pop('lora_mask')) hidden_states = self.model(*args, **kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) hidden_states = hidden_states.index_select(0, selected_token_indices) @@ -254,7 +254,7 @@ class PreparePromptMetadata(NamedTuple): lora_requests: Set[LoRARequest] multi_modal_input: Optional[torch.Tensor] slot_mapping: List[List[int]] - mask: Optional[torch.Tensor] + lora_mask: Optional[torch.Tensor] @classmethod def empty(cls): @@ -268,7 +268,7 @@ def empty(cls): lora_requests=set(), multi_modal_input=None, slot_mapping=[], - mask=None) + lora_mask=None) class PrepareDecodeMetadata(NamedTuple): @@ -279,7 +279,7 @@ class PrepareDecodeMetadata(NamedTuple): lora_prompt_mapping: List[List[int]] lora_requests: Set[LoRARequest] slot_mapping: List[List[int]] - mask: Optional[torch.Tensor] + lora_mask: Optional[torch.Tensor] @classmethod def empty(cls): @@ -291,7 +291,7 @@ def empty(cls): lora_prompt_mapping=[], lora_requests=set(), slot_mapping=[], - mask=None, + lora_mask=None, ) @@ -327,7 +327,7 @@ class ModelInputForHPU(ModelRunnerInputBase): real_batch_size: Optional[int] = None batch_size_padded: Optional[int] = None virtual_engine: int = 0 - mask: Optional[torch.Tensor] = None + lora_mask: Optional[torch.Tensor] = None def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { @@ -339,7 +339,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "real_batch_size": self.real_batch_size, "batch_size_padded": self.batch_size_padded, "virtual_engine": self.virtual_engine, - "mask": self.mask + "lora_mask": self.lora_mask, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) return tensor_dict @@ -373,7 +373,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "lora_requests": self.lora_requests, "lora_mapping": self.lora_mapping, "multi_modal_kwargs": self.multi_modal_kwargs, - "mask": self.mask + "lora_mask": self.lora_mask, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) _add_sampling_metadata_broadcastable_dict(tensor_dict, @@ -746,13 +746,14 @@ def _prepare_prompt( find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), self.block_size) - mask: torch.Tensor = None + lora_mask: torch.Tensor = None counter = 0 if self.lora_config: - mask = torch.zeros(len(seq_group_metadata_list) * max_prompt_len, - (self.lora_config.max_loras + 1) * - self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) + lora_mask = torch.zeros(len(seq_group_metadata_list) * + max_prompt_len, + (self.lora_config.max_loras + 1) * + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) ones = torch.ones(max_prompt_len, self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) @@ -766,7 +767,7 @@ def _prepare_prompt( end_row = start_row + max_prompt_len start_col = (lora_id - 1) * 
self.lora_config.max_lora_rank end_col = start_col + self.lora_config.max_lora_rank - mask[start_row:end_row, start_col:end_col] = ones + lora_mask[start_row:end_row, start_col:end_col] = ones counter = counter + 1 lora_index_mapping += [lora_id] * (max_prompt_len - context_len) @@ -775,8 +776,8 @@ def _prepare_prompt( (max_prompt_len - context_len if seq_group_metadata.sampling_params.prompt_logprobs else 1)) - if mask is not None: - mask = mask.to('hpu') + if lora_mask is not None: + lora_mask = lora_mask.to('hpu') input_tokens = make_tensor_with_pad(input_tokens, max_len=max_prompt_len, @@ -843,7 +844,7 @@ def _prepare_prompt( lora_requests=lora_requests, multi_modal_input=multi_modal_input, slot_mapping=slot_mapping, - mask=mask, + lora_mask=lora_mask, ) def _prepare_decode( @@ -861,14 +862,14 @@ def _prepare_decode( if len(seq_group_metadata_list) == 0: return PrepareDecodeMetadata.empty() - mask: torch.Tensor = None + lora_mask: torch.Tensor = None counter = 0 if self.lora_config: - mask = torch.zeros(len(seq_group_metadata_list), - (self.lora_config.max_loras + 1) * - self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) + lora_mask = torch.zeros(len(seq_group_metadata_list), + (self.lora_config.max_loras + 1) * + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) ones = torch.ones(1, self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) @@ -884,7 +885,7 @@ def _prepare_decode( lora_requests.add(seq_group_metadata.lora_request) start_pos = (lora_id - 1) * self.lora_config.max_lora_rank end_pos = start_pos + self.lora_config.max_lora_rank - mask[counter, start_pos:end_pos] = ones + lora_mask[counter, start_pos:end_pos] = ones counter = counter + 1 for seq_id in seq_ids: @@ -914,8 +915,8 @@ def _prepare_decode( block_table = block_table[-sliding_window_blocks:] block_tables.append(block_table) - if mask is not None: - mask = mask.to('hpu') + if lora_mask is not None: + lora_mask = lora_mask.to('hpu') input_tokens = torch.tensor(input_tokens, dtype=torch.long, device=self.device) @@ -961,7 +962,7 @@ def _prepare_decode( lora_prompt_mapping=lora_prompt_mapping, lora_requests=lora_requests, slot_mapping=slot_mapping, - mask=mask, + lora_mask=lora_mask, ) def prepare_input_tensors( @@ -1016,7 +1017,7 @@ def prepare_input_tensors( lora_requests, multi_modal_input, slot_mapping, - mask, + lora_mask, ) = self._prepare_prompt(prefill_reqs) ( decode_input_tokens, @@ -1026,7 +1027,7 @@ def prepare_input_tensors( decode_lora_prompt_mapping, decode_lora_requests, decode_slot_mapping, - decode_mask, + decode_lora_mask, ) = self._prepare_decode(decode_reqs) sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, seq_lens, query_lens, @@ -1053,7 +1054,7 @@ def prepare_input_tensors( lora_index_mapping = decode_lora_index_mapping lora_prompt_mapping = decode_lora_prompt_mapping lora_requests = decode_lora_requests - mask = decode_mask + lora_mask = decode_lora_mask # FIXME: We need to adjust selected_token_indices to accommodate # for padding @@ -1123,7 +1124,7 @@ def prepare_input_tensors( multi_modal_kwargs=multi_modal_input, real_batch_size=real_batch_size, batch_size_padded=batch_size_padded, - mask=mask), sampling_metadata + lora_mask=lora_mask), sampling_metadata def _seq_len(self, attn_metadata): if attn_metadata.num_prefills != 0: @@ -1659,7 +1660,7 @@ def execute_model( "kv_caches": kv_caches, "attn_metadata": self.trim_attn_metadata(attn_metadata), "intermediate_tensors": intermediate_tensors, - "mask": model_input.mask + 
"lora_mask": model_input.lora_mask } if multi_modal_input is not None: execute_model_kwargs.update(multi_modal_input) From 538c8f15f759ee7b18f6b738c74a00d6f304ba3a Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Wed, 4 Sep 2024 12:21:04 +0300 Subject: [PATCH 157/341] Move compute_logits to Mask Based Implementation --- vllm/hpu/ops.py | 53 +++++++--------------------- vllm/worker/habana_model_runner.py | 55 +++++++++++++++++++++++------- 2 files changed, 55 insertions(+), 53 deletions(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index bbbb46c32a378..1ee56610d9ee5 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -219,50 +219,23 @@ def dispatch_bgmv_linear( stacked into single tensors, assuming same rank. HPU handles no-LoRA requests using zero valued A and B tensors. These zero valued tensors are appended at the end of `wa_t_all` and `wb_t_all` during initialization. + We reshape w_a_t_all to [hidden_dim, num_layers * lora_rank] + and w_b_t_all to [num_layers * lora_rank, hidden_dim]. We also + have a loraMask of shape [batch_size, num_layers * lora_rank] """ assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' - max_loras = wa_t_all.size(0) - # Wrap-around for negative indices mask = LoraMask.getLoraMask() - if mask is not None: - """ - We reshape w_a_t_all to [hidden_dim, num_layers * lora_rank] - and w_b_t_all to [num_layers * lora_rank, hidden_dim]. We also - have a loraMask of shape [batch_size, num_layers * lora_rank] - """ - wa = wa_t_all[:, 0, :, :] - wb = wb_t_all[:, 0, :, :].transpose(1, 2) - wa_shape = wa.shape - wb_shape = wb.shape - wa = wa.reshape(wa_shape[0] * wa_shape[1], wa_shape[2]).transpose(0, 1) - wb = wb.reshape(wb_shape[0] * wb_shape[1], wb_shape[2]) - out = x @ wa - assert (out.shape == mask.shape) - out = out * mask - out = out @ wb - else: - """For custom BGMV, the corresponding `wa` and `wb` for each batch is - created based on the lora_index of each sample. - - For example: - `wa_t_all` is tensor of shape (num_loras, num_layers, lora_rank, - hidden_dim), where `wa_t_all[-1]` is zero valued tensor which handles - no-LoRA case. The `wa` tensor for a batch of size batch_Size will have - a shape of (batch_size, num_layers, hidden_dim, lora_rank) - - This method avoids for-loop as well as graph breaks. 
- """ - indices = indices % max_loras - wa = torch.index_select(wa_t_all, 0, - indices)[:, 0, :, :].transpose(-1, -2) - wb = torch.index_select(wb_t_all, 0, - indices)[:, 0, :, :].transpose(-1, -2) - - x = x.unsqueeze(1) - out = x @ wa - out = out @ wb - out = out.squeeze(1) + wa = wa_t_all[:, 0, :, :] + wb = wb_t_all[:, 0, :, :].transpose(1, 2) + wa_shape = wa.shape + wb_shape = wb.shape + wa = wa.reshape(wa_shape[0] * wa_shape[1], wa_shape[2]).transpose(0, 1) + wb = wb.reshape(wb_shape[0] * wb_shape[1], wb_shape[2]) + out = x @ wa + assert (out.shape == mask.shape) + out = out * mask + out = out @ wb y += out * scale diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 4b65a7ef46721..e03c9167ad308 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -255,6 +255,7 @@ class PreparePromptMetadata(NamedTuple): multi_modal_input: Optional[torch.Tensor] slot_mapping: List[List[int]] lora_mask: Optional[torch.Tensor] + lora_logits_mask: Optional[torch.Tensor] @classmethod def empty(cls): @@ -268,7 +269,8 @@ def empty(cls): lora_requests=set(), multi_modal_input=None, slot_mapping=[], - lora_mask=None) + lora_mask=None, + lora_logits_mask=None) class PrepareDecodeMetadata(NamedTuple): @@ -280,19 +282,19 @@ class PrepareDecodeMetadata(NamedTuple): lora_requests: Set[LoRARequest] slot_mapping: List[List[int]] lora_mask: Optional[torch.Tensor] + lora_logits_mask: Optional[torch.Tensor] @classmethod def empty(cls): - return PrepareDecodeMetadata( - input_tokens=[], - input_positions=[], - attn_metadata=None, - lora_index_mapping=[], - lora_prompt_mapping=[], - lora_requests=set(), - slot_mapping=[], - lora_mask=None, - ) + return PrepareDecodeMetadata(input_tokens=[], + input_positions=[], + attn_metadata=None, + lora_index_mapping=[], + lora_prompt_mapping=[], + lora_requests=set(), + slot_mapping=[], + lora_mask=None, + lora_logits_mask=None) # How batches are constructed. 
@@ -328,6 +330,7 @@ class ModelInputForHPU(ModelRunnerInputBase): batch_size_padded: Optional[int] = None virtual_engine: int = 0 lora_mask: Optional[torch.Tensor] = None + lora_logits_mask: Optional[torch.Tensor] = None def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { @@ -340,6 +343,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "batch_size_padded": self.batch_size_padded, "virtual_engine": self.virtual_engine, "lora_mask": self.lora_mask, + "lora_logits_mask": self.lora_logits_mask, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) return tensor_dict @@ -374,6 +378,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "lora_mapping": self.lora_mapping, "multi_modal_kwargs": self.multi_modal_kwargs, "lora_mask": self.lora_mask, + "lora_logits_mask": self.lora_logits_mask, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) _add_sampling_metadata_broadcastable_dict(tensor_dict, @@ -747,6 +752,7 @@ def _prepare_prompt( self.block_size) lora_mask: torch.Tensor = None + lora_logits_mask: torch.Tensor = None counter = 0 if self.lora_config: lora_mask = torch.zeros(len(seq_group_metadata_list) * @@ -754,9 +760,17 @@ def _prepare_prompt( (self.lora_config.max_loras + 1) * self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) + lora_logits_mask = torch.zeros(len(seq_group_metadata_list), + (self.lora_config.max_loras + 1) * + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + ones = torch.ones(max_prompt_len, self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) + logit_ones = torch.ones(1, + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) for seq_group_metadata, context_len in zip(seq_group_metadata_list, context_lens): lora_id = seq_group_metadata.lora_int_id @@ -768,6 +782,7 @@ def _prepare_prompt( start_col = (lora_id - 1) * self.lora_config.max_lora_rank end_col = start_col + self.lora_config.max_lora_rank lora_mask[start_row:end_row, start_col:end_col] = ones + lora_logits_mask[counter, start_col:end_col] = logit_ones counter = counter + 1 lora_index_mapping += [lora_id] * (max_prompt_len - context_len) @@ -778,6 +793,7 @@ def _prepare_prompt( if lora_mask is not None: lora_mask = lora_mask.to('hpu') + lora_logits_mask = lora_logits_mask.to('hpu') input_tokens = make_tensor_with_pad(input_tokens, max_len=max_prompt_len, @@ -845,6 +861,7 @@ def _prepare_prompt( multi_modal_input=multi_modal_input, slot_mapping=slot_mapping, lora_mask=lora_mask, + lora_logits_mask=lora_logits_mask, ) def _prepare_decode( @@ -863,6 +880,7 @@ def _prepare_decode( if len(seq_group_metadata_list) == 0: return PrepareDecodeMetadata.empty() lora_mask: torch.Tensor = None + lora_logits_mask: torch.Tensor = None counter = 0 if self.lora_config: @@ -917,6 +935,7 @@ def _prepare_decode( if lora_mask is not None: lora_mask = lora_mask.to('hpu') + lora_logits_mask = lora_mask input_tokens = torch.tensor(input_tokens, dtype=torch.long, device=self.device) @@ -963,6 +982,7 @@ def _prepare_decode( lora_requests=lora_requests, slot_mapping=slot_mapping, lora_mask=lora_mask, + lora_logits_mask=lora_logits_mask, ) def prepare_input_tensors( @@ -1018,6 +1038,7 @@ def prepare_input_tensors( multi_modal_input, slot_mapping, lora_mask, + lora_logits_mask, ) = self._prepare_prompt(prefill_reqs) ( decode_input_tokens, @@ -1028,6 +1049,7 @@ def prepare_input_tensors( decode_lora_requests, decode_slot_mapping, decode_lora_mask, + decode_lora_logits_mask, ) = 
self._prepare_decode(decode_reqs) sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, seq_lens, query_lens, @@ -1055,6 +1077,7 @@ def prepare_input_tensors( lora_prompt_mapping = decode_lora_prompt_mapping lora_requests = decode_lora_requests lora_mask = decode_lora_mask + lora_logits_mask = decode_lora_logits_mask # FIXME: We need to adjust selected_token_indices to accommodate # for padding @@ -1124,7 +1147,9 @@ def prepare_input_tensors( multi_modal_kwargs=multi_modal_input, real_batch_size=real_batch_size, batch_size_padded=batch_size_padded, - lora_mask=lora_mask), sampling_metadata + lora_mask=lora_mask, + lora_logits_mask=lora_logits_mask), \ + sampling_metadata def _seq_len(self, attn_metadata): if attn_metadata.num_prefills != 0: @@ -1198,6 +1223,7 @@ def profile_run(self) -> None: True, kv_caches, is_profile_run=True) + return def warmup_scenario(self, batch_size, @@ -1694,7 +1720,10 @@ def execute_model( module.indices_len[ i] = sampling_metadata.selected_token_indices.numel( ) - LoraMask.setLoraMask(None) + lora_logits_mask: torch.Tensor = model_input.lora_logits_mask + LoraMask.setLoraMask( + lora_logits_mask.index_select( + 0, sampling_metadata.selected_token_indices)) # Compute the logits. with self.profiler.record_event( From 691255b5e8b408d0746eb460c3f1152f819d9c76 Mon Sep 17 00:00:00 2001 From: Artur Fierka <160735857+afierka-intel@users.noreply.github.com> Date: Wed, 4 Sep 2024 13:30:54 +0200 Subject: [PATCH 158/341] Enable llama-405b - w/a for memory allocation error (#184) Work around for allocation error while loading llama-405b. --- vllm/model_executor/models/llama.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 676a51ce67f96..d659d0a3f1127 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -517,6 +517,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + if current_platform.is_hpu(): + torch.hpu.synchronize() # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). 
Thus, handled exceptions should From a4e1d5273bdf5b36eb03cbaee763a54282470e59 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 4 Sep 2024 13:45:30 +0200 Subject: [PATCH 159/341] [bugfix] handle large bucket minimums correctly (#235) This bugfix addresses incorrect lower boundary handling for bucketing Previous behavior: ``` INFO 09-03 19:36:28 habana_model_runner.py:564] Prompt bucket config (min, step, max_warmup) bs:[64, 32, 64], seq:[768, 128, 768] INFO 09-03 19:36:28 habana_model_runner.py:577] Generated 12 prompt buckets: [(32, 128), (32, 256), (32, 384), (32, 512), (32, 640), (32, 768), (64, 128), (64, 256), (64, 384), (64, 512), (64, 640), (64, 768)] INFO 09-03 19:36:28 habana_model_runner.py:582] Omitted 0 prompt buckets due to exceeded token budget (max_num_batched_tokens=131072) INFO 09-03 19:36:28 habana_model_runner.py:590] Decode bucket config (min, step, max_warmup) bs:[64, 128, 64], seq:[768, 128, 1024] INFO 09-03 19:36:28 habana_model_runner.py:601] Generated 8 decode buckets: [(64, 128), (64, 256), (64, 384), (64, 512), (64, 640), (64, 768), (64, 896), (64, 1024)] INFO 09-03 19:36:28 habana_model_runner.py:606] Omitted 0 decode buckets due to exceeded token budget (max_num_batched_tokens=131072) ``` Min seq len dimension is set to 768, but buckets with seq_len=128-768 are present Current behavior: ``` INFO 09-03 19:45:42 habana_model_runner.py:563] Prompt bucket config (min, step, max_warmup) bs:[64, 32, 64], seq:[768, 128, 768] INFO 09-03 19:45:42 habana_model_runner.py:576] Generated 1 prompt buckets: [(64, 768)] INFO 09-03 19:45:42 habana_model_runner.py:581] Omitted 0 prompt buckets due to exceeded token budget (max_num_batched_tokens=131072) INFO 09-03 19:45:42 habana_model_runner.py:589] Decode bucket config (min, step, max_warmup) bs:[64, 128, 64], seq:[768, 128, 1024] INFO 09-03 19:45:42 habana_model_runner.py:600] Generated 3 decode buckets: [(64, 768), (64, 896), (64, 1024)] INFO 09-03 19:45:42 habana_model_runner.py:605] Omitted 0 decode buckets due to exceeded token budget (max_num_batched_tokens=131072) ``` No bucket with seq_len < 768 is captured --- vllm/worker/habana_model_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index dec1b65858eb4..d80861babea45 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -91,7 +91,8 @@ def warmup_range(config: Tuple[int, int, int]): ramp_up_tw = itertools.takewhile(lambda x: x < bstep and x <= bmax, \ ramp_up_acc) stable = range(bstep, bmax + 1, bstep) - return list(ramp_up_tw) + list(stable) + buckets = list(ramp_up_tw) + list(stable) + return list(filter(lambda bucket: bucket >= bmin, buckets)) def warmup_buckets(bs_bucket_config, seq_bucket_config, From 8046d81cf279828be7b4d9a0b2a242e592748302 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Wed, 4 Sep 2024 02:17:11 +0000 Subject: [PATCH 160/341] fix guided_decode HPU failing issue Signed-off-by: Chendi.Xue --- .../guided_decoding/outlines_logits_processors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index 1c8f6cccb3e9a..5382f0f655264 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -61,7 +61,7 @@ def __call__(self, input_ids: List[int], -math.inf, 
device=scores.device) mask[allowed_tokens] = 0 - scores.add_(mask) + scores = scores.add(mask) return scores From 7cd226c0110a4fcbc01f85df73dd334994e1d767 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 5 Sep 2024 11:30:01 +0200 Subject: [PATCH 161/341] Remove token budget from decode buckets (#241) This PR prevents max_num_batched_tokens from limiting decode buckets, as decode buckets should be limited by number of blocks, not by max_num_batched_tokens. --- vllm/worker/habana_model_runner.py | 66 ++++++++++++++---------------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index d80861babea45..92df83bd968d2 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -95,8 +95,9 @@ def warmup_range(config: Tuple[int, int, int]): return list(filter(lambda bucket: bucket >= bmin, buckets)) -def warmup_buckets(bs_bucket_config, seq_bucket_config, - max_num_batched_tokens): +def warmup_buckets(bs_bucket_config, + seq_bucket_config, + max_num_batched_tokens=None): buckets = list( itertools.product(warmup_range(bs_bucket_config), warmup_range(seq_bucket_config))) @@ -107,28 +108,32 @@ def warmup_buckets(bs_bucket_config, seq_bucket_config, f"seq:{seq_bucket_config}") raise ValueError(msg) - # Remove buckets exceeding batch token budget - filtered_buckets = list( - filter(lambda bucket: bucket[0] * bucket[1] <= max_num_batched_tokens, - buckets)) - - if len(filtered_buckets) == 0: - # legacy case - we can handle this if we ignore max_num_batched_tokens - min_bucket_bs, min_bucket_seq = min(buckets, - key=lambda b: (b[0] * b[1])) - min_reqd_budget = min_bucket_bs * min_bucket_seq - msg = ( - "The current bucketing configuration " - f"(min, step, max_warmup): " - f"bs:{bs_bucket_config}, " - f"seq:{seq_bucket_config} cannot be used with specified " - f"max_num_batched_tokens ({max_num_batched_tokens}), as the " - f"smallest bucket ({min_reqd_budget}) would exceed token budget. " - "Please increase max_num_batched_tokens or decrease bucket minimum " - "Ignoring max_num_batched_tokens at risk of out-of-memory errors.") - logger.error(msg) - return list(sorted(buckets, key=lambda b: - (b[0] * b[1], b[1], b[0]))), [] + filtered_buckets = buckets + if max_num_batched_tokens is not None: + # Remove buckets exceeding batch token budget + filtered_buckets = list( + filter( + lambda bucket: bucket[0] * bucket[1] <= max_num_batched_tokens, + buckets)) + + if len(filtered_buckets) == 0: + # we can handle this if we ignore max_num_batched_tokens + min_bucket_bs, min_bucket_seq = min(buckets, + key=lambda b: (b[0] * b[1])) + min_reqd_budget = min_bucket_bs * min_bucket_seq + msg = ( + "The current bucketing configuration " + f"(min, step, max_warmup): " + f"bs:{bs_bucket_config}, " + f"seq:{seq_bucket_config} cannot be used with specified " + f"max_num_batched_tokens ({max_num_batched_tokens}), as the " + f"smallest bucket ({min_reqd_budget}) would exceed token " + "budget. 
Please increase max_num_batched_tokens or decrease " + "bucket minimum Ignoring max_num_batched_tokens at risk of " + "out-of-memory errors.") + logger.error(msg) + return list( + sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))), [] captured_buckets = list( sorted(filtered_buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))) @@ -589,9 +594,8 @@ def _setup_buckets(self) -> None: f"bs:{self.decode_bs_bucket_cfg}, " f"seq:{self.decode_seq_bucket_cfg}") logger.info(msg) - self.decode_buckets, decode_omitted_buckets = warmup_buckets( - self.decode_bs_bucket_cfg, self.decode_seq_bucket_cfg, - self.max_num_batched_tokens) + self.decode_buckets, _ = warmup_buckets(self.decode_bs_bucket_cfg, + self.decode_seq_bucket_cfg) if self.lora_config: self.decode_buckets[:] = [ bucket for bucket in self.decode_buckets @@ -601,14 +605,6 @@ def _setup_buckets(self) -> None: f"{list(sorted(self.decode_buckets))}") logger.info(msg) - msg = (f"Omitted {len(decode_omitted_buckets)} " - "decode buckets due to exceeded token budget " - f"(max_num_batched_tokens={self.max_num_batched_tokens})") - logger.info(msg) - - msg = f"Omitted decode buckets: {list(sorted(decode_omitted_buckets))}" - logger.debug(msg) - def _prepare_prompt( self, seq_group_metadata_list: List[SequenceGroupMetadata], From d0eb7d7087dea5bac4f918a1fc545733d6f72f27 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 5 Sep 2024 11:30:40 +0200 Subject: [PATCH 162/341] [habana_main bugfix] Fix min bucket boundary calculation (#239) Ports https://github.com/HabanaAI/vllm-fork/pull/97 to habana_main --- vllm/worker/habana_model_runner.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 92df83bd968d2..dbd538e45027c 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -142,8 +142,8 @@ def warmup_buckets(bs_bucket_config, return captured_buckets, omitted_buckets -def next_pow2(value: int): - res = 1 +def next_pow2(value: int, base: int): + res = base while value > 1: value = (value + 1) // 2 res *= 2 @@ -155,12 +155,10 @@ def round_up(value: int, k: int): def find_bucket(value: int, config: Tuple[int, int, int]): - bmin, bstep, bmax = config - if value < bstep: - result = min(next_pow2(value), bstep) - else: - result = round_up(value, bstep) - return result + bmin, bstep, _ = config + next_step = round_up(value, bstep) + next_pow = next_pow2(value, bmin) + return max(bmin, min(next_step, next_pow)) def subtuple(obj: object, From d2e2854ed3a99681aed60c177aa36fb7e9945fe8 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 6 Sep 2024 01:30:43 +0000 Subject: [PATCH 163/341] fix rotary embedding --- vllm/hpu/rotary_embed.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py index 30a88d68a24af..1857253f47f1b 100644 --- a/vllm/hpu/rotary_embed.py +++ b/vllm/hpu/rotary_embed.py @@ -100,6 +100,11 @@ def forward(self, positions: torch.Tensor, query: torch.Tensor, self.head_size)) key = key.reshape((key.shape[0], key.shape[1], key.shape[2] // self.head_size, self.head_size)) + query_rot = query[..., :self.dim] + key_rot = key[..., :self.dim] + if self.dim < self.head_size: + query_pass = query[..., self.dim:] + key_pass = key[..., self.dim:] if len(positions[0]) == 1: cos = self.cos_cached[positions].unsqueeze(2).to(dtype=query.dtype) @@ -107,8 +112,11 @@ def forward(self, positions: torch.Tensor, query: torch.Tensor, else: cos 
= cos[positions].unsqueeze(2) sin = sin[positions].unsqueeze(2) - query, key = FusedRoPE.apply(query, cos, sin, - 0), FusedRoPE.apply(key, cos, sin, 0) + query, key = FusedRoPE.apply(query_rot, cos, sin, + 0), FusedRoPE.apply(key_rot, cos, sin, 0) + if self.dim < self.head_size: + query = torch.cat((query, query_pass), dim=-1) + key = torch.cat((key, key_pass), dim=-1) return query.reshape( (query.shape[0], query.shape[1], query.shape[2] * query.shape[3])), key.reshape( From 97bd0fdc079b20027f69f5db2494451bdce2b10d Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Tue, 3 Sep 2024 10:15:17 +0300 Subject: [PATCH 164/341] =?UTF-8?q?Avoiding=20torch.index=5Fselect=20for?= =?UTF-8?q?=20embedding=20LoRA=E2=80=93B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm/hpu/ops.py | 60 +++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 32 deletions(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 746e87dad4aea..bacb755b39393 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -215,22 +215,24 @@ def dispatch_bgmv_linear( ): """ `wa_t_all` and `wb_t_all` contains all LoRA A and LoRA B weight matrices - stacked into single tensors, assuming same rank. HPU handles no-LoRA - requests using zero valued A and B tensors. These zero valued tensors are - appended at the end of `wa_t_all` and `wb_t_all` during initialization. - We reshape w_a_t_all to [hidden_dim, num_layers * lora_rank] - and w_b_t_all to [num_layers * lora_rank, hidden_dim]. We also - have a loraMask of shape [batch_size, num_layers * lora_rank] + stacked at dimension 0 into single tensors, assuming same rank. `wa` is the + reshaped and transposed version of `wa_t_all` of shape + (h_in, max_loras * lora_rank) and `wb` is the transposed and reshaped + version of `wb_t_all` of shape (max_loras * lora_rank, h_out). + + Matmul input `x` with `wa`. Multiply `x` with a mask to zero-out inputs of + inactive LoRA indices. Matmul masked output with `wb` and scale it to get + the final output. """ assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' mask = LoraMask.getLoraMask() + wa = wa_t_all[:, 0, :, :] wb = wb_t_all[:, 0, :, :].transpose(1, 2) - wa_shape = wa.shape - wb_shape = wb.shape - wa = wa.reshape(wa_shape[0] * wa_shape[1], wa_shape[2]).transpose(0, 1) - wb = wb.reshape(wb_shape[0] * wb_shape[1], wb_shape[2]) + wa = wa.reshape(wa.shape[0] * wa.shape[1], wa.shape[2]).transpose(0, 1) + wb = wb.reshape(wb.shape[0] * wb.shape[1], wb.shape[2]) + out = x @ wa assert (out.shape == mask.shape) out = out * mask @@ -241,34 +243,28 @@ def dispatch_bgmv_linear( def dispatch_bgmv_embedding( y: torch.Tensor, x: torch.Tensor, - wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, indices: torch.LongTensor, layer_idx: int, scale: float, ): """ - `wa_t_all` contains all LoRA A weight matrices stacked into a single tensor - assuming same rank. HPU handles no-LoRA requests using zero valued A - tensor. This zero valued tensor is appended at the end of `wa_t_all` during - initialization. For custom BGMV, the corresponding wa for each batch is - created based on the lora_index of the sample. - - For example: - `wa_t_all` is tensor of shape (num_loras, num_layers, lora_rank, - hidden_dim), where `wa_t_all[-1]` is zero valued tensor which handles - no-LoRA case. The wa tensor for a batch of size batch_Size will have a - shape of (batch_size, num_layers, lora_rank, hidden_dim) - - - This method avoids for-loop as well as graph breaks. 
+ `wb_t_all` contains all LoRA-B weight matrices stacked at dimension 0 into + a single tensor, assuming same rank. `wb` is the transposed and reshaped + version of `wb_t_all` of shape (num_loras * lora_rank, embedding_dim). + + Output of LoRA-A embedding (tensor x) is repeated max_loras times to match + the shape of `wb`. Multiply `x` with a mask to zero-out inputs of inactive + LoRA indices. Matmul masked output with `wb` and scale it to get the final + output. """ + assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' - max_loras = wa_t_all.size(0) - # Wrap-around for negative indices - indices = indices % max_loras - wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) + max_loras = wb_t_all.size(0) - x = x.unsqueeze(1) - out = x @ wa - out = out.squeeze(1) + x = x.repeat(1, max_loras) + x = x * LoraMask.getLoraMask() + wb = wb_t_all[:, 0, :, :].transpose(1, 2) + wb = wb.reshape(wb.shape[0] * wb.shape[1], wb.shape[2]) + out = x @ wb y += out * scale From ededdaf38bb7a141c9db03a5df060c72eca68b51 Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Wed, 4 Sep 2024 15:29:24 +0300 Subject: [PATCH 165/341] Remove special handling of no-LoRA case --- vllm/lora/models.py | 20 +++----------------- vllm/worker/habana_model_runner.py | 6 +++--- 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 30d2fd9502977..e8d39591cb17a 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -24,7 +24,7 @@ from vllm.lora.utils import (from_layer, from_layer_logits_processor, parse_fine_tuned_lora_name, replace_submodule) from vllm.model_executor.models.interfaces import SupportsLoRA -from vllm.utils import get_device, is_hpu, is_pin_memory_available +from vllm.utils import get_device, is_pin_memory_available logger = init_logger(__name__) @@ -465,25 +465,11 @@ def __init__( @property def capacity(self) -> int: - if is_hpu(): - # HPU handles no LoRA requests using zero valued A and B tensors. - # These zero valued tensors are appended at the end of A and B, - # making total number of loras to be lora_config.max_cpu_loras + 1. - # This demands the total number of max_cpu_loras to be - # lora_config.max_cpu_loras + 1 - return self.lora_config.max_cpu_loras + 1 - else: - return self.lora_config.max_cpu_loras + return self.lora_config.max_cpu_loras @property def lora_slots(self) -> int: - if is_hpu(): - # HPU handles no LoRA requests using zero valued A and B tensors. - # These zero valued tensors are appended at the end of A and B, - # making total number of loras to be lora_config.max_cpu_loras + 1. 
- return self.lora_config.max_loras + 1 - else: - return self.lora_config.max_loras + return self.lora_config.max_loras @property def adapter_slots(self) -> int: diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index a9a3f35d3934b..bf708c9ab01d7 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -752,11 +752,11 @@ def _prepare_prompt( if self.lora_config: lora_mask = torch.zeros(len(seq_group_metadata_list) * max_prompt_len, - (self.lora_config.max_loras + 1) * + (self.lora_config.max_loras) * self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) lora_logits_mask = torch.zeros(len(seq_group_metadata_list), - (self.lora_config.max_loras + 1) * + (self.lora_config.max_loras) * self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) @@ -880,7 +880,7 @@ def _prepare_decode( if self.lora_config: lora_mask = torch.zeros(len(seq_group_metadata_list), - (self.lora_config.max_loras + 1) * + (self.lora_config.max_loras) * self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) ones = torch.ones(1, From b507cc4a33c79e241072c6ebf8ec9cf2189ee90a Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Wed, 4 Sep 2024 17:50:08 +0300 Subject: [PATCH 166/341] Update test --- tests/lora/test_multilora_hpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lora/test_multilora_hpu.py b/tests/lora/test_multilora_hpu.py index edca64fd5a2ae..64eda037ff059 100644 --- a/tests/lora/test_multilora_hpu.py +++ b/tests/lora/test_multilora_hpu.py @@ -96,7 +96,7 @@ def _test_llama_multilora(sql_lora_files, tp_size): enable_lora=True, max_loras=2, max_lora_rank=8, - max_num_seqs=16, + max_num_seqs=256, dtype='float32', tensor_parallel_size=tp_size) engine = LLMEngine.from_engine_args(engine_args) From 016f34351260796dd35ab3c1a06719945ab04067 Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Fri, 6 Sep 2024 08:53:40 +0300 Subject: [PATCH 167/341] Fix formatting --- vllm/worker/habana_model_runner.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index bf708c9ab01d7..4be178c6fb168 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -750,11 +750,10 @@ def _prepare_prompt( lora_logits_mask: torch.Tensor = None counter = 0 if self.lora_config: - lora_mask = torch.zeros(len(seq_group_metadata_list) * - max_prompt_len, - (self.lora_config.max_loras) * - self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) + lora_mask = torch.zeros( + len(seq_group_metadata_list) * max_prompt_len, + (self.lora_config.max_loras) * self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) lora_logits_mask = torch.zeros(len(seq_group_metadata_list), (self.lora_config.max_loras) * self.lora_config.max_lora_rank, From d9fa7cfccd6e858916dce70dac24a1ae339097fa Mon Sep 17 00:00:00 2001 From: Michal Adamczyk Date: Fri, 6 Sep 2024 15:00:06 +0200 Subject: [PATCH 168/341] Dispersed dummy slots (#243) Use all possible slot values for dummy blocks to avoid caching issues. 
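A minimal standalone sketch of the idea (the constants below are illustrative stand-ins for `_PAD_SLOT_ID` and `self.block_size`, not values taken from the runner): padding entries in `slot_mapping` are rotated across the whole dummy block instead of all pointing at a single pad slot.

```python
import itertools

# Illustrative values; the runner takes these from _PAD_SLOT_ID and self.block_size.
PAD_SLOT_ID = 0
BLOCK_SIZE = 4

# Two decode sequences, each padded with two dummy entries.
slot_mapping = [[17, PAD_SLOT_ID, PAD_SLOT_ID], [42, PAD_SLOT_ID, PAD_SLOT_ID]]

# Cycle padding through every slot of the dummy block, as the diff below does.
dummy_slots = itertools.cycle(range(PAD_SLOT_ID, PAD_SLOT_ID + BLOCK_SIZE))
dispersed = [[s if s != PAD_SLOT_ID else next(dummy_slots) for s in row]
             for row in slot_mapping]
print(dispersed)  # [[17, 0, 1], [42, 2, 3]]
```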
--- vllm/worker/habana_model_runner.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index a9a3f35d3934b..166ad760d27ca 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -48,7 +48,11 @@ logger = init_logger(__name__) +# These values are assumed to be zero in several places. +# Use caution when updating them! _PAD_SLOT_ID = 0 +_PAD_BLOCK_ID = 0 + LORA_WARMUP_RANK = 8 _TYPE_CACHE = {} @@ -937,6 +941,13 @@ def _prepare_decode( input_positions = torch.tensor(input_positions, dtype=torch.long, device=self.device) + + dummy_slots = itertools.cycle( + range(_PAD_SLOT_ID, _PAD_SLOT_ID + self.block_size)) + slot_mapping = [[ + s if s != _PAD_SLOT_ID else next(dummy_slots) for s in sl + ] for sl in slot_mapping] + slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=self.device) @@ -1193,7 +1204,7 @@ def create_dummy_seq_group_metadata(self, else: input_len = seq_len - 1 output_len = 1 - block_tables = {group_id: [0] * num_blocks} + block_tables = {group_id: [_PAD_BLOCK_ID] * num_blocks} prompt_token_ids = [0] * input_len output_token_ids = [1] * output_len seq_data = SequenceData(prompt_token_ids) From 7488c584ddb36a81a900614d434445e1d66dbcf0 Mon Sep 17 00:00:00 2001 From: Marceli Fylcek Date: Fri, 6 Sep 2024 15:16:33 +0200 Subject: [PATCH 169/341] Use PT_COMPILE_ONLY_MODE during warmup (#227) With PT_COMPILE_ONLY_MODE flag, graphs can be compiled without performing synLaunch. The flag has been added to the warmup phase to decrease its execution time. --- vllm/worker/habana_model_runner.py | 125 +++++++++++++++-------------- 1 file changed, 66 insertions(+), 59 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 166ad760d27ca..9dc02fba0213a 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -15,6 +15,7 @@ Optional, Set, Tuple, Type, TypeVar, Union) import habana_frameworks.torch as htorch +import habana_frameworks.torch.internal.bridge_config as bc import torch from vllm.attention import AttentionMetadata, get_attn_backend @@ -1402,67 +1403,73 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: self.profiler.start('internal', 'warmup') start_mem = HabanaMemoryProfiler.current_device_memory_usage() start_time = time.perf_counter() - self.warmup_all_buckets(self.prompt_buckets, True, kv_caches) - self.warmup_all_buckets(self.decode_buckets, False, kv_caches) - - if not self.enforce_eager and htorch.utils.internal.is_lazy(): - assert self.mem_margin is not None, \ - ("HabanaWorker.determine_num_available_blocks needs " - "to be called before warming up the model.") - free_mem = HabanaMemoryProfiler.current_free_device_memory() - graph_free_mem = free_mem - self.mem_margin - graph_free_mem = align_workers(graph_free_mem, - torch.distributed.ReduceOp.MIN) - prompt_graph_mem_ratio = float( - os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.5')) - prompt_available_memory = prompt_graph_mem_ratio * graph_free_mem - decode_available_memory = graph_free_mem - prompt_available_memory - msg = (f"Using {format_bytes(graph_free_mem)}" - f"/{format_bytes(free_mem)} " - "of free device memory for HPUGraphs, " - f"{format_bytes(prompt_available_memory)} for prompt and " - f"{format_bytes(decode_available_memory)} for decode " - f"(VLLM_GRAPH_PROMPT_RATIO={prompt_graph_mem_ratio})") - logger.info(msg) - prompt_strategy = 
os.environ.get('VLLM_GRAPH_PROMPT_STRATEGY', - 'min_tokens') - decode_strategy = os.environ.get('VLLM_GRAPH_DECODE_STRATEGY', - 'max_bs') - mem_post_prompt, prompt_batch_seq, prompt_captured_all = \ - self.warmup_graphs( - prompt_strategy, self.prompt_buckets, True, kv_caches, - prompt_available_memory) - mem_post_decode, decode_batch_seq, decode_captured_all = \ - self.warmup_graphs( - decode_strategy, self.decode_buckets, False, kv_caches, - decode_available_memory) - - # Not all prompt buckets were captured, but all decode buckets were - # captured and we have some free graph-allocated space left. - # Let's try to use it for capturing more prompt buckets. - if mem_post_decode + mem_post_prompt < graph_free_mem \ - and not prompt_captured_all \ - and decode_captured_all: - mem_post_prompt, _, prompt_captured_all = self.warmup_graphs( + + with bc.env_setting("PT_COMPILE_ONLY_MODE", True): + self.warmup_all_buckets(self.prompt_buckets, True, kv_caches) + self.warmup_all_buckets(self.decode_buckets, False, kv_caches) + + if not self.enforce_eager and htorch.utils.internal.is_lazy(): + assert self.mem_margin is not None, \ + ("HabanaWorker.determine_num_available_blocks needs " + "to be called before warming up the model.") + free_mem = HabanaMemoryProfiler.current_free_device_memory() + graph_free_mem = free_mem - self.mem_margin + graph_free_mem = align_workers(graph_free_mem, + torch.distributed.ReduceOp.MIN) + prompt_graph_mem_ratio = float( + os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.5')) + prompt_available_memory = (prompt_graph_mem_ratio * + graph_free_mem) + decode_available_memory = (graph_free_mem - + prompt_available_memory) + msg = ( + f"Using {format_bytes(graph_free_mem)}" + f"/{format_bytes(free_mem)} " + "of free device memory for HPUGraphs, " + f"{format_bytes(prompt_available_memory)} for prompt and " + f"{format_bytes(decode_available_memory)} for decode " + f"(VLLM_GRAPH_PROMPT_RATIO={prompt_graph_mem_ratio})") + logger.info(msg) + prompt_strategy = os.environ.get('VLLM_GRAPH_PROMPT_STRATEGY', + 'min_tokens') + decode_strategy = os.environ.get('VLLM_GRAPH_DECODE_STRATEGY', + 'max_bs') + mem_post_prompt, prompt_batch_seq, prompt_captured_all = \ + self.warmup_graphs( prompt_strategy, self.prompt_buckets, True, kv_caches, - graph_free_mem - mem_post_prompt - mem_post_decode, - mem_post_prompt, prompt_batch_seq) - - # Not all decode buckets were captured, but all prompt buckets were - # captured and we have some free graph-allocated space left. - # Let's try to use it for capturing more decode buckets. - if mem_post_decode + mem_post_prompt < graph_free_mem \ - and not decode_captured_all \ - and prompt_captured_all: - mem_post_decode, _, _ = self.warmup_graphs( + prompt_available_memory) + mem_post_decode, decode_batch_seq, decode_captured_all = \ + self.warmup_graphs( decode_strategy, self.decode_buckets, False, kv_caches, - graph_free_mem - mem_post_prompt - mem_post_decode, - mem_post_decode, decode_batch_seq) - - self.log_graph_warmup_summary(self.prompt_buckets, True, - mem_post_prompt) - self.log_graph_warmup_summary(self.decode_buckets, False, - mem_post_decode) + decode_available_memory) + + # Not all prompt buckets were captured, but all decode buckets + # were captured and we have some free graph-allocated space + # left. Let's try to use it for capturing more prompt buckets. 
+ if (mem_post_decode + mem_post_prompt < graph_free_mem + and not prompt_captured_all and decode_captured_all): + mem_post_prompt, _, prompt_captured_all = ( + self.warmup_graphs( + prompt_strategy, self.prompt_buckets, True, + kv_caches, + graph_free_mem - mem_post_prompt - mem_post_decode, + mem_post_prompt, prompt_batch_seq)) + + # Not all decode buckets were captured, but all prompt buckets + # were captured and we have some free graph-allocated space + # left. Let's try to use it for capturing more decode buckets. + if mem_post_decode + mem_post_prompt < graph_free_mem \ + and not decode_captured_all \ + and prompt_captured_all: + mem_post_decode, _, _ = self.warmup_graphs( + decode_strategy, self.decode_buckets, False, kv_caches, + graph_free_mem - mem_post_prompt - mem_post_decode, + mem_post_decode, decode_batch_seq) + + self.log_graph_warmup_summary(self.prompt_buckets, True, + mem_post_prompt) + self.log_graph_warmup_summary(self.decode_buckets, False, + mem_post_decode) end_time = time.perf_counter() end_mem = HabanaMemoryProfiler.current_device_memory_usage() From 17447ede71a79e020c174c34f8c993cebc616952 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 6 Sep 2024 15:18:36 +0200 Subject: [PATCH 170/341] Do not pass warmup_mode to execute_model_kwargs (#229) This fixes a very silly issue where mismatching values of `warmup_mode` flag could cause graph recompilations and eventually memory leaks. --- vllm/worker/habana_model_runner.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 9dc02fba0213a..f9fa2e8af5ec4 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1704,10 +1704,7 @@ def execute_model( if multi_modal_input is not None: execute_model_kwargs.update(multi_modal_input) if htorch.utils.internal.is_lazy(): - execute_model_kwargs.update({ - "bypass_hpu_graphs": not use_graphs, - "warmup_mode": warmup_mode - }) + execute_model_kwargs.update({"bypass_hpu_graphs": not use_graphs}) htorch.core.mark_step() if self.is_driver_worker: From b50aa14998c7a5fc499daadd7af19dfd94b12d18 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 6 Sep 2024 16:14:11 +0200 Subject: [PATCH 171/341] Add error handling for PT_COMPILE_ONLY_MODE (#251) This PR fixes crashes observed on older Synapse builds introduced with https://github.com/HabanaAI/vllm-fork/pull/227. Setting PT_COMPILE_ONLY_MODE is not supported in current or older public Synapse builds, but we should not crash because of it, rather we should advise user to use the latest build. Previous behavior: ``` ... 
INFO 09-06 17:08:37 habana_executor.py:85] # HPU blocks: 10761, # CPU blocks: 910 INFO 09-06 17:08:37 habana_worker.py:201] Initializing cache engine took 47.29 GiB of device memory (54.34 GiB/94.62 GiB used) and -159.6 MiB of host memory (414.9 GiB/1007 GiB used) [rank0]: Traceback (most recent call last): [rank0]: File "/software/users/kzawora/vllm-utils/vllm_hpu_simple_test.py", line 9, in [rank0]: llm = LLM(model="facebook/opt-125m") [rank0]: File "/software/users/kzawora/vllm-fork/vllm/entrypoints/llm.py", line 155, in __init__ [rank0]: self.llm_engine = LLMEngine.from_engine_args( [rank0]: File "/software/users/kzawora/vllm-fork/vllm/engine/llm_engine.py", line 456, in from_engine_args [rank0]: engine = cls( [rank0]: File "/software/users/kzawora/vllm-fork/vllm/engine/llm_engine.py", line 266, in __init__ [rank0]: self._initialize_kv_caches() [rank0]: File "/software/users/kzawora/vllm-fork/vllm/engine/llm_engine.py", line 378, in _initialize_kv_caches [rank0]: self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks) [rank0]: File "/software/users/kzawora/vllm-fork/vllm/executor/habana_executor.py", line 89, in initialize_cache [rank0]: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) [rank0]: File "/software/users/kzawora/vllm-fork/vllm/worker/habana_worker.py", line 202, in initialize_cache [rank0]: self._warm_up_model() [rank0]: File "/software/users/kzawora/vllm-fork/vllm/worker/habana_worker.py", line 220, in _warm_up_model [rank0]: self.model_runner.warmup_model(self.hpu_cache[0]) [rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context [rank0]: return func(*args, **kwargs) [rank0]: File "/software/users/kzawora/vllm-fork/vllm/worker/habana_model_runner.py", line 1412, in warmup_model [rank0]: with compile_only_mode_context(): [rank0]: File "/usr/lib/python3.10/contextlib.py", line 135, in __enter__ [rank0]: return next(self.gen) [rank0]: File "/usr/local/lib/python3.10/dist-packages/habana_frameworks/torch/internal/bridge_config.py", line 20, in env_setting [rank0]: get_func = globals()['get_' + var.lower()] [rank0]: KeyError: 'get_pt_compile_only_mode' inc shutdown inc shutdown inc shutdown inc shutdown ``` Current behavior: ``` ... INFO 09-06 17:06:42 habana_executor.py:85] # HPU blocks: 10761, # CPU blocks: 910 INFO 09-06 17:06:43 habana_worker.py:201] Initializing cache engine took 47.29 GiB of device memory (54.34 GiB/94.62 GiB used) and -143.7 MiB of host memory (415 GiB/1007 GiB used) WARNING 09-06 17:06:43 habana_model_runner.py:1419] Cannot use PT_COMPILE_ONLY_MODE. Warmup time will be negatively impacted. Please update Gaudi Software Suite. INFO 09-06 17:06:43 habana_model_runner.py:1336] [Warmup][Prompt][1/23] batch_size:2 seq_len:1024 free_mem:40.28 GiB ... 
``` --- vllm/worker/habana_model_runner.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index f9fa2e8af5ec4..b62ea1c8afbfe 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -3,7 +3,9 @@ ############################################################################### import collections +import contextlib import dataclasses +import functools import gc import itertools import math @@ -1404,7 +1406,21 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: start_mem = HabanaMemoryProfiler.current_device_memory_usage() start_time = time.perf_counter() - with bc.env_setting("PT_COMPILE_ONLY_MODE", True): + compile_only_mode_context = functools.partial(bc.env_setting, + "PT_COMPILE_ONLY_MODE", + True) + can_use_compile_only_mode = True + try: + with compile_only_mode_context(): + pass + logger.debug("Using PT_COMPILE_ONLY_MODE.") + except KeyError: + can_use_compile_only_mode = False + logger.warning('Cannot use PT_COMPILE_ONLY_MODE. ' + 'Warmup time will be negatively impacted. ' + 'Please update Gaudi Software Suite.') + with compile_only_mode_context( + ) if can_use_compile_only_mode else contextlib.nullcontext(): self.warmup_all_buckets(self.prompt_buckets, True, kv_caches) self.warmup_all_buckets(self.decode_buckets, False, kv_caches) From 00f13331b0c0e65dc9004668b10131b8ca31c933 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar <49579433+hlahkar@users.noreply.github.com> Date: Mon, 9 Sep 2024 11:20:26 +0530 Subject: [PATCH 172/341] Hardcode fastapi version due to pydantic error (#255) Fixes serving mode issue; due to error in fastapi --- requirements-common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-common.txt b/requirements-common.txt index 3b8d473c1fe7a..7c12fc591f8f7 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -8,7 +8,7 @@ tqdm py-cpuinfo transformers >= 4.43.2 # Required for Chameleon and Llama 3.1 hotfox. tokenizers >= 0.19.1 # Required for Llama 3. -fastapi +fastapi == 0.112.2 # Hardcoding this to workaround issue with new fastapi. aiohttp openai uvicorn[standard] From 73af823681d008554076e39d7a5f406f422745d1 Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Mon, 9 Sep 2024 03:56:04 -0700 Subject: [PATCH 173/341] Eliminate graph breaks for torch.compile mode (#202) Eliminate two graph breaks for torch.compile mode: 1. [__graph_breaks] torch._dynamo.exc.Unsupported: builtin: eq [, ] False 2. [__graph_breaks] torch._dynamo.exc.Unsupported: Tensor.item ---
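A minimal sketch of the second break (not the patch itself; the numbers are arbitrary): the pass count depends only on two Python integers, so it can be computed host-side with `math.ceil` instead of going through a tensor and `.item()`, which forces a synchronizing graph break.

```python
import math

import torch

num_slots_requested, num_slots_available = 300, 128

# Graph-breaking variant: builds tensors and synchronizes via .item().
passes_tensor = torch.div(torch.tensor(num_slots_requested),
                          torch.tensor(num_slots_available)).ceil().int().item()

# Host-side variant, matching the cache_ops.py change below.
passes_host = math.ceil(num_slots_requested / num_slots_available)
assert passes_tensor == passes_host == 3
```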
--------- Signed-off-by: yuwenzho --- vllm/hpu/cache_ops.py | 8 ++++---- vllm/model_executor/models/gpt_bigcode.py | 6 ++++-- vllm/model_executor/models/llama.py | 6 ++++-- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index 98f109accea06..9042924f68b3d 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -5,6 +5,8 @@ # LICENSE file in the root directory of this source tree. ############################################################################### +import math + import habana_frameworks.torch as htorch import torch @@ -30,8 +32,7 @@ def reshape_and_cache(key, # lots of padding, or are doing warmup. # This loop is a workaround for this issue. Please remove it # once key_cache.index_put_(indices, offsets), key) works. - num_kv_cache_passes = torch.div(num_slots_requested, - num_slots_available).ceil().int().item() + num_kv_cache_passes = math.ceil(num_slots_requested / num_slots_available) for i in range(num_kv_cache_passes): start_idx = i * num_slots_available end_idx = (i + 1) * num_slots_available @@ -58,8 +59,7 @@ def prepare_to_cache(cache, slot_mapping): # lots of padding, or are doing warmup. # This loop is a workaround for this issue. Please remove it # once key_cache.index_put_(indices, offsets), key) works. - num_kv_cache_passes = torch.div(num_slots_requested, - num_slots_available).ceil().int().item() + num_kv_cache_passes = math.ceil(num_slots_requested / num_slots_available) return num_kv_cache_passes, num_slots_available, indices, offsets diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 3ae3c8c8f712c..5d4387dbb9f48 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -44,6 +44,8 @@ from .interfaces import SupportsLoRA +is_hpu = current_platform.is_hpu() + class GPTBigCodeAttention(nn.Module): @@ -225,13 +227,13 @@ def forward( position_embeds = self.wpe(position_ids) hidden_states = inputs_embeds + position_embeds - if current_platform.is_hpu(): + if is_hpu: import habana_frameworks.torch as htorch htorch.core.mark_step() for i in range(len(self.h)): layer = self.h[i] hidden_states = layer(hidden_states, kv_caches[i], attn_metadata) - if current_platform.is_hpu(): + if is_hpu: htorch.core.mark_step() hidden_states = self.ln_f(hidden_states) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index d659d0a3f1127..51716b12513d8 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -55,6 +55,8 @@ from .interfaces import SupportsLoRA from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers +is_hpu = current_platform.is_hpu() + class LlamaMLP(nn.Module): @@ -318,7 +320,7 @@ def forward( hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - if current_platform.is_hpu(): + if is_hpu: import habana_frameworks.torch as htorch htorch.core.mark_step() for i in range(self.start_layer, self.end_layer): @@ -330,7 +332,7 @@ def forward( attn_metadata, residual, ) - if current_platform.is_hpu(): + if is_hpu: htorch.core.mark_step() if not get_pp_group().is_last_rank: From 5cf8441311b341e60d6538c442656e48ab38d230 Mon Sep 17 00:00:00 2001 From: Dominika Olszewska Date: Tue, 10 Sep 2024 12:16:54 +0200 Subject: [PATCH 174/341] Port flat PA from habana_next to habana_main (#169) FILL IN THE PR DESCRIPTION HERE FIX #xxxx (*link existing issues this PR will resolve*) **BEFORE 
SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE** ---
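As a rough illustration of the decode block-bucket defaults documented in the README/docs changes below (the engine settings here are arbitrary examples, not taken from any real configuration):

```python
# Example engine settings; real values come from the vLLM configuration.
max_num_seqs = 256
max_model_len = 2048
block_size = 128

# Decode block-bucket defaults, per the documentation change below.
decode_block_bucket_min = 128
decode_block_bucket_step = 128
decode_block_bucket_max = max(128, (max_num_seqs * max_model_len) // block_size)

print(decode_block_bucket_min, decode_block_bucket_step, decode_block_bucket_max)
# -> 128 128 4096
```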
PR Checklist (Click to Expand)

Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process.

PR Title and Classification

Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:

  • [Bugfix] for bug fixes.
  • [CI/Build] for build or continuous integration improvements.
  • [Doc] for documentation fixes and improvements.
  • [Model] for adding a new model or improving an existing model. Model name should appear in the title.
  • [Frontend] For changes on the vLLM frontend (e.g., OpenAI API server, LLM class, etc.)
  • [Kernel] for changes affecting CUDA kernels or other compute kernels.
  • [Core] for changes in the core vLLM logic (e.g., LLMEngine, AsyncLLMEngine, Scheduler, etc.)
  • [Hardware][Vendor] for hardware-specific changes. Vendor name should appear in the prefix (e.g., [Hardware][AMD]).
  • [Misc] for PRs that do not fit the above categories. Please use this sparingly.

Note: If the PR spans more than one category, please include all relevant prefixes.

Code Quality

The PR need to meet the following code quality standards:

  • We adhere to Google Python style guide and Google C++ style guide.
  • Pass all linter checks. Please use format.sh to format your code.
  • The code need to be well-documented to ensure future contributors can easily understand the code.
  • Include sufficient tests to ensure the project to stay correct and robust. This includes both unit tests and integration tests.
  • Please add documentation to docs/source/ if the PR modifies the user-facing behaviors of vLLM. It helps vLLM user understand and utilize the new features or changes.

Notes for Large Changes

Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, the PR will be tagged with rfc-required and might not be reviewed.

What to Expect for the Reviews

The goal of the vLLM team is to be a transparent reviewing machine. We would like to make the review process transparent and efficient, and to make sure no contributor feels confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process:

  • After the PR is submitted, it will be assigned to a reviewer. Every reviewer will pick up PRs based on their expertise and availability.
  • After the PR is assigned, the reviewer will provide status updates every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.
  • After the review, the reviewer will put an action-required label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.
  • Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.

Thank You

Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone!
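The documentation diff that follows (README_GAUDI.md and gaudi-installation.rst) replaces the decode sequence-length buckets with block-count buckets and re-documents the `VLLM_{phase}_{dim}_BUCKET_{param}` variables. As a rough, hypothetical sketch of how these knobs are consumed, the snippet below sets a few of them before engine construction; the variable names come from the documentation updated in this patch, while the concrete values and the model name are placeholders, not recommendations.

```python
# Hypothetical configuration sketch: variable names follow the bucketing
# documentation updated in this patch; values and model are illustrative only.
import os

# Prompt-phase buckets are still keyed on batch size and sequence length.
os.environ["VLLM_PROMPT_BS_BUCKET_MIN"] = "1"
os.environ["VLLM_PROMPT_BS_BUCKET_STEP"] = "32"
os.environ["VLLM_PROMPT_SEQ_BUCKET_MAX"] = "2048"

# Decode-phase buckets are now expressed in total KV-cache blocks.
os.environ["VLLM_DECODE_BS_BUCKET_MIN"] = "32"
os.environ["VLLM_DECODE_BLOCK_BUCKET_MIN"] = "128"
os.environ["VLLM_DECODE_BLOCK_BUCKET_STEP"] = "128"

# The variables are read during model-runner setup, so they must be set
# before the engine is created.
from vllm import LLM

llm = LLM(model="meta-llama/Llama-2-7b-hf")  # model name is a placeholder
```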

--------- Co-authored-by: Michal Adamczyk Co-authored-by: barak goldberg <149692267+bgoldberg-habana@users.noreply.github.com> Co-authored-by: Michal Szutenberg <37601244+szutenberg@users.noreply.github.com> Co-authored-by: Jan Kaniecki --- README_GAUDI.md | 22 +- .../getting_started/gaudi-installation.rst | 14 +- vllm/attention/backends/habana_attn.py | 136 ++----- vllm/attention/ops/habana_paged_attn.py | 51 +-- vllm/hpu/ops.py | 114 +++--- vllm/hpu/utils.py | 7 +- vllm/worker/habana_model_runner.py | 365 +++++++++++------- 7 files changed, 330 insertions(+), 379 deletions(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index 91bcbe49405eb..5109f7ddf9927 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -455,12 +455,12 @@ Environment variables - `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism - `{phase}` is either `PROMPT` or `DECODE` - - `{dim}` is either `BS` or `SEQ` + - `{dim}` is either `BS`, `SEQ` or `BLOCK` - `{param}` is either `MIN`, `STEP` or `MAX` - Default values: - Prompt: - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` - - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `32` + - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)` - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): @@ -468,20 +468,20 @@ Environment variables - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): - `1024` + `max_model_len` - Decode: - - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` + - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `min(max_num_seqs, 32)` - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): - `128` + `min(max_num_seqs, 32)` - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` - - sequence length min (`VLLM_DECODE_SEQ_BUCKET_MIN`): - `block_size` - - sequence length step - (`VLLM_DECODE_SEQ_BUCKET_STEP`): `block_size` - - sequence length max (`VLLM_DECODE_SEQ_BUCKET_MAX`): - `2048` + - block size min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): + `128` + - block size step + (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `128` + - block size max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): + `max(128, (max_num_seqs*max_model_len)/block_size)` Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index b3234d10b3115..ed3beabb2c8aa 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -335,19 +335,19 @@ Environment variables - Prompt: - batch size min (``VLLM_PROMPT_BS_BUCKET_MIN``): ``1`` - - batch size step (``VLLM_PROMPT_BS_BUCKET_STEP``): ``32`` + - batch size step (``VLLM_PROMPT_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)`` - batch size max (``VLLM_PROMPT_BS_BUCKET_MAX``): ``min(max_num_seqs, 64)`` - sequence length min (``VLLM_PROMPT_SEQ_BUCKET_MIN``): ``block_size`` - sequence length step (``VLLM_PROMPT_SEQ_BUCKET_STEP``): ``block_size`` - - sequence length max (``VLLM_PROMPT_SEQ_BUCKET_MAX``): ``1024`` + - sequence length max (``VLLM_PROMPT_SEQ_BUCKET_MAX``): ``max_model_len`` - Decode: - - batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``1`` - - batch size step (``VLLM_DECODE_BS_BUCKET_STEP``): ``128`` + - batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``min(max_num_seqs, 32)`` + - batch size step (``VLLM_DECODE_BS_BUCKET_STEP``): 
``min(max_num_seqs, 32)`` - batch size max (``VLLM_DECODE_BS_BUCKET_MAX``): ``max_num_seqs`` - - sequence length min (``VLLM_DECODE_SEQ_BUCKET_MIN``): ``block_size`` - - sequence length step (``VLLM_DECODE_SEQ_BUCKET_STEP``): ``block_size`` - - sequence length max (``VLLM_DECODE_SEQ_BUCKET_MAX``): ``2048`` + - sequence length min (``VLLM_DECODE_SEQ_BUCKET_MIN``): ``128`` + - sequence length step (``VLLM_DECODE_SEQ_BUCKET_STEP``): ``128`` + - sequence length max (``VLLM_DECODE_SEQ_BUCKET_MAX``): ``max(128, (max_num_seqs*max_model_len)/block_size)`` Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 2259630fa10b7..20b0f2bc7630b 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -58,58 +58,14 @@ def copy_blocks( @dataclass -class HabanaAttentionMetadata(AttentionMetadata, HabanaPagedAttentionMetadata): - """Metadata for HabanaAttentionbackend. - - NOTE: Any python object stored here is not updated when it is - cuda-graph replayed. If you have values that need to be changed - dynamically, it should be stored in tensor. The tensor has to be - updated from `CUDAGraphRunner.forward` API. - """ +class HabanaAttentionMetadata(HabanaPagedAttentionMetadata, AttentionMetadata): + """Metadata for HabanaAttentionbackend.""" # Currently, input sequences can only contain all prompts # or all decoding. True if all sequences are prompts. is_prompt: bool - # (batch_size,). The sequence length per sequence. Sequence length means - # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] - # seq_lens stored as a tensor. + attn_bias: Optional[torch.Tensor] seq_lens_tensor: Optional[torch.Tensor] - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ----------------------| - # |-- query_len ---| - - # Maximum query length in the batch. - max_query_len: Optional[int] - # (batch_size + 1,). The cumulative subquery lengths of the sequences in - # the batch, used to index into subquery. E.g., if the subquery length - # is [4, 6], it is [0, 4, 10]. - subquery_start_loc: Optional[torch.Tensor] - # FIXME: It is for flash attn. - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. - seq_start_loc: Optional[torch.Tensor] - # (batch_size,) A tensor of context lengths (tokens that are computed - # so far). - context_lens_tensor: Optional[torch.Tensor] - - # Whether or not if cuda graph is enabled. - # Cuda-graph is currently enabled for decoding only. - # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. - use_cuda_graph: bool - - def __post_init__(self): - # Set during the execution of the first attention op. - # It is a list because it is needed to set per prompt - # when alibi slopes is used. It is because of the limitation - # from xformer API. - # will not appear in the __repr__ and __init__ - self.attn_bias: Optional[torch.Tensor] = None - class HabanaAttentionImpl(AttentionImpl, torch.nn.Module): """ @@ -229,60 +185,48 @@ def forward( if attn_metadata.is_prompt: # Prompt run. 
- if kv_cache is None or attn_metadata.block_tables.numel() == 0: - if not self.prefill_usefusedsdpa: - # TODO: move this outside of model - assert attn_metadata.attn_bias is not None, \ + if not self.prefill_usefusedsdpa: + # TODO: move this outside of model + assert attn_metadata.attn_bias is not None, \ 'attn_bias must be set before calling model.forward!' - attn_bias = attn_metadata.attn_bias - if self.alibi_slopes is not None and \ - self.position_bias is not None: - attn_bias.add_(self.position_bias[:, :, - -attn_bias.size(2):, - -attn_bias.size(3):]) - else: - attn_bias = None - - query_shape = (batch_size, seq_len, self.num_heads, - self.head_size) - kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, - self.head_size) - out = ops.prompt_attention( - query.view(query_shape), - key.view(kv_shape), - value.view(kv_shape), - attn_bias=attn_bias, - p=0.0, - scale=self.scale, - matmul_qk_op=self.matmul_qk, - softmax_op=self.softmax, - matmul_av_op=self.matmul_av, - valid_seq_lengths=attn_metadata.seq_lens_tensor, - ) - output = out.reshape(batch_size, seq_len, hidden_size) + attn_bias = attn_metadata.attn_bias + if self.alibi_slopes is not None and \ + self.position_bias is not None: + attn_bias.add_(self.position_bias[:, :, + -attn_bias.size(2):, + -attn_bias.size(3):]) else: - # prefix-enabled attention - output = HabanaPagedAttention.forward_prefix( - query, - key, - value, - key_cache, - value_cache, - attn_metadata.block_tables, - attn_metadata.subquery_start_loc, - attn_metadata.seq_lens_tensor, - attn_metadata.context_lens_tensor, - attn_metadata.max_query_len, - self.alibi_slopes, - ) + attn_bias = None + + query_shape = (batch_size, seq_len, self.num_heads, self.head_size) + kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, + self.head_size) + out = ops.prompt_attention( + query.view(query_shape), + key.view(kv_shape), + value.view(kv_shape), + attn_bias=attn_bias, + p=0.0, + scale=self.scale, + matmul_qk_op=self.matmul_qk, + softmax_op=self.softmax, + matmul_av_op=self.matmul_av, + ) + output = out.reshape(batch_size, seq_len, hidden_size) else: # Decoding run. output = HabanaPagedAttention.forward_decode( - query, key_cache, value_cache, attn_metadata.block_tables, - attn_metadata.seq_lens_tensor, self.kv_cache_dtype, - self.num_kv_heads, self.scale, self.position_bias, k_scale, - v_scale, self.matmul_qk, self.softmax, self.matmul_av, - self.k_cache, self.v_cache) + query=query, + key_cache=key_cache, + value_cache=value_cache, + block_list=attn_metadata.block_list, + block_mapping=attn_metadata.block_mapping, + block_bias=attn_metadata.attn_bias, + scale=self.scale, + matmul_qk_op=self.matmul_qk, + matmul_av_op=self.matmul_av, + keys_fetch_func=self.k_cache.fetch_from_cache, + values_fetch_func=self.v_cache.fetch_from_cache) # Reshape the output tensor. return output.view(batch_size, seq_len, hidden_size) diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py index 9602886299c47..cab8d7abe95fd 100644 --- a/vllm/attention/ops/habana_paged_attn.py +++ b/vllm/attention/ops/habana_paged_attn.py @@ -16,16 +16,9 @@ @dataclass class HabanaPagedAttentionMetadata: """Metadata for PagedAttention.""" - # (batch_size,). The length of sequences (entire tokens seen so far) per - # sequence. - seq_lens_tensor: Optional[torch.Tensor] - # (batch_size, max_blocks_per_seq). - # Block addresses per sequence. (Seq id -> list of physical block) - # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks - # in the kv cache. 
Each block can contain up to block_size tokens. - # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph - # captured. - block_tables: Optional[torch.Tensor] + block_list: Optional[torch.Tensor] + block_mapping: Optional[torch.Tensor] + block_usage: Optional[torch.Tensor] class HabanaPagedAttention: @@ -63,42 +56,8 @@ def write_to_paged_cache(key: torch.Tensor, value: torch.Tensor, slot_mapping, kv_cache_dtype, is_prompt) @staticmethod - def forward_decode( - query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - block_tables: torch.Tensor, - seq_lens: torch.Tensor, - kv_cache_dtype: str, - num_kv_heads: int, - scale: float, - alibi_slopes: Optional[torch.Tensor], - k_scale: float, - v_scale: float, - matmul_qk_op, - softmax_op, - matmul_av_op, - k_cache_cls, - v_cache_cls, - ) -> torch.Tensor: - block_size = value_cache.shape[1] - return ops.paged_attention_v1( - query, - key_cache, - value_cache, - num_kv_heads, - scale, - block_tables, - seq_lens, - block_size, - alibi_slopes, - kv_cache_dtype, - matmul_qk_op, - softmax_op, - matmul_av_op, - k_cache_cls, - v_cache_cls, - ) + def forward_decode(**kwargs) -> torch.Tensor: + return ops.flat_pa(**kwargs) @staticmethod def forward_prefix( diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index bacb755b39393..b2705429906c4 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD license found in the # LICENSE file in the root directory of this source tree. ############################################################################### -import os from typing import Optional import habana_frameworks.torch as htorch @@ -29,72 +28,57 @@ logger.warning("Could not import HPU FusedSDPA kernel. " "vLLM will use native implementation.") -PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '1') == '1') - - -def fetch_from_cache(cache, blocks, permutations): - return [ - cache.index_select(0, blocks[:, i]).permute(permutations) - for i in range(blocks.size(1)) - ] - - -def paged_attention_v1(query, - key_cache, - value_cache, - head_mapping, - scale, - block_tables, - context_lens, - block_size, - alibi_slopes=None, - kv_cache_dtype=None, - matmul_qk_op=torch.matmul, - softmax_op=torch.softmax, - matmul_av_op=torch.matmul, - k_cache_cls=None, - v_cache_cls=None) -> None: - seq_len = block_tables.size(1) - batch_size, query_heads, _ = query.shape - _, _, kv_heads, _ = key_cache.shape - min_inf = torch.finfo(query.dtype).min - mask = (torch.arange(0, - seq_len * block_size, - dtype=torch.int32, - device=key_cache.device).view(1, -1).expand( - batch_size, -1).ge(context_lens.view(-1, 1)).view( - batch_size, 1, 1, -1)) - query.mul_(scale) - query = query.unsqueeze(-2) - fetch_keys = fetch_from_cache if k_cache_cls is None else \ - k_cache_cls.fetch_from_cache - keys = fetch_keys(key_cache, block_tables, (0, 2, 3, 1)) - if query_heads != kv_heads: + +def batch2block(tensor, block_mapping): + shape = tuple(tensor.shape) + return (block_mapping @ tensor.view(shape[0], -1)).view(-1, *shape[1:]) + + +def block2batch(tensor, block_mapping): + shape = tuple(tensor.shape) + return (block_mapping.t() @ tensor.view(shape[0], -1)).view(-1, *shape[1:]) + + +def block_softmax(batch_size, attn, block_mapping): + attn.sub_(10.0) + attn = attn.exp_() + sums = attn.sum(dim=-1).unsqueeze(-1) + sums = block2batch(sums, block_mapping) + sums = batch2block(sums, block_mapping) + sums.add_(1.0e-12) + attn.div_(sums) + return attn + + +def flat_pa(query, key_cache, value_cache, 
block_list, block_mapping, + block_bias, scale, matmul_qk_op, matmul_av_op, keys_fetch_func, + values_fetch_func): + batch_size = query.size(0) + q_heads = query.size(1) + kv_heads = key_cache.size(2) + + query = batch2block(scale * query, block_mapping).unsqueeze(-2) + key = keys_fetch_func(key_cache, block_list).transpose(1, 2) + value = values_fetch_func(value_cache, block_list).transpose(1, 2) + block_bias = block_bias.view(key.size(0), 1, 1, -1) + + if kv_heads != q_heads: + block_bias = block_bias.unsqueeze(1) query = query.unflatten(1, (kv_heads, -1)) - keys = [k.unflatten(1, (kv_heads, 1)) for k in keys] - mask = mask.unsqueeze(2) - - attn_weights = torch.cat([matmul_qk_op(query, k) for k in keys], dim=-1) - if alibi_slopes is not None: - attn_weights.add_(alibi_slopes[:, :, -attn_weights.size(2):, - -attn_weights.size(3):]) - attn_weights = softmax_op(attn_weights.masked_fill(mask, min_inf), dim=-1) - - fetch_values = fetch_from_cache if v_cache_cls is None else \ - v_cache_cls.fetch_from_cache - values = fetch_values(value_cache, block_tables, (0, 2, 1, 3)) - if PA_SPLIT_VALUE: - attn_weights = attn_weights.split(block_size, dim=-1) + key = key.unflatten(1, (kv_heads, 1)) + value = value.unflatten(1, (kv_heads, 1)) + key = key.transpose(3, 4) else: - values = [torch.cat(values, dim=-2)] - attn_weights = [attn_weights] - if query_heads != kv_heads: - values = [v.unflatten(1, (kv_heads, 1)) for v in values] - attn_weights = [matmul_av_op(a, v) for a, v in zip(attn_weights, values)] - if query_heads != kv_heads: - attn_weights = [a.flatten(1, 2) for a in attn_weights] - attn_weights = sum(attn_weights) - return attn_weights.squeeze(-2) + key = key.transpose(2, 3) + + attn = matmul_qk_op(query, key) + block_bias + attn = block_softmax(batch_size, attn, block_mapping) + attn = matmul_av_op(attn, value) + attn = block2batch(attn, block_mapping) + attn = attn.squeeze(-2) + if kv_heads != q_heads: + attn = attn.flatten(1, 2) + return attn def silu_and_mul(x: torch.Tensor) -> torch.Tensor: diff --git a/vllm/hpu/utils.py b/vllm/hpu/utils.py index 3d9c7cb1c4c22..13204b83d5742 100644 --- a/vllm/hpu/utils.py +++ b/vllm/hpu/utils.py @@ -57,8 +57,5 @@ def forward(self, input, cache, num_kv_cache_passes, num_slots_available, block_offset) return cache - def fetch_from_cache(self, cache, blocks, permutations): - return [ - cache.index_select(0, blocks[:, i]).permute(permutations) - for i in range(blocks.size(1)) - ] + def fetch_from_cache(self, cache, blocks): + return cache.index_select(0, blocks) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index a4ade587db089..a6bd5e5f68745 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -51,29 +51,47 @@ logger = init_logger(__name__) +_TYPE_CACHE = {} # These values are assumed to be zero in several places. # Use caution when updating them! 
_PAD_SLOT_ID = 0 _PAD_BLOCK_ID = 0 LORA_WARMUP_RANK = 8 -_TYPE_CACHE = {} + + +def subtuple(obj: object, + typename: str, + to_copy: List[str], + to_override: Optional[Dict[str, object]] = None): + if obj is None: + return None + if to_override is None: + to_override = {} + fields = set(to_copy) | set(to_override.keys()) + values = {f: to_override.get(f, getattr(obj, f)) for f in fields} + if typename not in _TYPE_CACHE: + _TYPE_CACHE[typename] = collections.namedtuple(typename, + ' '.join(fields)) + return _TYPE_CACHE[typename](**values) def read_bucket_settings(phase: str, dim: str, **defaults): """Read bucketing configuration from env variables. phase is either 'prompt' or 'decode' - dim is either 'bs' or 'block' + dim is either 'bs', 'seq' or 'block' param is either 'min', 'step' or 'max' example env variable: VLLM_DECODE_BS_BUCKET_STEP=128 """ params = ['min', 'step', 'max'] + env_vars = [f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper() for p in params] + default_values = [defaults[p] for p in params] values = [ - int( - os.environ.get(f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper(), - defaults[p])) for p in params + int(os.environ.get(e, d)) for e, d in zip(env_vars, default_values) ] + for e, v, d in zip(env_vars, values, defaults): + logger.info('%s=%s (default:%s)', e, v, d) return values @@ -103,9 +121,9 @@ def warmup_range(config: Tuple[int, int, int]): return list(filter(lambda bucket: bucket >= bmin, buckets)) -def warmup_buckets(bs_bucket_config, - seq_bucket_config, - max_num_batched_tokens=None): +def generate_prompt_buckets(bs_bucket_config, + seq_bucket_config, + max_num_batched_tokens=None): buckets = list( itertools.product(warmup_range(bs_bucket_config), warmup_range(seq_bucket_config))) @@ -150,6 +168,19 @@ def warmup_buckets(bs_bucket_config, return captured_buckets, omitted_buckets +def generate_decode_buckets(bs_bucket_config, blocks_bucket_config, + max_blocks): + buckets = [] + for bs in warmup_range(bs_bucket_config): + for blocks in warmup_range(blocks_bucket_config): + if blocks < bs: + continue + if blocks > max_blocks: + break + buckets.append((bs, blocks)) + return list(sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))) + + def next_pow2(value: int, base: int): res = base while value > 1: @@ -169,22 +200,6 @@ def find_bucket(value: int, config: Tuple[int, int, int]): return max(bmin, min(next_step, next_pow)) -def subtuple(obj: object, - typename: str, - to_copy: List[str], - to_override: Optional[Dict[str, object]] = None): - if to_override is None: - to_override = {} - if obj is None: - return None - fields = set(to_copy) | set(to_override.keys()) - values = {f: to_override.get(f, getattr(obj, f)) for f in fields} - if typename not in _TYPE_CACHE: - _TYPE_CACHE[typename] = collections.namedtuple(typename, - ' '.join(fields)) - return _TYPE_CACHE[typename](**values) - - def align_workers(value, op): group = get_world_group().cpu_group world_size = torch.distributed.get_world_size() @@ -195,13 +210,19 @@ def align_workers(value, op): return value_t.item() +def pad_list(list, k, v): + target_len = round_up(len(list), k) + padding = target_len - len(list) + return list + [v] * padding + + class HpuModelAdapter(): - def __init__(self, model, enforce_eager): + def __init__(self, model, block_size, enforce_eager): self.model = model self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', '0').lower() in ['1', 'true'] - + self.block_size = block_size if not htorch.utils.internal.is_lazy() and not enforce_eager: self.model = torch.compile(self.model, 
backend='hpu_backend', @@ -225,22 +246,45 @@ def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, mask = causal_mask.logical_or(len_mask) attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_( mask, -math.inf)) - #FIXME: Restore sliding window support - #if self.sliding_window is not None: attn_metadata = prefill_metadata._replace(attn_bias=attn_bias) return attn_metadata + def _set_block_mapping(self, metadata, batch_size, device, dtype): + mask = torch.arange(0, + self.block_size, + device=device, + dtype=torch.int32).unsqueeze(0) + mask = mask >= metadata.block_usage.unsqueeze(-1) + attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_( + mask, -math.inf)) + block_mapping = torch.nn.functional.one_hot( + metadata.block_mapping.to(torch.long), + num_classes=batch_size).to(dtype) + metadata = metadata._replace(block_mapping=block_mapping, + attn_bias=attn_bias) + return metadata + + def _update_metadata(self, attn_metadata, batch_size, seq_len, device, + dtype): + if attn_metadata.is_prompt: + meta = attn_metadata + attn_metadata = self._set_attn_bias(meta, batch_size, seq_len, + device, dtype) + else: + meta = attn_metadata + attn_metadata = self._set_block_mapping(meta, batch_size, device, + dtype) + return attn_metadata + def forward(self, *args, **kwargs): kwargs = kwargs.copy() selected_token_indices = kwargs.pop('selected_token_indices') if 'warmup_mode' in kwargs: kwargs.pop('warmup_mode') input_ids = kwargs['input_ids'] - kwargs['attn_metadata'] = self._set_attn_bias(kwargs['attn_metadata'], - input_ids.size(0), - input_ids.size(1), - input_ids.device, - torch.bfloat16) + kwargs['attn_metadata'] = self._update_metadata( + kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), + input_ids.device, torch.bfloat16) LoraMask.setLoraMask(kwargs.pop('lora_mask')) hidden_states = self.model(*args, **kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) @@ -536,7 +580,9 @@ def load_model(self) -> None: # RuntimeErrors. 
This needs to be debugged with HabanaMemoryProfiler() as m_wrap: self.model = _maybe_wrap_in_hpu_graph( - self.model, enforce_eager=self.enforce_eager) + self.model, + self.block_size, + enforce_eager=self.enforce_eager) msg = f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}" logger.info(msg) @@ -553,73 +599,48 @@ def _is_valid_bucket(self, bucket): return bucket[0] * bucket[1] <= self.max_num_batched_tokens def _setup_buckets(self) -> None: + align_bs = lambda x: min(self.max_num_seqs, x) max_bucket_cfg = 64 if self.lora_config and \ max_bucket_cfg > self.max_num_batched_tokens // self.block_size: max_bucket_cfg = self.max_num_batched_tokens // self.block_size - self.prompt_bs_bucket_cfg = read_bucket_settings('prompt', - 'bs', - min=1, - step=32, - max=min( - self.max_num_seqs, - max_bucket_cfg)) + blocks_step = 128 + #FIXME: The default values should be max_model_len + max_prompt_seq = 1024 + max_decode_seq = 2048 + self.prompt_bs_bucket_cfg = read_bucket_settings( + 'prompt', + 'bs', + min=1, + step=align_bs(32), + max=align_bs(max_bucket_cfg)) self.decode_bs_bucket_cfg = read_bucket_settings('decode', 'bs', - min=1, - step=128, + min=align_bs(32), + step=align_bs(32), max=self.max_num_seqs) self.prompt_seq_bucket_cfg = read_bucket_settings('prompt', 'seq', min=self.block_size, step=self.block_size, - max=1024) - self.decode_seq_bucket_cfg = read_bucket_settings('decode', - 'seq', - min=self.block_size, - step=self.block_size, - max=2048) + max=max_prompt_seq) + self.decode_block_bucket_cfg = read_bucket_settings( + 'decode', + 'block', + min=blocks_step, + step=blocks_step, + max=max(blocks_step, + self.max_num_seqs * max_decode_seq // self.block_size)) self.graphed_buckets: Set[Any] = set() msg = ("Prompt bucket config (min, step, max_warmup) " f"bs:{self.prompt_bs_bucket_cfg}, " f"seq:{self.prompt_seq_bucket_cfg}") logger.info(msg) - self.prompt_buckets, prompt_omitted_buckets = warmup_buckets( - self.prompt_bs_bucket_cfg, self.prompt_seq_bucket_cfg, - self.max_num_batched_tokens) - - if self.lora_config: - self.prompt_buckets[:] = [ - bucket for bucket in self.prompt_buckets - if self._is_valid_bucket(bucket) - ] - - msg = (f"Generated {len(self.prompt_buckets)} " - f"prompt buckets: {list(sorted(self.prompt_buckets))}") - logger.info(msg) - - msg = (f"Omitted {len(prompt_omitted_buckets)} " - "prompt buckets due to exceeded token budget " - f"(max_num_batched_tokens={self.max_num_batched_tokens})") - logger.info(msg) - - msg = f"Omitted prompt buckets: {list(sorted(prompt_omitted_buckets))}" - logger.debug(msg) msg = ("Decode bucket config (min, step, max_warmup) " f"bs:{self.decode_bs_bucket_cfg}, " - f"seq:{self.decode_seq_bucket_cfg}") - logger.info(msg) - self.decode_buckets, _ = warmup_buckets(self.decode_bs_bucket_cfg, - self.decode_seq_bucket_cfg) - if self.lora_config: - self.decode_buckets[:] = [ - bucket for bucket in self.decode_buckets - if self._is_valid_bucket(bucket) - ] - msg = (f"Generated {len(self.decode_buckets)} decode buckets: " - f"{list(sorted(self.decode_buckets))}") + f"block:{self.decode_block_bucket_cfg}") logger.info(msg) def _prepare_prompt( @@ -735,10 +756,6 @@ def _prepare_prompt( real_num_seqs = len(query_lens) assert max_query_len > 0 - context_lens_tensor = torch.tensor(context_lens, - dtype=torch.int, - device=self.device) - if multi_modal_input_list: assert self.multimodal_config, ( "Multi-modal inputs are only supported by " @@ -748,7 +765,6 @@ def _prepare_prompt( else: multi_modal_input = None - max_prompt_block_table_len = 
max(len(t) for t in prefix_block_tables) max_prompt_len = max( find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), self.block_size) @@ -814,37 +830,17 @@ def _prepare_prompt( dtype=torch.long, device=self.device) - block_tables = make_tensor_with_pad(prefix_block_tables, - max_len=max_prompt_block_table_len, - pad=0, - dtype=torch.int, - device=self.device) - - # Query length can be shorter than key (i.e., prompt) when prefill - # is chunked or prefix cached. - query_lens_tensor = torch.tensor(query_lens, - dtype=torch.long, - device=self.device) - subquery_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1, - dtype=torch.int32, - device=self.device) seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.long, device=self.device) - seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1, - dtype=torch.int32, - device=self.device) attn_metadata = self.attn_backend.make_metadata( is_prompt=True, - seq_lens=seq_lens, + block_list=None, + block_mapping=None, + block_usage=None, + attn_bias=None, seq_lens_tensor=seq_lens_tensor, - max_query_len=max_query_len, - subquery_start_loc=subquery_start_loc, - seq_start_loc=seq_start_loc, - context_lens_tensor=context_lens_tensor, - block_tables=block_tables, - use_cuda_graph=False, num_prefills=real_num_seqs, num_prefill_tokens=sum_query_len, num_decode_tokens=0, @@ -950,32 +946,50 @@ def _prepare_decode( s if s != _PAD_SLOT_ID else next(dummy_slots) for s in sl ] for sl in slot_mapping] + num_decode_tokens = sum(seq_lens) + + blocks_used = [len(bt) for bt in block_tables] + block_list = list(itertools.chain(*block_tables)) + block_mapping_nested: List[List[int]] = [ + [i] * b_u for i, b_u in enumerate(blocks_used) + ] + block_mapping: List[int] = list( + itertools.chain.from_iterable(block_mapping_nested)) + + last_block = [ + sl % self.block_size + 1 for sl in itertools.chain(*slot_mapping) + ] + block_usage = [[self.block_size] * (b_u - 1) + [lb] + for b_u, lb in zip(blocks_used, last_block)] + block_usage = list(itertools.chain(*block_usage)) + + block_bucket_size = find_bucket(len(block_list), + self.decode_block_bucket_cfg) + block_list = pad_list(block_list, block_bucket_size, _PAD_SLOT_ID) + block_mapping = pad_list(block_mapping, block_bucket_size, 0) + block_usage = pad_list(block_usage, block_bucket_size, 0) + + block_list = torch.tensor(block_list, + dtype=torch.int, + device=self.device) + block_mapping = torch.tensor(block_mapping, + dtype=torch.int, + device=self.device) + block_usage = torch.tensor(block_usage, + dtype=torch.bfloat16, + device=self.device) + slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=self.device) - seq_lens_tensor = torch.tensor(seq_lens, - dtype=torch.int, - device=self.device) - num_decode_tokens = sum(seq_lens) - max_block_table_len = max( - len(block_table) for block_table in block_tables) - block_tables = make_tensor_with_pad( - block_tables, - max_len=max_block_table_len, - pad=0, - dtype=torch.int, - device=self.device, - ) + attn_metadata = self.attn_backend.make_metadata( is_prompt=False, - seq_lens=None, - seq_lens_tensor=seq_lens_tensor, - max_query_len=None, - subquery_start_loc=None, - seq_start_loc=None, - context_lens_tensor=None, - block_tables=block_tables, - use_cuda_graph=False, + block_list=block_list, + block_mapping=block_mapping, + block_usage=block_usage, + attn_bias=None, + seq_lens_tensor=None, num_prefills=0, num_prefill_tokens=0, num_decode_tokens=num_decode_tokens, @@ -1163,7 +1177,7 @@ def _seq_len(self, attn_metadata): if attn_metadata.num_prefills != 0: 
return attn_metadata.slot_mapping.size(1) else: - return attn_metadata.block_tables.size(1) * self.block_size + return attn_metadata.block_list.numel() def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: # NOTE(kzawora): To anyone working on this in the future: @@ -1187,8 +1201,8 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: # input_hash(123) != input_hash(321) # input_hash("abc") != input_hash("cba") attention_metadata = subtuple(metadata, 'TrimmedAttentionMetadata', [ - 'block_tables', 'seq_lens_tensor', 'attn_bias', 'slot_mapping', - 'is_prompt' + 'attn_bias', 'seq_lens_tensor', 'block_list', 'block_mapping', + 'block_usage', 'slot_mapping', 'is_prompt' ]) return attention_metadata @@ -1222,9 +1236,8 @@ def profile_run(self) -> None: num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers max_batch_size = self.prompt_bs_bucket_cfg[-1] - max_seq_len = self.prompt_seq_bucket_cfg[-1] - if self.lora_config: - max_seq_len = self.max_num_batched_tokens // max_batch_size + max_seq_len = min(self.prompt_seq_bucket_cfg[-1], + self.max_num_batched_tokens // max_batch_size) self.warmup_scenario(max_batch_size, max_seq_len, @@ -1277,21 +1290,34 @@ def warmup_scenario(self, [0] * batch_size * seq_len, ) self.set_active_loras(set(), lora_mapping) - seqs = [ - self.create_dummy_seq_group_metadata( - i, - seq_len, - is_prompt, - lora_request=dummy_lora_requests_per_seq[i] - if dummy_lora_requests_per_seq else None) - for i in range(batch_size) - ] + if is_prompt: + seqs = [ + self.create_dummy_seq_group_metadata( + i, + seq_len, + is_prompt, + lora_request=dummy_lora_requests_per_seq[i] + if dummy_lora_requests_per_seq else None) + for i in range(batch_size) + ] + else: + # FIXME: seq_len is actually number of blocks + blocks = [seq_len // batch_size for _ in range(batch_size)] + blocks[0] += seq_len % batch_size + seqs = [ + self.create_dummy_seq_group_metadata( + i, + b * self.block_size - 1, + is_prompt, + lora_request=dummy_lora_requests_per_seq[i] + if dummy_lora_requests_per_seq else None) + for i, b in enumerate(blocks) + ] torch.hpu.synchronize() for _ in range(times): inputs = self.prepare_model_input(seqs) - self.execute_model(inputs, kv_caches, warmup_mode=True) + self.execute_model(inputs, kv_caches, warmup_mode=False) torch.hpu.synchronize() - self.profiler.end() gc.collect() def remove_all_loras(self): @@ -1328,9 +1354,12 @@ def list_loras(self) -> Set[int]: def log_warmup(self, phase, i, max_i, batch_size, seq_len): free_mem = format_bytes( HabanaMemoryProfiler.current_free_device_memory()) + dim = "num_blocks" + if phase == "Prompt": + dim = "seq_len" msg = (f"[Warmup][{phase}][{i+1}/{max_i}] " f"batch_size:{batch_size} " - f"seq_len:{seq_len} " + f"{dim}:{seq_len} " f"free_mem:{free_mem}") logger.info(msg) @@ -1390,6 +1419,8 @@ def log_graph_warmup_summary(self, buckets, is_prompt, total_mem): phase = f'Graph/{"Prompt" if is_prompt else "Decode"}' graphed = list(c[:2] for c in self.graphed_buckets if c[2] == is_prompt) + if num_candidates == 0: + num_candidates = 1 msg = (f'{phase} captured:{len(graphed)} ' f'({100 * len(graphed) / num_candidates:.1f}%) ' f'used_mem:{format_bytes(total_mem)} ' @@ -1402,6 +1433,42 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: logger.info("Skipping warmup...") return self.profiler.start('internal', 'warmup') + max_blocks = kv_caches[0][0].size(0) + + self.prompt_buckets, prompt_omitted_buckets = generate_prompt_buckets( + self.prompt_bs_bucket_cfg, 
self.prompt_seq_bucket_cfg, + self.max_num_batched_tokens) + if self.lora_config: + self.prompt_buckets[:] = [ + bucket for bucket in self.prompt_buckets + if self._is_valid_bucket(bucket) + ] + + msg = ( + f"Generated {len(self.prompt_buckets)} " + f"prompt buckets [bs, seq]: {list(sorted(self.prompt_buckets))}") + logger.info(msg) + + msg = (f"Omitted {len(prompt_omitted_buckets)} " + "prompt buckets due to exceeded token budget " + f"(max_num_batched_tokens={self.max_num_batched_tokens})") + logger.info(msg) + + msg = f"Omitted prompt buckets: {list(sorted(prompt_omitted_buckets))}" + logger.debug(msg) + + self.decode_buckets = generate_decode_buckets( + self.decode_bs_bucket_cfg, self.decode_block_bucket_cfg, + max_blocks) + if self.lora_config: + self.decode_buckets[:] = [ + bucket for bucket in self.decode_buckets + if self._is_valid_bucket(bucket) + ] + logger.info("Generated %d decode buckets [bs, total_blocks]: %s", + len(self.decode_buckets), + list(sorted(self.decode_buckets))) + start_mem = HabanaMemoryProfiler.current_device_memory_usage() start_time = time.perf_counter() From e2c8b5ae2efd0e10aa3273ede60e263796e0a615 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 10 Sep 2024 15:13:43 +0300 Subject: [PATCH 175/341] format.sh --- vllm/engine/arg_utils.py | 2 +- vllm/lora/layers.py | 6 +++--- vllm/lora/models.py | 21 +++++++++++---------- vllm/model_executor/model_loader/loader.py | 3 ++- vllm/platforms/__init__.py | 2 +- vllm/worker/habana_model_runner.py | 19 ++++++++++--------- 6 files changed, 28 insertions(+), 25 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 36a9c919e8e0e..1a997b01a43c6 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -988,7 +988,7 @@ def create_engine_config(self) -> EngineConfig: self.model_loader_extra_config = {} self.model_loader_extra_config[ "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path - + load_device = device_config.device if self.weights_load_device is None else \ self.weights_load_device load_config = self.create_load_config(load_device) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index d2ef97e50fcbd..13a6813346f8b 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -349,9 +349,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: else: # Embedding layer only need expand op self.punica_wrapper.add_expand(full_output, - full_lora_a_embeddings, - self.lora_b_stacked, - add_input=True) + full_lora_a_embeddings, + self.lora_b_stacked, + add_input=True) return full_output.view_as(full_output_org) @classmethod diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 30f29ae6963e9..f21b45657c993 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -431,18 +431,19 @@ def __init__( dtype=torch.long, device=get_device()) self.sampler_indices = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device=get_device()) - self.sampler_indices_padded = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device=get_device()) + dtype=torch.long, + device=get_device()) + self.sampler_indices_padded = torch.empty( + self.max_num_batched_tokens, + dtype=torch.long, + device=get_device()) self.embeddings_indices = torch.empty(2, - self.max_num_batched_tokens, - dtype=torch.long, - device=get_device()) + self.max_num_batched_tokens, + dtype=torch.long, + device=get_device()) self.long_lora_indices = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device=get_device()) + dtype=torch.long, + device=get_device()) 
else: self.punica_wrapper = PunicaWrapper(max_num_batched_tokens, max_batches=self.max_num_seqs, diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 37d392872b0e3..0cb373441f869 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -355,7 +355,8 @@ def load_model(self, *, model_config: ModelConfig, with set_default_torch_dtype(model_config.dtype): with torch.device(self.load_config.device): model = _initialize_model(model_config, self.load_config, - lora_config, cache_config, scheduler_config) + lora_config, cache_config, + scheduler_config) logger.info("Loading weights on %s ...", self.load_config.device) model.load_weights( self._get_weights_iterator(model_config.model, diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 458fc85237aa2..983399af550a9 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -47,7 +47,7 @@ from importlib import util is_hpu = util.find_spec('habana_frameworks') is not None except Exception: - pass + pass if is_tpu: # people might install pytorch built with cuda but run on tpu diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 448dd9d876690..6f4d449880ef5 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -527,14 +527,13 @@ def load_model(self) -> None: htcore.hpu_set_env() with HabanaMemoryProfiler() as m: with HabanaMemoryProfiler() as m_getmodel: - self.model = get_model( - model_config=self.model_config, - device_config=self.device_config, - load_config=self.load_config, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - cache_config=self.cache_config) + self.model = get_model(model_config=self.model_config, + device_config=self.device_config, + load_config=self.load_config, + lora_config=self.lora_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + cache_config=self.cache_config) msg = ("Pre-loading model weights on " f"{next(self.model.parameters()).device} " f"took {m_getmodel.get_summary_string()}") @@ -1224,7 +1223,7 @@ def create_dummy_seq_group_metadata(self, block_tables = {group_id: [_PAD_BLOCK_ID] * num_blocks} prompt_token_ids = [0] * input_len output_token_ids = [1] * output_len - prompt_token_ids_array = array('l', [1,3,5,7,9]) # noqa: F821 + prompt_token_ids_array = array('l', [1, 3, 5, 7, 9]) # noqa: F821 seq_data = SequenceData(prompt_token_ids_array) seq_data.output_token_ids = output_token_ids return SequenceGroupMetadata(request_id=str(group_id), @@ -1738,6 +1737,7 @@ def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode): seen = cfg in self.seen_configs self.seen_configs.add(cfg) if not seen and not warmup_mode: + import pdb; pdb.set_trace() phase = 'prompt' if is_prompt else 'decode' logger.warning("Configuration: (%s, %s, %s) was not warmed-up!", phase, batch_size, seq_len) @@ -1776,6 +1776,7 @@ def execute_model( batch_size = input_tokens.size(0) seq_len = self._seq_len(attn_metadata) use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) + import pdb; pdb.set_trace() self._check_config(batch_size, seq_len, is_prompt, warmup_mode) execute_model_kwargs = { "input_ids": input_tokens, From 41941953da5d747627bb42f8bf5541b984153c1d Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 10 Sep 2024 15:56:50 +0300 Subject: [PATCH 176/341] i did not drink my afternoon coffee and made an 
oopsie --- vllm/worker/habana_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 6f4d449880ef5..d8f87b8845821 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1223,7 +1223,7 @@ def create_dummy_seq_group_metadata(self, block_tables = {group_id: [_PAD_BLOCK_ID] * num_blocks} prompt_token_ids = [0] * input_len output_token_ids = [1] * output_len - prompt_token_ids_array = array('l', [1, 3, 5, 7, 9]) # noqa: F821 + prompt_token_ids_array = array('l', prompt_token_ids) # noqa: F821 seq_data = SequenceData(prompt_token_ids_array) seq_data.output_token_ids = output_token_ids return SequenceGroupMetadata(request_id=str(group_id), From 4052bdb728ba3bbddca82af1a71574c8db706179 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 10 Sep 2024 15:04:34 +0200 Subject: [PATCH 177/341] Add disable_tensor_cache=True to HPUGraph capture (#252) RuntimeErrors are not observed anymore on habana_main when disable_tensor_cache is used. This PR enables disable_tensor_cache. --- vllm/worker/habana_model_runner.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index a6bd5e5f68745..dfc2ee152076f 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -576,8 +576,6 @@ def load_model(self) -> None: htcore.mark_step() torch.hpu.synchronize() - # FIXME: Running with disable_tensor_cache=True causes - # RuntimeErrors. This needs to be debugged with HabanaMemoryProfiler() as m_wrap: self.model = _maybe_wrap_in_hpu_graph( self.model, @@ -1576,10 +1574,9 @@ def mem_margin(self, value): def _maybe_wrap_in_hpu_graph(*args, **kwargs): - return htorch.hpu.wrap_in_hpu_graph(HpuModelAdapter( - *args, ** - kwargs)) if htorch.utils.internal.is_lazy() else HpuModelAdapter( - *args, **kwargs) + return htorch.hpu.wrap_in_hpu_graph( + HpuModelAdapter(*args, **kwargs), disable_tensor_cache=True + ) if htorch.utils.internal.is_lazy() else HpuModelAdapter(*args, **kwargs) class HabanaProfilerCounterHelper(): From c9bf9081ee51c793e16bfcda17f5ef68db369b68 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 10 Sep 2024 16:41:25 +0300 Subject: [PATCH 178/341] do not build core ext on hpu --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ec5bc24a1b834..c0f5ec984a408 100644 --- a/setup.py +++ b/setup.py @@ -301,7 +301,8 @@ def _build_custom_ops() -> bool: def _build_core_ext() -> bool: - return not (_is_neuron() or _is_tpu() or _is_openvino() or _is_xpu()) + return not (_is_neuron() or _is_tpu() or _is_openvino() or _is_xpu() + or _is_hpu()) def get_hipcc_rocm_version(): From 69df1e7e3f6b580945ce0d0cab88233829dae205 Mon Sep 17 00:00:00 2001 From: Michal Adamczyk Date: Tue, 10 Sep 2024 15:43:20 +0200 Subject: [PATCH 179/341] Fix dispersed slots (#261) On habana_main the slots are calculated by adding an offset to the block which breaks the check for _PAD_SLOT_ID. Reworked it so that in case of _PAD_BLOCK_ID we're automatically inserting the right value. 
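The fix for #261 above can be illustrated with a short sketch: instead of computing every slot and then checking the result against `_PAD_SLOT_ID`, padded blocks are detected up front and mapped to a cycling dummy slot. The constants and the `itertools.cycle` pattern mirror the diff below; the standalone helper function is a simplified illustration, not the actual code path in `habana_model_runner.py`.

```python
# Simplified illustration of the dispersed-slot handling; the constants and
# dummy-slot cycle mirror habana_model_runner.py, the helper is hypothetical.
import itertools

_PAD_SLOT_ID = 0
_PAD_BLOCK_ID = 0
block_size = 128  # example value

# Reserve a rotating range of dummy slots for padded blocks.
dummy_slots = itertools.cycle(range(_PAD_SLOT_ID, _PAD_SLOT_ID + block_size))


def slot_for(block_number: int, position: int) -> int:
    if block_number == _PAD_BLOCK_ID:
        # Padded block: hand out a dummy slot directly instead of computing
        # an offset-based slot and checking it against _PAD_SLOT_ID afterwards.
        return next(dummy_slots)
    return block_number * block_size + position % block_size
```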
--- vllm/worker/habana_model_runner.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index dfc2ee152076f..8d6c386a9975e 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -887,6 +887,9 @@ def _prepare_decode( self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) + dummy_slots = itertools.cycle( + range(_PAD_SLOT_ID, _PAD_SLOT_ID + self.block_size)) + for seq_group_metadata in seq_group_metadata_list: assert not seq_group_metadata.is_prompt assert seq_group_metadata.token_chunk_size == 1 @@ -916,8 +919,11 @@ def _prepare_decode( block_table = seq_group_metadata.block_tables[seq_id] block_number = block_table[position // self.block_size] - block_offset = position % self.block_size - slot = block_number * self.block_size + block_offset + if block_number == _PAD_BLOCK_ID: + slot = next(dummy_slots) + else: + block_offset = position % self.block_size + slot = block_number * self.block_size + block_offset slot_mapping.append([slot]) lora_index_mapping.append(lora_id) lora_prompt_mapping.append(lora_id) @@ -938,12 +944,6 @@ def _prepare_decode( dtype=torch.long, device=self.device) - dummy_slots = itertools.cycle( - range(_PAD_SLOT_ID, _PAD_SLOT_ID + self.block_size)) - slot_mapping = [[ - s if s != _PAD_SLOT_ID else next(dummy_slots) for s in sl - ] for sl in slot_mapping] - num_decode_tokens = sum(seq_lens) blocks_used = [len(bt) for bt in block_tables] From 53f96b784980b60ca12418b39c4785210931fb09 Mon Sep 17 00:00:00 2001 From: Jan Kaniecki Date: Tue, 10 Sep 2024 15:53:11 +0200 Subject: [PATCH 180/341] Skip compilation warnings during warmup phase (#262) --- vllm/worker/habana_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 8d6c386a9975e..b6218f3cc4cfb 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1314,7 +1314,7 @@ def warmup_scenario(self, torch.hpu.synchronize() for _ in range(times): inputs = self.prepare_model_input(seqs) - self.execute_model(inputs, kv_caches, warmup_mode=False) + self.execute_model(inputs, kv_caches, warmup_mode=True) torch.hpu.synchronize() gc.collect() From d436d387e1641c146974573332621dbed9266e8b Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 10 Sep 2024 17:21:23 +0300 Subject: [PATCH 181/341] fix tensor parallelism --- vllm/executor/ray_habana_executor.py | 12 ++++++------ vllm/worker/habana_model_runner.py | 24 +++++++++++++----------- vllm/worker/habana_worker.py | 20 ++++++++++++-------- 3 files changed, 31 insertions(+), 25 deletions(-) diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index 17e3414a96b57..d69a85a816636 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -9,11 +9,11 @@ DistributedGPUExecutor, DistributedGPUExecutorAsync) from vllm.executor.ray_utils import RayWorkerWrapper, ray from vllm.logger import init_logger -from vllm.sequence import ExecuteModelRequest, SamplerOutput -from vllm.utils import (_run_task_with_lock, - error_on_invalid_device_count_status, - get_distributed_init_method, get_ip, get_open_port, - get_vllm_instance_id, make_async) +from vllm.sequence import ExecuteModelRequest +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.utils import (_run_task_with_lock, get_distributed_init_method, + get_ip, 
get_open_port, get_vllm_instance_id, + make_async) if ray is not None: from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy @@ -188,7 +188,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", distributed_init_method = get_distributed_init_method( driver_ip, get_open_port()) - error_on_invalid_device_count_status() + # error_on_invalid_device_count_status() # Initialize the actual workers inside worker wrapper. init_worker_all_kwargs = [ diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index d8f87b8845821..c7315eb804283 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -23,16 +23,18 @@ from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ParallelConfig, - SchedulerConfig) + ModelConfig, MultiModalConfig, ObservabilityConfig, + ParallelConfig, PromptAdapterConfig, SchedulerConfig) from vllm.distributed.parallel_state import get_world_group from vllm.hpu.ops import LoraMask as LoraMask +from vllm.inputs.registry import InputRegistry from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata from vllm.model_executor.model_loader import get_model +from vllm.multimodal.registry import MultiModalRegistry from vllm.sampling_params import SamplingParams from vllm.sequence import (IntermediateTensors, SequenceData, SequenceGroupMetadata) @@ -468,20 +470,26 @@ def __init__( parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, device_config: DeviceConfig, - load_config: LoadConfig, cache_config: CacheConfig, + load_config: LoadConfig, lora_config: Optional[LoRAConfig], kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, - multimodal_config: Optional[MultiModalConfig] = None, + prompt_adapter_config: Optional[PromptAdapterConfig] = None, + return_hidden_states: bool = False, + observability_config: Optional[ObservabilityConfig] = None, ): self.model_config = model_config self.parallel_config = parallel_config self.scheduler_config = scheduler_config + self.device_config = device_config + self.cache_config = cache_config self.lora_config = lora_config self.load_config = load_config - self.cache_config = cache_config self.is_driver_worker = is_driver_worker + self.prompt_adapter_config = prompt_adapter_config + self.return_hidden_states = return_hidden_states + self.observability_config = observability_config self.profiler = Profiler() self.sliding_window = (model_config.get_sliding_window() @@ -499,7 +507,6 @@ def __init__( self.pin_memory = is_pin_memory_available() self.kv_cache_dtype = kv_cache_dtype - self.multimodal_config = multimodal_config self.attn_backend = get_attn_backend( self.model_config.get_num_attention_heads(self.parallel_config), @@ -757,9 +764,6 @@ def _prepare_prompt( assert max_query_len > 0 if multi_modal_input_list: - assert self.multimodal_config, ( - "Multi-modal inputs are only supported by " - "vision language models.") multi_modal_input = torch.cat(multi_modal_input_list, dim=0).to(self.device) else: @@ -1737,7 +1741,6 @@ def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode): seen = cfg in self.seen_configs self.seen_configs.add(cfg) if not seen and not warmup_mode: - import pdb; pdb.set_trace() phase = 'prompt' if is_prompt 
else 'decode' logger.warning("Configuration: (%s, %s, %s) was not warmed-up!", phase, batch_size, seq_len) @@ -1776,7 +1779,6 @@ def execute_model( batch_size = input_tokens.size(0) seq_len = self._seq_len(attn_metadata) use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) - import pdb; pdb.set_trace() self._check_config(batch_size, seq_len, is_prompt, warmup_mode) execute_model_kwargs = { "input_ids": input_tokens, diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 9d083915041fe..407c618a9d597 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -4,15 +4,15 @@ import gc import os -from typing import List, Optional, Set, Tuple +from typing import List, Optional, Set, Tuple, Type import habana_frameworks.torch as htorch # noqa:F401 import torch import torch.distributed from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, + ModelConfig, MultiModalConfig, ObservabilityConfig, + ParallelConfig, PromptAdapterConfig, SchedulerConfig, SpeculativeConfig) from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) @@ -24,6 +24,7 @@ from vllm.utils import HabanaMemoryProfiler, format_bytes from vllm.worker.cache_engine import CacheEngine from vllm.worker.habana_model_runner import HabanaModelRunner +from vllm.worker.model_runner_base import ModelRunnerBase from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerInput logger = init_logger(__name__) @@ -49,13 +50,15 @@ def __init__( rank: int, distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, - multimodal_config: Optional[MultiModalConfig] = None, speculative_config: Optional[SpeculativeConfig] = None, prompt_adapter_config: Optional[PromptAdapterConfig] = None, is_driver_worker: bool = False, + model_runner_cls: Optional[Type[ModelRunnerBase]] = None, + observability_config: Optional[ObservabilityConfig] = None, ) -> None: self.model_config = model_config self.parallel_config = parallel_config + self.parallel_config.rank = rank self.scheduler_config = scheduler_config self.device_config = device_config self.cache_config = cache_config @@ -64,6 +67,7 @@ def __init__( self.distributed_init_method = distributed_init_method self.lora_config = lora_config self.load_config = load_config + self.prompt_adapter_config = prompt_adapter_config self.is_driver_worker = is_driver_worker if self.is_driver_worker: assert self.rank == 0, "The driver worker must have rank 0." @@ -72,19 +76,19 @@ def __init__( # note: lazy import to avoid importing torch before initializing from vllm.utils import init_cached_hf_modules init_cached_hf_modules() - self.multimodal_config = multimodal_config self.model_runner: HabanaModelRunner = HabanaModelRunner( model_config, parallel_config, scheduler_config, device_config, - cache_config=cache_config, + cache_config, load_config=load_config, lora_config=self.lora_config, kv_cache_dtype=self.cache_config.cache_dtype, - multimodal_config=self.multimodal_config, - is_driver_worker=is_driver_worker) + is_driver_worker=is_driver_worker, + prompt_adapter_config=prompt_adapter_config, + observability_config=observability_config) # Uninitialized cache engine. Will be initialized by # initialize_cache. 
self.cache_engine: List[CacheEngine] From 61b6fbb15d7871ee95dbb6bdea6021458b8550f9 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 10 Sep 2024 17:25:38 +0300 Subject: [PATCH 182/341] add missing functions --- vllm/lora/layers.py | 4 ++-- vllm/lora/models.py | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 13a6813346f8b..9e4a0098dc44e 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -93,7 +93,7 @@ def _apply_lora( x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) indices = indices.view(-1) - if is_hpu(): + if current_platform.is_hpu(): dispatch_bgmv_linear(output, x, lora_a_stacked, lora_b_stacked, indices, 0, 1.0) else: @@ -314,7 +314,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # NOTE(vgoel): These asserts can be skipped when upstreaming. # Can be removed from vllm-fork also once lora functionality # on Gaudi stabilizes. - if is_hpu(): + if current_platform.is_hpu(): emb_len = embedding_len x_shape = x.shape ind_shape = self.embeddings_indices[1].shape diff --git a/vllm/lora/models.py b/vllm/lora/models.py index f21b45657c993..e3abf0fc96196 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -4,7 +4,7 @@ import os import re from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Optional, Tuple, Type +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union import safetensors.torch import torch @@ -94,9 +94,10 @@ def convert_mapping( embedding_indices = index_mapping_indices.copy() lora_indices = index_mapping_indices.copy() long_lora_offsets: Optional[torch.Tensor] = None + device = "hpu" if current_platform.is_hpu() else "cuda" if long_lora_context: long_lora_offsets = torch.zeros(len(index_mapping_indices), - device=get_device(), + device=device, dtype=torch.long) prompt_mapping: List[int] = [ lora_index_to_id.index(x) if x > 0 else -1 @@ -121,9 +122,9 @@ def convert_mapping( if long_lora_context: assert long_lora_offsets is not None indices_list.append(long_lora_offsets) - indices = torch.tensor(indices_list, dtype=torch.long, device=get_device()) + indices = torch.tensor(indices_list, dtype=torch.long, device=device) prompt_mapping_tensor = torch.tensor(prompt_mapping, - device=get_device(), + device=device, dtype=torch.long) embeddings_indices = torch.stack([ indices[2] * extra_vocab_size, @@ -134,10 +135,10 @@ def convert_mapping( sampler_indices = prompt_mapping_tensor sampler_indices_padded = sampler_indices.clone() sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 - sampler_indices_padded = (torch.arange( - 0, len(sampler_indices_padded), device=get_device(), dtype=torch.long) - + (sampler_indices_padded * - len(sampler_indices_padded))) + sampler_indices_padded = ( + torch.arange( + 0, len(sampler_indices_padded), device=device, dtype=torch.long) + + (sampler_indices_padded * len(sampler_indices_padded))) long_lora_indices = None long_lora_indices_len: Optional[int] = None if long_lora_context: From 2091161b4a2e3acaa531d1a1a3c0cba65bb50b21 Mon Sep 17 00:00:00 2001 From: Agata Dobrzyniewicz <160237065+adobrzyniewicz-habana@users.noreply.github.com> Date: Wed, 11 Sep 2024 10:15:09 +0200 Subject: [PATCH 183/341] Port PT Profiler to habana_main (#256) Porting PT Profiler from: https://github.com/HabanaAI/vllm-fork/commit/81a23a708195faef6167919890cefa225a721907 and https://github.com/HabanaAI/vllm-fork/commit/e805b885d32a749d9409f13b6446895d13e8b885 --- 
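The diff below adds a `setup_profiler()` helper and a `VLLM_PT_PROFILE` hook that runs a single warmup scenario under the PyTorch profiler and then aborts. A hypothetical invocation is sketched here; the trigger string follows the `phase_batchsize_seqlen_graphflag` parsing introduced below, but the concrete values and model name are placeholders.

```python
# Hypothetical profiling run; the VLLM_PT_PROFILE format follows the parsing
# added in this patch, while the values and model are placeholders.
import os

# Profile one decode scenario: batch_size=32, 128 blocks, HPU graphs enabled.
os.environ["VLLM_PT_PROFILE"] = "decode_32_128_t"

from vllm import LLM

try:
    LLM(model="meta-llama/Llama-2-7b-hf")
except AssertionError:
    # The hook intentionally raises AssertionError("Finished profiling") once
    # the traced scenario completes and the trace has been written.
    pass
```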
vllm/worker/habana_model_runner.py | 46 ++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index b6218f3cc4cfb..2360e39fcba28 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -210,6 +210,26 @@ def align_workers(value, op): return value_t.item() +def setup_profiler(): + schedule = torch.profiler.schedule(wait=0, warmup=2, active=1, repeat=1) + DEVICE = 'hpu' + activities = [torch.profiler.ProfilerActivity.CPU] + activities.extend([torch.profiler.ProfilerActivity.HPU] if DEVICE == + 'hpu' else []) + #from habana_frameworks.torch.activity_profiler import DebugActivity + #debug_activities=[DebugActivity.BRIDGE_FUNCTION_CALLS] + + profiler = torch.profiler.profile( + schedule=schedule, + activities=activities, + #debug_activities=debug_activities, + on_trace_ready=torch.profiler.tensorboard_trace_handler('.', + use_gzip=True), + record_shapes=False, + with_stack=True) + return profiler + + def pad_list(list, k, v): target_len = round_up(len(list), k) padding = target_len - len(list) @@ -1237,11 +1257,7 @@ def profile_run(self) -> None: max_seq_len = min(self.prompt_seq_bucket_cfg[-1], self.max_num_batched_tokens // max_batch_size) - self.warmup_scenario(max_batch_size, - max_seq_len, - True, - kv_caches, - is_profile_run=True) + self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches) return def warmup_scenario(self, @@ -1281,7 +1297,7 @@ def warmup_scenario(self, for idx in range(max_num_seqs) ] self.profiler.start('internal', scenario_name) - times = 3 if use_graphs else 1 + times = 3 if use_graphs or is_profile_run else 1 if self.lora_config and not is_profile_run: lora_mapping = LoRAMapping( [0] * batch_size * seq_len, @@ -1312,10 +1328,19 @@ def warmup_scenario(self, for i, b in enumerate(blocks) ] torch.hpu.synchronize() + profiler = None + if is_profile_run and self.is_driver_worker: + profiler = setup_profiler() + profiler.start() for _ in range(times): inputs = self.prepare_model_input(seqs) self.execute_model(inputs, kv_caches, warmup_mode=True) torch.hpu.synchronize() + if profiler: + profiler.step() + if profiler: + profiler.stop() + self.profiler.end() gc.collect() def remove_all_loras(self): @@ -1427,6 +1452,15 @@ def log_graph_warmup_summary(self, buckets, is_prompt, total_mem): @torch.inference_mode() def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: + if profile := os.environ.get('VLLM_PT_PROFILE', None): + phase, bs, seq_len, graph = profile.split('_') + is_prompt = phase == 'prompt' + graphs = graph == 't' + if graphs: + self.graphed_buckets.add((int(bs), int(seq_len), is_prompt)) + self.warmup_scenario(int(bs), int(seq_len), is_prompt, kv_caches, + True) + raise AssertionError("Finished profiling") if os.environ.get('VLLM_SKIP_WARMUP', 'false').lower() == 'true': logger.info("Skipping warmup...") return From 68e0f57b83995f8eae67986a8f12b77e24bb2789 Mon Sep 17 00:00:00 2001 From: Krzysztof Wisniewski Date: Fri, 6 Sep 2024 14:20:30 +0300 Subject: [PATCH 184/341] Reduce frequency of garbage collector --- vllm/worker/habana_model_runner.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 2360e39fcba28..fdf1e9c444406 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -539,6 +539,21 @@ def __init__( self._mem_margin: Optional[int] = None self._setup_buckets() + + # Read 
https://docs.python.org/3/library/gc.html#gc.set_threshold + # for comprehensive description of gc generations. + # We can either use VLLM_GC_THR_GEN[0-2] (this has higher priority) + # to set particular generation threshold or use simpler + # VLLM_GC_THR_MULTIPLIER to multiply default values. + default_gc_thrs = list(gc.get_threshold()) + requested_gc_thrs = [None] * len(default_gc_thrs) + for i in range(len(default_gc_thrs)): + requested_gc_thrs[i] = int(os.environ.get(f'VLLM_GC_THR_GEN{i}', default_gc_thrs[i])) + if requested_gc_thrs == default_gc_thrs: + gc_thr_multiplier = int(os.environ.get('VLLM_GC_THR_MULTIPLIER', 2)) + requested_gc_thrs = [t * gc_thr_multiplier for t in default_gc_thrs] + gc.set_threshold(*requested_gc_thrs) + def load_model(self) -> None: import habana_frameworks.torch.core as htcore if self.model_config.quantization == 'inc': From b776d5e8fa287018e7e373e6588f2d15176e0d72 Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Wed, 11 Sep 2024 12:49:20 +0300 Subject: [PATCH 185/341] Fix LoRA test by handling mask creation inside the test --- tests/lora/test_lora_hpu.py | 93 +++++++++++++++++++++++++------------ 1 file changed, 64 insertions(+), 29 deletions(-) diff --git a/tests/lora/test_lora_hpu.py b/tests/lora/test_lora_hpu.py index ddbab66e166b3..01b6472745e1c 100644 --- a/tests/lora/test_lora_hpu.py +++ b/tests/lora/test_lora_hpu.py @@ -1,6 +1,7 @@ import pytest import torch +from vllm.hpu.ops import LoraMask from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice from .utils import DummyLoRAManager @@ -19,7 +20,19 @@ torch.float16: (5e-3, 5e-3), torch.bfloat16: (3e-2, 2e-2), } -MAX_LORAS = 8 + + +def createLoraMask(indices, batch_size, seq_len, max_loras, max_lora_rank, + lora_dtype): + indices = indices.view(-1, 1) + mask = torch.arange(max_loras * max_lora_rank, device=indices.device) + mask = mask.view(1, -1) + mask = ((mask >= ((indices) * max_lora_rank)) * + (mask < ((indices + 1) * max_lora_rank))).to(dtype=lora_dtype) + mask = mask.view(batch_size, 1, + -1).expand(batch_size, seq_len, + -1).reshape(batch_size * seq_len, -1) + return mask @pytest.mark.parametrize("m", TENSOR_SIZES) @@ -39,32 +52,40 @@ def test_apply_lora(m, n, k, rank, dtype) -> None: input = torch.rand(k, n, device="hpu", dtype=dtype) expected = input @ lora.lora_a @ lora.lora_b * lora.scaling - lora_a_stack = torch.zeros(MAX_LORAS + 1, + lora_a_stack = torch.zeros(8, 1, lora.lora_a.shape[1], lora.lora_a.shape[0], device="hpu", dtype=dtype) - lora_b_stack = torch.zeros(MAX_LORAS + 1, + lora_b_stack = torch.zeros(8, 1, lora.lora_b.shape[1], lora.lora_b.shape[0], device="hpu", dtype=dtype) - for i in range(MAX_LORAS): + for i in range(lora_a_stack.shape[0]): lora_a_stack[i][0] = lora.lora_a.T lora_b_stack[i][0] = (lora.lora_b * lora.scaling).T output = torch.zeros(k, m, device="hpu", dtype=dtype) - _apply_lora(input, lora_a_stack, lora_b_stack, - torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), - output) + indices = torch.randint(0, + lora_a_stack.shape[0], (len(input), ), + device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + _apply_lora(input, lora_a_stack, lora_b_stack, indices, output) + rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) output[:] = 0 - _apply_lora(input, lora_a_stack, lora_b_stack, - torch.full((len(input), ), -1, device="hpu"), output) + indices = torch.full((len(input), ), -1, device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + 
LoraMask.setLoraMask(mask) + + _apply_lora(input, lora_a_stack, lora_b_stack, indices, output) assert torch.allclose(torch.zeros_like(output), output) manager.reset_lora() @@ -99,7 +120,7 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: dim=1) lora_a_stacks = [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_1.lora_a.shape[1], lora_1.lora_a.shape[0], @@ -107,31 +128,38 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: dtype=dtype) for i in range(2) ] lora_b_stacks = [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_1.lora_b.shape[1], lora_1.lora_b.shape[0], device="hpu", dtype=dtype) for i in range(2) ] - for i in range(MAX_LORAS): + for i in range(lora_a_stacks[0].shape[0]): lora_a_stacks[0][i][0] = lora_1.lora_a.T lora_b_stacks[0][i][0] = (lora_1.lora_b * lora_1.scaling).T lora_a_stacks[1][i][0] = lora_2.lora_a.T lora_b_stacks[1][i][0] = (lora_2.lora_b * lora_2.scaling).T output = torch.zeros(k, m, device="hpu", dtype=dtype) - _apply_lora_packed_nslice( - input, lora_a_stacks, lora_b_stacks, - torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), output, - (m // 2, m // 2)) + indices = torch.randint(0, + lora_a_stacks[0].shape[0], (len(input), ), + device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, indices, + output, (m // 2, m // 2)) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) output[:] = 0 - _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, - torch.full((len(input), ), -1, device="hpu"), + indices = torch.full((len(input), ), -1, device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, indices, output, (m // 2, m // 2)) assert torch.allclose(torch.zeros_like(output), output) @@ -166,14 +194,14 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: dim=1) lora_a_stacks = [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_q.lora_a.shape[1], lora_q.lora_a.shape[0], device="hpu", dtype=dtype) ] + [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_k.lora_a.shape[1], lora_k.lora_a.shape[0], @@ -181,21 +209,21 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: dtype=dtype) for i in range(2) ] lora_b_stacks = [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_q.lora_b.shape[1], lora_q.lora_b.shape[0], device="hpu", dtype=dtype) ] + [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_k.lora_b.shape[1], lora_k.lora_b.shape[0], device="hpu", dtype=dtype) for i in range(2) ] - for i in range(MAX_LORAS): + for i in range(lora_a_stacks[0].shape[0]): lora_a_stacks[0][i][0] = lora_q.lora_a.T lora_b_stacks[0][i][0] = (lora_q.lora_b * lora_q.scaling).T lora_a_stacks[1][i][0] = lora_k.lora_a.T @@ -204,17 +232,24 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: lora_b_stacks[2][i][0] = (lora_v.lora_b * lora_v.scaling).T output = torch.zeros(k, sum(qkv), device="hpu", dtype=dtype) - _apply_lora_packed_nslice( - input, lora_a_stacks, lora_b_stacks, - torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), output, - (qkv[0], qkv[1], qkv[2])) + indices = torch.randint(0, + lora_a_stacks[0].shape[0], (len(input), ), + device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, indices, + output, 
(qkv[0], qkv[1], qkv[2])) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) output[:] = 0 - _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, - torch.full((len(input), ), -1, device="hpu"), + indices = torch.full((len(input), ), -1, device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, indices, output, (qkv[0], qkv[1], qkv[2])) assert torch.allclose(torch.zeros_like(output), output) From f858d4359657db1ea01f39e8a8b39ec68076d6a6 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar <49579433+hlahkar@users.noreply.github.com> Date: Thu, 12 Sep 2024 09:57:03 +0530 Subject: [PATCH 186/341] Attn MetaData dtype should be same as model dtype (#271) Attn MetaData was hard coded to bfloat16, leading to a runtime error for float32 model instantiation. --- vllm/worker/habana_model_runner.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 2360e39fcba28..55f205915ea8c 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -238,11 +238,12 @@ def pad_list(list, k, v): class HpuModelAdapter(): - def __init__(self, model, block_size, enforce_eager): + def __init__(self, model, block_size, dtype, enforce_eager): self.model = model self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', '0').lower() in ['1', 'true'] self.block_size = block_size + self.dtype = dtype if not htorch.utils.internal.is_lazy() and not enforce_eager: self.model = torch.compile(self.model, backend='hpu_backend', @@ -304,7 +305,7 @@ def forward(self, *args, **kwargs): input_ids = kwargs['input_ids'] kwargs['attn_metadata'] = self._update_metadata( kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), - input_ids.device, torch.bfloat16) + input_ids.device, self.dtype) LoraMask.setLoraMask(kwargs.pop('lora_mask')) hidden_states = self.model(*args, **kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) @@ -600,6 +601,7 @@ def load_model(self) -> None: self.model = _maybe_wrap_in_hpu_graph( self.model, self.block_size, + dtype=self.model_config.dtype, enforce_eager=self.enforce_eager) msg = f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}" logger.info(msg) From acf7d548ee0352c5482d0c424ddb4a0558007ef7 Mon Sep 17 00:00:00 2001 From: Dudi Lester <160421192+dudilester@users.noreply.github.com> Date: Thu, 12 Sep 2024 11:42:31 +0300 Subject: [PATCH 187/341] Support Mixtral quantization using INC (#267) --- vllm/hpu/ops.py | 88 ++++++++++++------- vllm/model_executor/layers/fused_moe/layer.py | 42 ++++++--- .../model_executor/layers/quantization/inc.py | 6 +- vllm/model_executor/model_loader/utils.py | 2 +- 4 files changed, 96 insertions(+), 42 deletions(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index b2705429906c4..3d76c36f2648b 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -86,36 +86,6 @@ def silu_and_mul(x: torch.Tensor) -> torch.Tensor: return F.silu(x[..., :d]) * x[..., d:] -def static_fused_moe(hidden_states, w1, w2, score, topk): - B, D = hidden_states.shape - num_experts = w1.shape[0] - routing_weights = F.softmax(score, dim=1, dtype=torch.float32) - routing_weights, selected_experts = torch.topk(routing_weights, - topk, - dim=-1) - routing_weights /= routing_weights.sum(dim=-1, keepdim=True) - routing_weights = routing_weights.to(hidden_states.dtype) - final_hidden_states = 
torch.zeros((1, B, D), - dtype=hidden_states.dtype, - device=hidden_states.device) - padded_weights = torch.zeros((B, num_experts), - dtype=hidden_states.dtype, - device=hidden_states.device) - padded_weights.scatter_(-1, selected_experts, routing_weights) - padded_weights = padded_weights.reshape(-1, B, w1.shape[0]) - padded_weights = padded_weights.permute(2, 0, 1).unsqueeze(-1) - - htorch.core.mark_step() - - for expert_idx in range(num_experts): - w_output = torch.matmul(hidden_states, w1[expert_idx].transpose(0, 1)) - w_output = silu_and_mul(w_output) - w_output = torch.matmul(w_output, w2[expert_idx].transpose(0, 1)) - final_hidden_states += w_output * padded_weights[expert_idx] - - return final_hidden_states.view(-1, D) - - #TODO: remove after fusedsdpa fix for query_head != kv_head def repeat_kv(kv: torch.Tensor, n_rep: int) -> torch.Tensor: """ @@ -252,3 +222,61 @@ def dispatch_bgmv_embedding( wb = wb.reshape(wb.shape[0] * wb.shape[1], wb.shape[2]) out = x @ wb y += out * scale + + +class MoeMatmul(torch.nn.Module): + + def __init__(self): + super().__init__() + + def set_weight(self, w): + self.weight = w + + def calc(self, state, expert_id, w): + self.weight = w[expert_id].transpose(0, 1) + return self.forward(state) + + def forward(self, state): + return torch.matmul(state, self.weight) + + +class StaticFusedMOE(torch.nn.Module): + + def __init__(self, num_total_experts): + super().__init__() + self.w13_list = torch.nn.ModuleList( + [MoeMatmul() for _ in range(num_total_experts)]) + self.w2_list = torch.nn.ModuleList( + [MoeMatmul() for _ in range(num_total_experts)]) + self.num_total_experts = num_total_experts + + def forward(self, hidden_states, w1, w2, score, topk): + B, D = hidden_states.shape + routing_weights = F.softmax(score, dim=1, dtype=torch.float32) + routing_weights, selected_experts = torch.topk(routing_weights, + topk, + dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + routing_weights = routing_weights.to(hidden_states.dtype) + final_hidden_states = torch.zeros((1, B, D), + dtype=hidden_states.dtype, + device=hidden_states.device) + padded_weights = torch.zeros((B, self.num_total_experts), + dtype=hidden_states.dtype, + device=hidden_states.device) + padded_weights.scatter_(-1, selected_experts, routing_weights) + padded_weights = padded_weights.reshape(-1, B, self.num_total_experts) + padded_weights = padded_weights.permute(2, 0, 1).unsqueeze(-1) + htorch.core.mark_step() + + for expert_idx in range(self.num_total_experts): + padded_weight = padded_weights[expert_idx] + current_state_static = hidden_states.reshape(-1, D) + w_output = self.w13_list[expert_idx].calc(current_state_static, + expert_idx, w1) + w_output = silu_and_mul(w_output) + w_output = self.w2_list[expert_idx].calc(w_output, expert_idx, w2) + current_hidden_states_static = w_output * padded_weight + final_hidden_states += current_hidden_states_static + + return final_hidden_states.view(-1, D) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index b49bf40d4746e..cf0d5f98f1b01 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -13,9 +13,6 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.utils import is_hpu -if is_hpu(): - from vllm.hpu.ops import static_fused_moe - logger = init_logger(__name__) @@ -78,7 +75,8 @@ def apply( ) -> torch.Tensor: return self.forward(x, layer.w13_weight, layer.w2_weight, router_logits, top_k, renormalize, - 
use_grouped_topk, num_expert_group, topk_group) + use_grouped_topk, num_expert_group, topk_group, + layer) def forward_cuda( self, @@ -91,6 +89,7 @@ def forward_cuda( use_grouped_topk: bool, num_expert_group: Optional[int], topk_group: Optional[int], + layer: Optional[torch.nn.Module], ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe.fused_moe import fused_moe return fused_moe(x, @@ -104,15 +103,25 @@ def forward_cuda( num_expert_group=num_expert_group, topk_group=topk_group) - def forward_hpu(self, x: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, - router_logits: torch.Tensor, top_k: int, renormalize: bool, - use_grouped_topk: bool, num_expert_group: Optional[int], - topk_group: Optional[int]): + def forward_hpu( + self, + x: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool, + num_expert_group: Optional[int], + topk_group: Optional[int], + layer: Optional[torch.nn.Module], + ): assert not use_grouped_topk, 'use_grouped_topk must be False on HPU' assert num_expert_group is None, ('num_expert_group is ' 'not supported on HPU') assert topk_group is None, 'topk_group is not supported on HPU' - return static_fused_moe(x, w1, w2, router_logits, top_k) + if layer is not None: + return layer.hpu_static_fused_moe(x, w1, w2, router_logits, top_k) def forward_cpu(self, *args, **kwargs): raise NotImplementedError( @@ -129,6 +138,7 @@ def forward_tpu( use_grouped_topk: bool, num_expert_group: Optional[int], topk_group: Optional[int], + layer: Optional[torch.nn.Module], ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe.moe_pallas import fused_moe assert not use_grouped_topk @@ -140,7 +150,7 @@ def forward_tpu( class FusedMoE(torch.nn.Module): """FusedMoE layer for MoE models. - This layer contains both MergedColumnParallel weights (gate_up_proj / + This layer contains both MergedColumnParallel weights (gate_up_proj / w13) and RowParallelLinear weights (down_proj/ w2). Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We @@ -191,6 +201,9 @@ def __init__( assert num_expert_group is not None and topk_group is not None self.num_expert_group = num_expert_group self.topk_group = topk_group + if is_hpu(): + from vllm.hpu.ops import StaticFusedMOE + self.hpu_static_fused_moe = StaticFusedMOE(self.num_experts) if quant_config is None: self.quant_method: Optional[QuantizeMethodBase] = ( @@ -245,13 +258,22 @@ def weight_loader(self, param: torch.nn.Parameter, if shard_id == 0: param_data[expert_id, 0:shard_size, :] = loaded_weight[shard, :] + if is_hpu(): + self.hpu_static_fused_moe.w13_list[expert_id].set_weight( + param_data[expert_id]) # w3, up_proj case: Load into second shard of w13. elif shard_id == 2: param_data[expert_id, shard_size:2 * shard_size, :] = loaded_weight[shard, :] + if is_hpu(): + self.hpu_static_fused_moe.w13_list[expert_id].set_weight( + param_data[expert_id]) # w2, down_proj case: Load into only shard of w2. 
elif shard_id == 1: param_data[expert_id, :, :] = loaded_weight[:, shard] + if is_hpu(): + self.hpu_static_fused_moe.w2_list[expert_id].set_weight( + param_data[expert_id]) else: raise ValueError( f"Shard id must be in [0,1,2] but got {shard_id}") diff --git a/vllm/model_executor/layers/quantization/inc.py b/vllm/model_executor/layers/quantization/inc.py index f6718ec2ac9e7..ec0141b61f58f 100644 --- a/vllm/model_executor/layers/quantization/inc.py +++ b/vllm/model_executor/layers/quantization/inc.py @@ -5,6 +5,8 @@ from torch.nn.parameter import Parameter from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.layer import ( + FusedMoE, UnquantizedFusedMoEMethod) from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) @@ -52,6 +54,8 @@ def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["INCLinearMethod"]: if isinstance(layer, LinearBase): return INCLinearMethod(self) + elif isinstance(layer, FusedMoE): + return UnquantizedFusedMoEMethod() return None def get_scaled_act_names(self) -> List[str]: @@ -78,7 +82,7 @@ class INCLinearMethod(LinearMethodBase): 1. Only support per-tensor quantization due to torch._scaled_mm support. 2. Only support float8_e4m3fn data type due to the limitation of torch._scaled_mm (https://github.com/pytorch/pytorch/blob/2e48b39603411a41c5025efbe52f89560b827825/aten/src/ATen/native/cuda/Blas.cpp#L854-L856) - + Args: quant_config: The quantization config. """ diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index f7e0f56c1a46e..a8b0a7b07ed8e 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -24,7 +24,7 @@ def get_model_architecture( # Special handling for quantized Mixtral. # FIXME(woosuk): This is a temporary hack. if (model_config.quantization is not None - and model_config.quantization != "fp8" + and model_config.quantization not in ["fp8", "inc"] and "MixtralForCausalLM" in architectures): architectures = ["QuantMixtralForCausalLM"] From 6a734f4d2b14040b3bbcd8cb9843fac9dfc8318b Mon Sep 17 00:00:00 2001 From: Ilia Taraban Date: Thu, 12 Sep 2024 11:51:05 +0200 Subject: [PATCH 188/341] Fixed ALiBi (#254) Fixed ALiB and [MPT-7B](https://www.databricks.com/blog/mpt-7b) model. Accuracy results comparing to CPU(collected using [EleutherAI](https://github.com/EleutherAI/lm-evaluation-harness)) | Tasks | CPU | HPU | | -------------- | ------ | ------ | | arc_challenge | 0.4224 | 0.4189 | | arc_easy | 0.6974 | 0.6999 | | hellaswag | 0.7603 | 0.7626 | | lambada_openai | 0.7306 | 0.7326 | | mmlu | 0.293 | 0.2925 | | winogrande | 0.6851 | 0.6811 | --- vllm/attention/backends/habana_attn.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 20b0f2bc7630b..56b71a431aca7 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -108,17 +108,10 @@ def __init__( self.v_cache = VLLMKVCache() self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads self.sliding_window = sliding_window - self.position_bias = None self.alibi_slopes = alibi_slopes if alibi_slopes is not None: - # FIXME(kzawora): Need a general method to set max_seq_len on - # per-model basis. 
alibi_slopes_tensor = torch.tensor(alibi_slopes, dtype=torch.bfloat16) - self.position_bias = _make_alibi_bias(alibi_slopes_tensor, - num_kv_heads, - alibi_slopes_tensor.dtype, - max_seq_len) self.alibi_slopes = alibi_slopes_tensor assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads @@ -190,11 +183,13 @@ def forward( assert attn_metadata.attn_bias is not None, \ 'attn_bias must be set before calling model.forward!' attn_bias = attn_metadata.attn_bias - if self.alibi_slopes is not None and \ - self.position_bias is not None: - attn_bias.add_(self.position_bias[:, :, - -attn_bias.size(2):, - -attn_bias.size(3):]) + if self.alibi_slopes is not None: + position_bias = _make_alibi_bias(self.alibi_slopes, + self.num_kv_heads, + attn_bias.dtype, + attn_bias.shape[-1]) + attn_bias = attn_bias.tile((1, self.num_kv_heads, 1, 1)) + attn_bias.add_(position_bias) else: attn_bias = None From 543bb6dcd0626394e671dd82c1fbb0d15f9f3341 Mon Sep 17 00:00:00 2001 From: Dominika Olszewska Date: Thu, 12 Sep 2024 12:48:09 +0200 Subject: [PATCH 189/341] Update gaudi-installation.rst (#279) Fixing ENV variables' names after flat-PA merge --- docs/source/getting_started/gaudi-installation.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index ed3beabb2c8aa..27bc0186675f1 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -329,7 +329,7 @@ Environment variables - ``VLLM_{phase}_{dim}_BUCKET_{param}`` - collection of 12 environment variables configuring ranges of bucketing mechanism - ``{phase}`` is either ``PROMPT`` or ``DECODE`` - - ``{dim}`` is either ``BS`` or ``SEQ`` + - ``{dim}`` is either ``BS``, ``SEQ`` or ``BLOCK`` - ``{param}`` is either ``MIN``, ``STEP`` or ``MAX`` - Default values: @@ -345,9 +345,9 @@ Environment variables - batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``min(max_num_seqs, 32)`` - batch size step (``VLLM_DECODE_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)`` - batch size max (``VLLM_DECODE_BS_BUCKET_MAX``): ``max_num_seqs`` - - sequence length min (``VLLM_DECODE_SEQ_BUCKET_MIN``): ``128`` - - sequence length step (``VLLM_DECODE_SEQ_BUCKET_STEP``): ``128`` - - sequence length max (``VLLM_DECODE_SEQ_BUCKET_MAX``): ``max(128, (max_num_seqs*max_model_len)/block_size)`` + - sequence length min (``VLLM_DECODE_BLOCK_BUCKET_MIN``): ``128`` + - sequence length step (``VLLM_DECODE_BLOCK_BUCKET_STEP``): ``128`` + - sequence length max (``VLLM_DECODE_BLOCK_BUCKET_MAX``): ``max(128, (max_num_seqs*max_model_len)/block_size)`` Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: From c2c1e0f1137569722adda547c8b34c1889cd3230 Mon Sep 17 00:00:00 2001 From: kwisniewski98 Date: Thu, 12 Sep 2024 14:41:49 +0300 Subject: [PATCH 190/341] Move setting gc threshold to separate function --- vllm/worker/habana_model_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index fdf1e9c444406..1d94bbc9dd8ab 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -538,8 +538,9 @@ def __init__( self.seen_configs: set = set() self._mem_margin: Optional[int] = None self._setup_buckets() - + self._set_gc_threshold() + def _set_gc_threshold(self) -> None: # Read 
https://docs.python.org/3/library/gc.html#gc.set_threshold # for comprehensive description of gc generations. # We can either use VLLM_GC_THR_GEN[0-2] (this has higher priority) From 6b3503c2f16e5d8bdadbcdd84b3e1ddeeb1bce13 Mon Sep 17 00:00:00 2001 From: kwisniewski98 Date: Thu, 12 Sep 2024 15:00:24 +0300 Subject: [PATCH 191/341] Fix mypy issues --- vllm/worker/habana_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 1d94bbc9dd8ab..4949fd7aba7ad 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -547,7 +547,7 @@ def _set_gc_threshold(self) -> None: # to set particular generation threshold or use simpler # VLLM_GC_THR_MULTIPLIER to multiply default values. default_gc_thrs = list(gc.get_threshold()) - requested_gc_thrs = [None] * len(default_gc_thrs) + requested_gc_thrs = [0] * len(default_gc_thrs) for i in range(len(default_gc_thrs)): requested_gc_thrs[i] = int(os.environ.get(f'VLLM_GC_THR_GEN{i}', default_gc_thrs[i])) if requested_gc_thrs == default_gc_thrs: From 8535d53b309397f194d0f5c85cab69130b1cd083 Mon Sep 17 00:00:00 2001 From: kwisniewski98 Date: Thu, 12 Sep 2024 15:03:26 +0300 Subject: [PATCH 192/341] Fix line too long --- vllm/worker/habana_model_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 4949fd7aba7ad..577ba80e6185c 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -549,7 +549,8 @@ def _set_gc_threshold(self) -> None: default_gc_thrs = list(gc.get_threshold()) requested_gc_thrs = [0] * len(default_gc_thrs) for i in range(len(default_gc_thrs)): - requested_gc_thrs[i] = int(os.environ.get(f'VLLM_GC_THR_GEN{i}', default_gc_thrs[i])) + requested_gc_thrs[i] = int(os.environ.get(f'VLLM_GC_THR_GEN{i}', + default_gc_thrs[i])) if requested_gc_thrs == default_gc_thrs: gc_thr_multiplier = int(os.environ.get('VLLM_GC_THR_MULTIPLIER', 2)) requested_gc_thrs = [t * gc_thr_multiplier for t in default_gc_thrs] From 27b618a3e889d28731cc909919b12b1c97b36244 Mon Sep 17 00:00:00 2001 From: kwisniewski98 Date: Thu, 12 Sep 2024 15:08:22 +0300 Subject: [PATCH 193/341] Format files --- vllm/worker/habana_model_runner.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 577ba80e6185c..e61a76fa3dadf 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -539,7 +539,7 @@ def __init__( self._mem_margin: Optional[int] = None self._setup_buckets() self._set_gc_threshold() - + def _set_gc_threshold(self) -> None: # Read https://docs.python.org/3/library/gc.html#gc.set_threshold # for comprehensive description of gc generations. 
@@ -549,11 +549,14 @@ def _set_gc_threshold(self) -> None: default_gc_thrs = list(gc.get_threshold()) requested_gc_thrs = [0] * len(default_gc_thrs) for i in range(len(default_gc_thrs)): - requested_gc_thrs[i] = int(os.environ.get(f'VLLM_GC_THR_GEN{i}', - default_gc_thrs[i])) + requested_gc_thrs[i] = int( + os.environ.get(f'VLLM_GC_THR_GEN{i}', default_gc_thrs[i])) if requested_gc_thrs == default_gc_thrs: - gc_thr_multiplier = int(os.environ.get('VLLM_GC_THR_MULTIPLIER', 2)) - requested_gc_thrs = [t * gc_thr_multiplier for t in default_gc_thrs] + gc_thr_multiplier = int(os.environ.get('VLLM_GC_THR_MULTIPLIER', + 2)) + requested_gc_thrs = [ + t * gc_thr_multiplier for t in default_gc_thrs + ] gc.set_threshold(*requested_gc_thrs) def load_model(self) -> None: From 35a4a984a79dc421320a2e520005e48ed884571d Mon Sep 17 00:00:00 2001 From: Michal Adamczyk Date: Thu, 12 Sep 2024 15:53:33 +0200 Subject: [PATCH 194/341] Remove hardcoded value from softmax in flat_pa (#280) This PR removes the hardcoded value used to normalize softmax in flat_pa . Current approach is to use the global maximum as it is very easy to compute, but it has the drawback that other samples in a batch might slightly affect numerical stability. This is a first step to eliminated some of the INF/NaN issues we see in certain configurations and by no means this is a complete solutions. This needs to be revised in the future. --- vllm/hpu/ops.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 3d76c36f2648b..939d195a12b08 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -40,7 +40,18 @@ def block2batch(tensor, block_mapping): def block_softmax(batch_size, attn, block_mapping): - attn.sub_(10.0) + # We're using global maximum to decrease the exponent as + # it's fast to compute and performs reasonably well. + # This is by no means a final solution and needs to + # be properly addressed in the future. 
+ # + # Additionally there's a bug where 'max' is not parallelized + # across TPC cores, so we need to split the tensor manually + # instead of simply doing attn_max = attn.max() + + tail_dims = tuple(range(1, attn.dim())) + attn_max = attn.amax(tail_dims).amax() + attn.sub_(attn_max) attn = attn.exp_() sums = attn.sum(dim=-1).unsqueeze(-1) sums = block2batch(sums, block_mapping) From 046cb25a4a549f985105152cb3dec2c25279252e Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Thu, 12 Sep 2024 15:23:51 +0000 Subject: [PATCH 195/341] Fix yapf detected format issue Signed-off-by: Chendi.Xue --- vllm/model_executor/models/dbrx.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index e3a45b26d909b..71362299a9fcf 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -82,17 +82,15 @@ def __init__( self.router = DbrxRouter(config, self.params_dtype) self.ws = nn.Parameter( - torch.empty( - self.num_total_experts, - 2 * self.intermediate_size, - self.d_model, - dtype=self.params_dtype)) + torch.empty(self.num_total_experts, + 2 * self.intermediate_size, + self.d_model, + dtype=self.params_dtype)) self.w2s = nn.Parameter( - torch.empty( - self.num_total_experts, - self.d_model, - self.intermediate_size, - dtype=self.params_dtype)) + torch.empty(self.num_total_experts, + self.d_model, + self.intermediate_size, + dtype=self.params_dtype)) set_weight_attrs( self.ws, From aa4c59cf7047c5250f8d9f6dea988d3c48bb508e Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Thu, 12 Sep 2024 15:14:39 +0000 Subject: [PATCH 196/341] some update to vision model Signed-off-by: Chendi.Xue --- vllm/worker/habana_model_runner.py | 47 +++++++++++++++--------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index dec1b65858eb4..e690f37dd820e 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -39,6 +39,8 @@ _add_sampling_metadata_broadcastable_dict, _init_attn_metadata_from_tensor_dict, _init_sampling_metadata_from_tensor_dict) +from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors, + MultiModalInputs) from .profiler import Profiler @@ -250,7 +252,7 @@ class PreparePromptMetadata(NamedTuple): lora_index_mapping: List[List[int]] lora_prompt_mapping: List[List[int]] lora_requests: Set[LoRARequest] - multi_modal_input: Optional[torch.Tensor] + multi_modal_kwargs: Dict[str, BatchedTensors] slot_mapping: List[List[int]] @classmethod @@ -264,7 +266,7 @@ def empty(cls): lora_index_mapping=[], lora_prompt_mapping=[], lora_requests=set(), - multi_modal_input=None, + multi_modal_kwargs=None, slot_mapping=[], ) @@ -452,6 +454,10 @@ def __init__( self._mem_margin: Optional[int] = None self._setup_buckets() + # Multi-modal data support + self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \ + .create_input_mapper(self.model_config) + def load_model(self) -> None: import habana_frameworks.torch.core as htcore if self.model_config.quantization == 'inc': @@ -623,7 +629,7 @@ def _prepare_prompt( context_lens: List[int] = [] query_lens: List[int] = [] prefix_block_tables: List[List[int]] = [] - multi_modal_input_list: List[torch.Tensor] = [] + multi_modal_inputs_list: List[MultiModalInputs] = [] if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() @@ -681,9 +687,10 @@ def _prepare_prompt( # is always the first token in the sequence. 
input_positions.append(list(range(context_len, seq_len))) - if seq_group_metadata.multi_modal_data: - multi_modal_input_list.append( - seq_group_metadata.multi_modal_data.data) + mm_data = seq_group_metadata.multi_modal_data + if mm_data: + mm_kwargs = self.multi_modal_input_mapper(mm_data) + multi_modal_inputs_list.append(mm_kwargs) if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized @@ -725,15 +732,6 @@ def _prepare_prompt( dtype=torch.int, device=self.device) - if multi_modal_input_list: - assert self.multimodal_config, ( - "Multi-modal inputs are only supported by " - "vision language models.") - multi_modal_input = torch.cat(multi_modal_input_list, - dim=0).to(self.device) - else: - multi_modal_input = None - max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) max_prompt_len = max( find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), @@ -806,6 +804,9 @@ def _prepare_prompt( num_decode_tokens=0, slot_mapping=slot_mapping, ) + multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list, + device=self.device) + return PreparePromptMetadata( input_tokens=input_tokens, input_positions=input_positions, @@ -815,7 +816,7 @@ def _prepare_prompt( lora_index_mapping=lora_index_mapping, lora_prompt_mapping=lora_prompt_mapping, lora_requests=lora_requests, - multi_modal_input=multi_modal_input, + multi_modal_kwargs=multi_modal_kwargs, slot_mapping=slot_mapping, ) @@ -930,7 +931,7 @@ def prepare_input_tensors( input_positions = None lora_mapping = None lora_requests = None - multi_modal_input = None + multi_modal_kwargs = None batch_type = None seq_lens = None query_lens = None @@ -969,7 +970,7 @@ def prepare_input_tensors( lora_index_mapping, lora_prompt_mapping, lora_requests, - multi_modal_input, + multi_modal_kwargs, slot_mapping, ) = self._prepare_prompt(prefill_reqs) ( @@ -1047,7 +1048,7 @@ def prepare_input_tensors( "selected_token_indices": sampling_metadata.selected_token_indices, "lora_requests": lora_requests, "lora_mapping": lora_mapping, - "multi_modal_input": multi_modal_input, + "multi_modal_kwargs": multi_modal_kwargs, "num_prefill_tokens": num_prefill_tokens, "num_decode_tokens": num_decode_tokens, "slot_mapping": slot_mapping, @@ -1073,7 +1074,7 @@ def prepare_input_tensors( attn_metadata=attn_metadata, lora_requests=lora_requests, lora_mapping=lora_mapping, - multi_modal_kwargs=multi_modal_input, + multi_modal_kwargs=multi_modal_kwargs, real_batch_size=real_batch_size, batch_size_padded=batch_size_padded), sampling_metadata @@ -1592,7 +1593,6 @@ def execute_model( input_positions = model_input.input_positions attn_metadata = model_input.attn_metadata sampling_metadata = model_input.sampling_metadata - multi_modal_input = model_input.multi_modal_kwargs real_batch_size = model_input.real_batch_size batch_size_padded = model_input.batch_size_padded assert input_tokens is not None @@ -1610,10 +1610,9 @@ def execute_model( "positions": input_positions, "kv_caches": kv_caches, "attn_metadata": self.trim_attn_metadata(attn_metadata), - "intermediate_tensors": intermediate_tensors + "intermediate_tensors": intermediate_tensors, + **(model_input.multi_modal_kwargs or {}), } - if multi_modal_input is not None: - execute_model_kwargs.update(multi_modal_input) if htorch.utils.internal.is_lazy(): execute_model_kwargs.update({ "bypass_hpu_graphs": not use_graphs, From 1a35da26ce0810a7e11d1ec8a572b7b6729a8937 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Fri, 13 Sep 2024 16:36:29 +0000 Subject: [PATCH 
197/341] fix ruff detected format error Signed-off-by: Chendi.Xue --- vllm/worker/habana_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index ff20316f89de2..2abc4b6fdf38c 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -32,6 +32,8 @@ from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata from vllm.model_executor.model_loader import get_model +from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors, + MultiModalInputs) from vllm.sampling_params import SamplingParams from vllm.sequence import (IntermediateTensors, SamplerOutput, SequenceData, SequenceGroupMetadata) @@ -43,8 +45,6 @@ _add_sampling_metadata_broadcastable_dict, _init_attn_metadata_from_tensor_dict, _init_sampling_metadata_from_tensor_dict) -from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors, - MultiModalInputs) from .profiler import Profiler From 3b710a6139dac38f18ec01753248d0b434a4e5ac Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Fri, 13 Sep 2024 16:44:37 +0000 Subject: [PATCH 198/341] fix mypy format error Signed-off-by: Chendi.Xue --- vllm/worker/habana_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 2abc4b6fdf38c..c32ee9f92e694 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -330,7 +330,7 @@ class PreparePromptMetadata(NamedTuple): lora_index_mapping: List[List[int]] lora_prompt_mapping: List[List[int]] lora_requests: Set[LoRARequest] - multi_modal_kwargs: Dict[str, BatchedTensors] + multi_modal_kwargs: Optional[Dict[str, BatchedTensors]] slot_mapping: List[List[int]] lora_mask: Optional[torch.Tensor] lora_logits_mask: Optional[torch.Tensor] From 5abe4d7ba2c30713b0e56829b84cfaee202ee09a Mon Sep 17 00:00:00 2001 From: kwisniewski98 Date: Mon, 16 Sep 2024 15:39:47 +0300 Subject: [PATCH 199/341] Move ALiBi to supported features in README_GAUDI.md --- README_GAUDI.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index 5109f7ddf9927..644829210125c 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -81,13 +81,13 @@ Supported Features - Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) for accelerating low-batch latency and throughput +- Attention with Linear Biases (ALiBi) Unsupported Features ==================== - Beam search - LoRA adapters -- Attention with Linear Biases (ALiBi) - Quantization (AWQ, FP8 E5M2, FP8 E4M3) - Prefill chunking (mixed-batch inferencing) From 1a712d5be7127fb8b4b1e9a8d09d62dd6a38a874 Mon Sep 17 00:00:00 2001 From: kwisniewski98 Date: Tue, 17 Sep 2024 12:35:35 +0300 Subject: [PATCH 200/341] Move ALiBi to supported features in gaudi-installation.rst --- docs/source/getting_started/gaudi-installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index 27bc0186675f1..328f9e723ec71 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -76,13 +76,13 @@ Supported Features - Tensor parallelism support for multi-card inference - Inference with `HPU Graphs `__ for accelerating low-batch latency and 
throughput +- Attention with Linear Biases (ALiBi) Unsupported Features ==================== - Beam search - LoRA adapters -- Attention with Linear Biases (ALiBi) - Quantization (AWQ, FP8 E5M2, FP8 E4M3) - Prefill chunking (mixed-batch inferencing) From a9de5ba2385d5c332a2610a055465234905ff334 Mon Sep 17 00:00:00 2001 From: Jakub Maksymczuk Date: Tue, 17 Sep 2024 12:31:26 +0200 Subject: [PATCH 201/341] Add fake HPU mode to Habana components with dummy habana_frameworks module. (#250) Co-authored-by: Konrad Zawora --- .github/workflows/cpu-test.yml | 34 ++++++++++++ examples/offline_inference_fakehpu.py | 38 +++++++++++++ vllm/__init__.py | 4 ++ vllm/executor/ray_habana_executor.py | 12 +++-- vllm/executor/ray_utils.py | 5 +- vllm/model_executor/model_loader/loader.py | 7 ++- vllm/model_executor/models/opt.py | 1 - vllm/utils.py | 63 ++++++++++++++++++++++ vllm/worker/cache_engine.py | 4 +- vllm/worker/habana_model_runner.py | 11 ++-- vllm/worker/habana_worker.py | 18 +++++-- 11 files changed, 177 insertions(+), 20 deletions(-) create mode 100644 .github/workflows/cpu-test.yml create mode 100644 examples/offline_inference_fakehpu.py diff --git a/.github/workflows/cpu-test.yml b/.github/workflows/cpu-test.yml new file mode 100644 index 0000000000000..89a702f9751d9 --- /dev/null +++ b/.github/workflows/cpu-test.yml @@ -0,0 +1,34 @@ +name: cpu-test + +on: + # Trigger the workflow on push or pull request, + # but only for the habana_main branch + push: + branches: + - habana_main + pull_request: + branches: + - habana_main + + +jobs: + cputest: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10"] + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install torch --extra-index-url https://download.pytorch.org/whl/cpu + pip install -r requirements-hpu.txt + VLLM_TARGET_DEVICE=hpu python setup.py develop + - name: cpu-test + run: | + VLLM_SKIP_WARMUP=true VLLM_PROMPT_SEQ_BUCKET_MAX=128 VLLM_USE_FAKE_HPU=1 python examples/offline_inference_fakehpu.py diff --git a/examples/offline_inference_fakehpu.py b/examples/offline_inference_fakehpu.py new file mode 100644 index 0000000000000..972d84b60b318 --- /dev/null +++ b/examples/offline_inference_fakehpu.py @@ -0,0 +1,38 @@ +import os + +from vllm import LLM, SamplingParams + +if os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0': + from vllm.utils import migrate_to_cpu + migrate_to_cpu() + +# Sample prompts. +prompts = [ + "Berlin is the capital city of ", + "Louvre is located in the city of ", + "Barack Obama was the 44th president of ", + "Warsaw is the capital city of ", + "Gniezno is a city in ", + "San Francisco is located in the state of ", + "Llanfairpwllgwyngyll is located in country of ", +] +ref_answers = [ + "Germany", "Paris", "United States", "Poland", "Poland", "California", + "Wales" +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0, n=1, use_beam_search=False) + +# Create an LLM. +llm = LLM(model="facebook/opt-125m", max_model_len=32, max_num_seqs=4) +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. 
+for output, answer in zip(outputs, ref_answers): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert answer in generated_text, ( + f"The generated text does not contain the correct answer: {answer}") +print('PASSED') diff --git a/vllm/__init__.py b/vllm/__init__.py index 0895c571d1d89..29fc02ae3e96a 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -1,4 +1,8 @@ """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" +from vllm.utils import is_fake_hpu, migrate_to_cpu + +if is_fake_hpu(): + migrate_to_cpu() from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index 17e3414a96b57..2a8e2df37f031 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -13,7 +13,7 @@ from vllm.utils import (_run_task_with_lock, error_on_invalid_device_count_status, get_distributed_init_method, get_ip, get_open_port, - get_vllm_instance_id, make_async) + get_vllm_instance_id, is_fake_hpu, make_async) if ray is not None: from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy @@ -87,18 +87,20 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", driver_ip = get_ip() worker_wrapper_kwargs = self._get_worker_wrapper_args() for bundle_id, bundle in enumerate(placement_group.bundle_specs): - if not bundle.get("HPU", 0): + resource_name = "HPU" if not is_fake_hpu() else "CPU" + if not bundle.get(resource_name, 0): continue scheduling_strategy = PlacementGroupSchedulingStrategy( placement_group=placement_group, placement_group_capture_child_tasks=True, placement_group_bundle_index=bundle_id, ) - + resources = {'HPU': num_gpus} if not is_fake_hpu() else {} + num_cpus = 0 if not is_fake_hpu() else num_gpus worker = ray.remote( - num_cpus=0, + num_cpus=num_cpus, num_gpus=0, - resources={'HPU': num_gpus}, + resources=resources, scheduling_strategy=scheduling_strategy, **ray_remote_kwargs, )(RayWorkerWrapper).remote(**worker_wrapper_kwargs) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 507dc04f48123..8f5bc30a9599c 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -3,7 +3,8 @@ from vllm.config import ParallelConfig from vllm.logger import init_logger from vllm.sequence import ExecuteModelRequest -from vllm.utils import get_ip, is_hip, is_hpu, is_tpu, is_xpu +from vllm.utils import (get_ip, hpu_device_string, is_hip, is_hpu, is_tpu, + is_xpu) from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -97,7 +98,7 @@ def initialize_ray_cluster( if is_tpu(): device_str = "TPU" elif is_hpu(): - device_str = "HPU" + device_str = hpu_device_string() # Create placement group for worker processes current_placement_group = ray.util.get_current_placement_group() if current_placement_group: diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 06048d97088e1..c49ccc96c7080 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -37,7 +37,7 @@ supports_vision) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils import is_hpu, is_tpu +from vllm.utils import is_fake_hpu, is_hpu, is_tpu logger = init_logger(__name__) @@ -277,7 +277,10 @@ def 
load_model(self, *, model_config: ModelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module: with set_default_torch_dtype(model_config.dtype): - with torch.device(self.load_config.device): + _device = torch.device( + device_config.device) if is_fake_hpu() else torch.device( + self.load_config.device) + with _device: model = _initialize_model(model_config, self.load_config, lora_config, multimodal_config, cache_config, scheduler_config) diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index a05090cd46648..3f842ea757d2f 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -254,7 +254,6 @@ def forward( if self.project_in is not None: inputs_embeds, _ = self.project_in(inputs_embeds) hidden_states = inputs_embeds + pos_embeds - for i in range(len(self.layers)): layer = self.layers[i] hidden_states = layer(hidden_states, kv_caches[i], attn_metadata) diff --git a/vllm/utils.py b/vllm/utils.py index fa6e132dd3522..04782cf13fce5 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -208,10 +208,41 @@ def is_neuron() -> bool: @lru_cache(maxsize=None) def is_hpu() -> bool: + return _is_habana_frameworks_installed() or _is_built_for_hpu() + + +@lru_cache(maxsize=None) +def is_fake_hpu() -> bool: + return os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0' + + +@lru_cache(maxsize=None) +def hpu_device_string(): + device_string = 'hpu' if not is_fake_hpu() else 'cpu' + return device_string + + +@lru_cache(maxsize=None) +def hpu_backend_string(): + backend_string = 'hccl' if not is_fake_hpu() else 'gloo' + return backend_string + + +@lru_cache(maxsize=None) +def _is_habana_frameworks_installed() -> bool: from importlib import util return util.find_spec('habana_frameworks') is not None +@lru_cache(maxsize=None) +def _is_built_for_hpu() -> bool: + from importlib.metadata import PackageNotFoundError, version + try: + return "gaudi" in version("vllm") + except PackageNotFoundError: + return False + + @lru_cache(maxsize=None) def is_tpu() -> bool: try: @@ -624,18 +655,24 @@ def __init__(self, device=None): @staticmethod def current_device_memory_usage() -> float: + if is_fake_hpu(): + return 0 # Return the device memory usage in bytes. free_hpu_memory, total_hpu_memory = torch.hpu.mem_get_info() return total_hpu_memory - free_hpu_memory @staticmethod def current_free_device_memory() -> float: + if is_fake_hpu(): + return 0 # Return the device memory usage in bytes. free_hpu_memory, _ = torch.hpu.mem_get_info() return free_hpu_memory @staticmethod def total_device_memory() -> float: + if is_fake_hpu(): + return 0 # Return the device memory usage in bytes. 
_, total_hpu_memory = torch.hpu.mem_get_info() return total_hpu_memory @@ -1088,3 +1125,29 @@ async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args, """Utility function to run async task in a lock""" async with lock: return await task(*args, **kwargs) + + +def migrate_to_cpu(): + import importlib + from unittest.mock import MagicMock + + torch.hpu = MagicMock(name="torch.hpu") + + # Adding dummy submodules to habana_frameworks.torch for cpu-test, + # functions from dummy modules will do nothing by default + spec = importlib.util.spec_from_loader('habana_frameworks', loader=None) + sys.modules['habana_frameworks'] = MagicMock() + sys.modules['habana_frameworks'].__spec__ = spec + + builtin_import = __builtins__['__import__'] # type: ignore + + def import_wrapper(name, *args, **kwargs): + if 'habana_frameworks' in name: + sys.modules[name] = MagicMock() + return builtin_import(name, *args, **kwargs) + + __builtins__['__import__'] = import_wrapper + + # In case you want to mock a function to actually do something + import habana_frameworks.torch as htorch + htorch.utils.internal.is_lazy.return_value = False diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index ec0b8c2369210..f678d44f71dd3 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -6,7 +6,7 @@ from vllm.attention import get_attn_backend from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_fake_hpu, is_pin_memory_available) logger = init_logger(__name__) @@ -78,7 +78,7 @@ def _allocate_kv_cache( pin_memory = is_pin_memory_available() if device == "cpu" else False kv_cache: List[torch.Tensor] = [] for _ in range(self.num_attention_layers): - if device == 'hpu': + if device == 'hpu' or is_fake_hpu(): key_cache = torch.zeros(kv_cache_shape, dtype=self.dtype, device=device) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 6c157fd43fffd..171ae0510d6c6 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -37,7 +37,7 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import (IntermediateTensors, SamplerOutput, SequenceData, SequenceGroupMetadata) -from vllm.utils import (HabanaMemoryProfiler, format_bytes, +from vllm.utils import (HabanaMemoryProfiler, format_bytes, is_fake_hpu, is_pin_memory_available, make_tensor_with_pad) from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, @@ -246,7 +246,8 @@ def __init__(self, model, block_size, dtype, enforce_eager): '0').lower() in ['1', 'true'] self.block_size = block_size self.dtype = dtype - if not htorch.utils.internal.is_lazy() and not enforce_eager: + if not is_fake_hpu() and not htorch.utils.internal.is_lazy( + ) and not enforce_eager: self.model = torch.compile(self.model, backend='hpu_backend', dynamic=False) @@ -509,7 +510,9 @@ def __init__( if model_config is not None else None) self.device_config = (device_config if device_config is not None else DeviceConfig()) - + if is_fake_hpu(): + device_config.device = torch.device('cpu') + device_config.device_type = 'cpu' self.device = self.device_config.device self.enforce_eager = self.model_config.enforce_eager self.max_num_seqs = self.scheduler_config.max_num_seqs @@ -618,7 +621,7 @@ def load_model(self) -> None: mark_only_scales_as_const=True) 
logger.info("Preparing model with INC took %s", m_inc.get_summary_string()) - else: + elif not is_fake_hpu(): self.model = self.model.to("hpu") htcore.mark_step() torch.hpu.synchronize() diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 9d083915041fe..b4f6e53c1745a 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -21,7 +21,8 @@ from vllm.model_executor import set_random_seed from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest -from vllm.utils import HabanaMemoryProfiler, format_bytes +from vllm.utils import (HabanaMemoryProfiler, format_bytes, hpu_backend_string, + hpu_device_string, is_fake_hpu) from vllm.worker.cache_engine import CacheEngine from vllm.worker.habana_model_runner import HabanaModelRunner from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerInput @@ -105,6 +106,8 @@ def init_device(self) -> None: if self.device_config.device.type == "hpu": self.device = torch.device("hpu") torch.hpu.set_device(self.device) + elif self.device_config.device_type == "cpu": + self.device = torch.device("cpu") else: raise RuntimeError( f"Not support device type: {self.device_config.device}") @@ -138,6 +141,10 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Execute a forward pass with dummy inputs to profile the memory usage # of the model. + if is_fake_hpu(): + cache_block_size = self.get_cache_block_size_bytes() + fake_hpu_cache_alloc = 4 * 2**30 # take 4 GiB flat on fake hpu + return fake_hpu_cache_alloc // cache_block_size, 0 with HabanaMemoryProfiler() as m: self.model_runner.profile_run() torch.hpu.synchronize() @@ -335,11 +342,12 @@ def init_worker_distributed_environment( local_rank: int = -1, ) -> None: """Initialize the distributed environment.""" + backend = hpu_backend_string() init_distributed_environment(parallel_config.world_size, rank, distributed_init_method, local_rank, - backend='hccl') + backend=backend) ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) @@ -356,15 +364,17 @@ def init_worker_distributed_environment( "distributed_init_method must be set if torch.distributed " "is not already initialized") else: + backend = hpu_backend_string() torch.distributed.init_process_group( - backend="hccl", + backend=backend, world_size=parallel_config.world_size, rank=rank, init_method=distributed_init_method, ) # A small all_reduce for warmup & checking conformance. 
- dummy_tensor_hpu = torch.ones(1).to('hpu') + device = hpu_device_string() + dummy_tensor_hpu = torch.ones(1).to(device) torch.distributed.all_reduce(dummy_tensor_hpu) assert dummy_tensor_hpu.item() == parallel_config.world_size ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, From d39298c1289a7dcc4d95b08bcd7ad90e9fbf12e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Kuligowski?= Date: Tue, 17 Sep 2024 15:46:51 +0200 Subject: [PATCH 202/341] Update documentation on support of fp8 (#288) Update documentation on support of fp8 --- README_GAUDI.md | 3 ++- docs/source/getting_started/gaudi-installation.rst | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index 5109f7ddf9927..2ae9d5f2cc6e4 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -81,6 +81,7 @@ Supported Features - Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) for accelerating low-batch latency and throughput +- INC quantization Unsupported Features ==================== @@ -88,7 +89,7 @@ Unsupported Features - Beam search - LoRA adapters - Attention with Linear Biases (ALiBi) -- Quantization (AWQ, FP8 E5M2, FP8 E4M3) +- AWQ quantization - Prefill chunking (mixed-batch inferencing) Supported Configurations diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index 27bc0186675f1..2d810380af59b 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -76,6 +76,7 @@ Supported Features - Tensor parallelism support for multi-card inference - Inference with `HPU Graphs `__ for accelerating low-batch latency and throughput +- INC quantization Unsupported Features ==================== @@ -83,7 +84,7 @@ Unsupported Features - Beam search - LoRA adapters - Attention with Linear Biases (ALiBi) -- Quantization (AWQ, FP8 E5M2, FP8 E4M3) +- AWQ quantization - Prefill chunking (mixed-batch inferencing) Supported Configurations From ed19acd8a0065410b9172d1fa31b92e348100bf9 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 17 Sep 2024 17:06:07 +0300 Subject: [PATCH 203/341] Reduce default value of VLLM_GRAPH_RESERVED_MEM to 0.1 --- README_GAUDI.md | 6 +++--- docs/source/getting_started/gaudi-installation.rst | 4 ++-- vllm/worker/habana_worker.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index 5109f7ddf9927..9e289658fd5c2 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -315,9 +315,9 @@ mark 90% of free device memory at that point as usable. Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. Environment variable `VLLM_GRAPH_RESERVED_MEM` defines the ratio of memory reserved for HPU Graphs capture. With its default value -(`VLLM_GRAPH_RESERVED_MEM=0.4`), 40% of usable memory will be reserved +(`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory will be reserved for graph capture (later referred to as \"usable graph memory\"), and -the remaining 60% will be utilized for KV cache. Environment variable +the remaining 90% will be utilized for KV cache. Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. 
By default (`VLLM_GRAPH_PROMPT_RATIO=0.5`), both stages have equal memory @@ -445,7 +445,7 @@ Environment variables - `VLLM_SKIP_WARMUP`: if `true`, warmup will be skipped, `false` by default - `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for - HPUGraph capture, `0.4` by default + HPUGraph capture, `0.1` by default - `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory dedicated for prompt graphs, `0.5` by default - `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index 27bc0186675f1..5af81210c4159 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -243,7 +243,7 @@ Before KV cache gets allocated, model weights are loaded onto the device, and a Only after that, ``gpu_memory_utilization`` flag is utilized - at its default value, will mark 90% of free device memory at that point as usable. Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. Environment variable ``VLLM_GRAPH_RESERVED_MEM`` defines the ratio of memory reserved for HPU Graphs capture. -With its default value (``VLLM_GRAPH_RESERVED_MEM=0.4``), 40% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 60% will be utilized for KV cache. +With its default value (``VLLM_GRAPH_RESERVED_MEM=0.1``), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache. Environment variable ``VLLM_GRAPH_PROMPT_RATIO`` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (``VLLM_GRAPH_PROMPT_RATIO=0.5``), both stages have equal memory constraints. Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. ``VLLM_GRAPH_PROMPT_RATIO=0.2`` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. 
@@ -322,7 +322,7 @@ Environment variables **Performance tuning knobs:** - ``VLLM_SKIP_WARMUP``: if ``true``, warmup will be skipped, ``false`` by default -- ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.4`` by default +- ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.1`` by default - ``VLLM_GRAPH_PROMPT_RATIO``: percentage of reserved graph memory dedicated for prompt graphs, ``0.5`` by default - ``VLLM_GRAPH_PROMPT_STRATEGY``: strategy determining order of prompt graph capture, ``min_tokens`` or ``max_bs``, ``min_tokens`` by default - ``VLLM_GRAPH_DECODE_STRATEGY``: strategy determining order of decode graph capture, ``min_tokens`` or ``max_bs``, ``max_bs`` by default diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 9d083915041fe..291a7fc0d4489 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -150,7 +150,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: cache_block_size = self.get_cache_block_size_bytes() graph_reserved_mem = (float( - os.environ.get('VLLM_GRAPH_RESERVED_MEM', '0.4')) + os.environ.get('VLLM_GRAPH_RESERVED_MEM', '0.1')) if not self.model_config.enforce_eager else 0) graph_headroom = 1 - graph_reserved_mem available_hpu_memory = free_hpu_memory * \ From 6a96d9bd9180437b04133d3c023daaa174d8d516 Mon Sep 17 00:00:00 2001 From: Tomasz Zielinski <85164140+tzielinski-habana@users.noreply.github.com> Date: Tue, 17 Sep 2024 16:14:18 +0200 Subject: [PATCH 204/341] Removed vllm.hpu directory and changed relevant imports (#291) Moved files from vllm/hpu to another public repo: https://github.com/HabanaAI/vllm-hpu-extension It can be installed with pip install git+https://github.com/HabanaAI/vllm-hpu-extension.git --- .github/workflows/mypy.yaml | 1 - format.sh | 1 - requirements-hpu.txt | 1 + tests/lora/test_lora_hpu.py | 2 +- vllm/attention/backends/habana_attn.py | 6 +- vllm/attention/ops/habana_paged_attn.py | 3 +- vllm/hpu/__init__.py | 6 - vllm/hpu/cache_ops.py | 107 ------- vllm/hpu/ops.py | 293 ------------------ vllm/hpu/rotary_embed.py | 123 -------- vllm/hpu/utils.py | 61 ---- vllm/lora/layers.py | 3 +- vllm/model_executor/layers/fused_moe/layer.py | 2 +- vllm/model_executor/layers/layernorm.py | 2 +- .../model_executor/layers/rotary_embedding.py | 2 +- vllm/worker/habana_model_runner.py | 2 +- 16 files changed, 12 insertions(+), 603 deletions(-) delete mode 100644 vllm/hpu/__init__.py delete mode 100644 vllm/hpu/cache_ops.py delete mode 100644 vllm/hpu/ops.py delete mode 100644 vllm/hpu/rotary_embed.py delete mode 100644 vllm/hpu/utils.py diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index c2674b914f485..9858d00cfb5c1 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -50,6 +50,5 @@ jobs: mypy vllm/transformers_utils --config-file pyproject.toml mypy vllm/usage --config-file pyproject.toml mypy vllm/worker --config-file pyproject.toml - mypy vllm/hpu --config-file pyproject.toml diff --git a/format.sh b/format.sh index fbfc27a68bb3d..5ad6d6f2938bb 100755 --- a/format.sh +++ b/format.sh @@ -113,7 +113,6 @@ mypy vllm/spec_decode --config-file pyproject.toml mypy vllm/transformers_utils --config-file pyproject.toml mypy vllm/usage --config-file pyproject.toml mypy vllm/worker --config-file pyproject.toml -mypy vllm/hpu --config-file pyproject.toml # If git diff returns a file that is in the skip list, the file may be checked anyway: diff --git a/requirements-hpu.txt 
b/requirements-hpu.txt index e0f03c8464c7b..d451200aa1144 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -6,3 +6,4 @@ ray == 2.32.0 triton pandas tabulate +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@30ee2d1 diff --git a/tests/lora/test_lora_hpu.py b/tests/lora/test_lora_hpu.py index 01b6472745e1c..c8ecaef1a6316 100644 --- a/tests/lora/test_lora_hpu.py +++ b/tests/lora/test_lora_hpu.py @@ -1,7 +1,7 @@ import pytest import torch +from vllm_hpu_extension.ops import LoraMask -from vllm.hpu.ops import LoraMask from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice from .utils import DummyLoRAManager diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 56b71a431aca7..b7b8072de3fe5 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -7,14 +7,14 @@ from typing import Any, Dict, List, Optional, Tuple, Type import torch +import vllm_hpu_extension.ops as ops +from vllm_hpu_extension import cache_ops +from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache -import vllm.hpu.ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention, HabanaPagedAttentionMetadata) -from vllm.hpu import cache_ops -from vllm.hpu.utils import Matmul, Softmax, VLLMKVCache from vllm.logger import init_logger logger = init_logger(__name__) diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py index cab8d7abe95fd..49a3e3f774d58 100644 --- a/vllm/attention/ops/habana_paged_attn.py +++ b/vllm/attention/ops/habana_paged_attn.py @@ -6,8 +6,7 @@ from typing import Dict, List, Optional, Tuple import torch - -from vllm.hpu import cache_ops, ops +from vllm_hpu_extension import cache_ops, ops # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. _PARTITION_SIZE = 512 diff --git a/vllm/hpu/__init__.py b/vllm/hpu/__init__.py deleted file mode 100644 index b8e4d3aac98a7..0000000000000 --- a/vllm/hpu/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. -############################################################################### diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py deleted file mode 100644 index 9042924f68b3d..0000000000000 --- a/vllm/hpu/cache_ops.py +++ /dev/null @@ -1,107 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. 
-############################################################################### - -import math - -import habana_frameworks.torch as htorch -import torch - - -def reshape_and_cache(key, - value, - key_cache, - value_cache, - slot_mapping, - dtype, - is_prompt=False): - num_blocks = key_cache.size(0) - block_size = key_cache.size(1) - slot_mapping = slot_mapping.flatten() - indices = torch.div(slot_mapping, block_size, rounding_mode="floor") - offsets = torch.fmod(slot_mapping, block_size) - num_slots_requested = slot_mapping.size(0) - num_slots_available = num_blocks * block_size - # NOTE(kzawora): HPU PT bridge crashes with - # RuntimeError: Invalid inputs for scatter_nd_onnx - # on index_put when num_slots_requested > num_slots_available. - # This case might occur when we have little kv cache blocks and - # lots of padding, or are doing warmup. - # This loop is a workaround for this issue. Please remove it - # once key_cache.index_put_(indices, offsets), key) works. - num_kv_cache_passes = math.ceil(num_slots_requested / num_slots_available) - for i in range(num_kv_cache_passes): - start_idx = i * num_slots_available - end_idx = (i + 1) * num_slots_available - key_cache.index_put_( - (indices[start_idx:end_idx], offsets[start_idx:end_idx]), - key[start_idx:end_idx]) - value_cache.index_put_( - (indices[start_idx:end_idx], offsets[start_idx:end_idx]), - value[start_idx:end_idx]) - - -def prepare_to_cache(cache, slot_mapping): - num_blocks = cache.size(0) - block_size = cache.size(1) - slot_mapping = slot_mapping.flatten() - indices = torch.div(slot_mapping, block_size, rounding_mode="floor") - offsets = torch.fmod(slot_mapping, block_size) - num_slots_requested = slot_mapping.size(0) - num_slots_available = num_blocks * block_size - # NOTE(kzawora): HPU PT bridge crashes with - # RuntimeError: Invalid inputs for scatter_nd_onnx - # on index_put when num_slots_requested > num_slots_available. - # This case might occur when we have little kv cache blocks and - # lots of padding, or are doing warmup. - # This loop is a workaround for this issue. Please remove it - # once key_cache.index_put_(indices, offsets), key) works. 
- num_kv_cache_passes = math.ceil(num_slots_requested / num_slots_available) - - return num_kv_cache_passes, num_slots_available, indices, offsets - - -def insert_or_update_cache(input, cache, num_kv_cache_passes, - num_slots_available, block_indices, block_offsets): - for i in range(num_kv_cache_passes): - start_idx = i * num_slots_available - end_idx = (i + 1) * num_slots_available - cache.index_put_((block_indices[start_idx:end_idx], - block_offsets[start_idx:end_idx]), - input[start_idx:end_idx]) - - -def swap_blocks(src, dst, block_mapping): - index_src = torch.zeros((1, ), dtype=torch.int32, device=src.device) - index_dst = torch.zeros((1, ), dtype=torch.int32, device=dst.device) - for src_idx, dst_idx in block_mapping.items(): - index_src[0] = src_idx - index_dst[0] = dst_idx - dst.index_put_([index_dst], src.index_select(0, index_src)) - if dst.device.type == 'hpu': - htorch.core.mark_step() - torch.hpu.synchronize() - - -def copy_blocks(key_caches, value_caches, block_mapping): - index_src = torch.zeros((1, ), - dtype=torch.int32, - device=key_caches[0].device) - index_dst = torch.zeros((1, ), - dtype=torch.int32, - device=key_caches[0].device) - for src, dsts in block_mapping.items(): - index_src[0] = src - for dst in dsts: - index_dst[0] = dst - for key_cache in key_caches: - key_cache.index_copy_(0, index_dst, - key_cache.index_select(0, index_src)) - for value_cache in value_caches: - value_cache.index_copy_(0, index_dst, - value_cache.index_select(0, index_src)) - if key_caches[0].device.type == 'hpu': - htorch.core.mark_step() diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py deleted file mode 100644 index 939d195a12b08..0000000000000 --- a/vllm/hpu/ops.py +++ /dev/null @@ -1,293 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. -############################################################################### -from typing import Optional - -import habana_frameworks.torch as htorch -import torch -import torch.nn.functional as F - -from vllm.logger import init_logger - -logger = init_logger(__name__) -HPUFusedRMSNorm = None -try: - from habana_frameworks.torch.hpex.normalization import FusedRMSNorm - HPUFusedRMSNorm = FusedRMSNorm -except ImportError: - logger.warning("Could not import HPU FusedRMSNorm kernel. " - "vLLM will use forward_native implementation of RMSNorm.") -HPUFusedSDPA = None -try: - from habana_frameworks.torch.hpex.kernels import FusedSDPA - HPUFusedSDPA = FusedSDPA -except ImportError: - logger.warning("Could not import HPU FusedSDPA kernel. " - "vLLM will use native implementation.") - - -def batch2block(tensor, block_mapping): - shape = tuple(tensor.shape) - return (block_mapping @ tensor.view(shape[0], -1)).view(-1, *shape[1:]) - - -def block2batch(tensor, block_mapping): - shape = tuple(tensor.shape) - return (block_mapping.t() @ tensor.view(shape[0], -1)).view(-1, *shape[1:]) - - -def block_softmax(batch_size, attn, block_mapping): - # We're using global maximum to decrease the exponent as - # it's fast to compute and performs reasonably well. - # This is by no means a final solution and needs to - # be properly addressed in the future. 
- # - # Additionally there's a bug where 'max' is not parallelized - # across TPC cores, so we need to split the tensor manually - # instead of simply doing attn_max = attn.max() - - tail_dims = tuple(range(1, attn.dim())) - attn_max = attn.amax(tail_dims).amax() - attn.sub_(attn_max) - attn = attn.exp_() - sums = attn.sum(dim=-1).unsqueeze(-1) - sums = block2batch(sums, block_mapping) - sums = batch2block(sums, block_mapping) - sums.add_(1.0e-12) - attn.div_(sums) - return attn - - -def flat_pa(query, key_cache, value_cache, block_list, block_mapping, - block_bias, scale, matmul_qk_op, matmul_av_op, keys_fetch_func, - values_fetch_func): - batch_size = query.size(0) - q_heads = query.size(1) - kv_heads = key_cache.size(2) - - query = batch2block(scale * query, block_mapping).unsqueeze(-2) - key = keys_fetch_func(key_cache, block_list).transpose(1, 2) - value = values_fetch_func(value_cache, block_list).transpose(1, 2) - block_bias = block_bias.view(key.size(0), 1, 1, -1) - - if kv_heads != q_heads: - block_bias = block_bias.unsqueeze(1) - query = query.unflatten(1, (kv_heads, -1)) - key = key.unflatten(1, (kv_heads, 1)) - value = value.unflatten(1, (kv_heads, 1)) - key = key.transpose(3, 4) - else: - key = key.transpose(2, 3) - - attn = matmul_qk_op(query, key) + block_bias - attn = block_softmax(batch_size, attn, block_mapping) - attn = matmul_av_op(attn, value) - attn = block2batch(attn, block_mapping) - attn = attn.squeeze(-2) - if kv_heads != q_heads: - attn = attn.flatten(1, 2) - return attn - - -def silu_and_mul(x: torch.Tensor) -> torch.Tensor: - d = x.shape[-1] // 2 - return F.silu(x[..., :d]) * x[..., d:] - - -#TODO: remove after fusedsdpa fix for query_head != kv_head -def repeat_kv(kv: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
- The kv go from (batch, num_key_value_heads, seqlen, head_dim) to - (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = kv.shape - if n_rep == 1: - return kv - kv = kv[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, - head_dim) - return kv.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -def prompt_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attn_bias: Optional[torch.Tensor] = None, - p: float = 0.0, - scale: Optional[float] = None, - matmul_qk_op=torch.matmul, - softmax_op=torch.softmax, - matmul_av_op=torch.matmul, - valid_seq_lengths: Optional[torch.Tensor] = None, -) -> torch.Tensor: - query = query.transpose(1, 2) - key = key.transpose(1, 2) - value = value.transpose(1, 2) - query_heads = query.size(1) - kv_heads = key.size(1) - if attn_bias is not None or HPUFusedSDPA is None: - if query_heads != kv_heads: - query = query.unflatten(1, (kv_heads, -1)) - key = key.unflatten(1, (kv_heads, 1)) - value = value.unflatten(1, (kv_heads, 1)) - if attn_bias is not None: - attn_bias = attn_bias.unsqueeze(2) - attn_weights = matmul_qk_op(query * scale, key.transpose(-1, -2)) - if attn_bias is not None: - attn_weights.add_(attn_bias) - attn_weights = softmax_op(attn_weights, dim=-1) - attn_weights = matmul_av_op(attn_weights, value) - if query_heads != kv_heads: - attn_weights = attn_weights.flatten(1, 2) - else: - #TODO: remove after fusedsdpa fix for query_heads != kv_heads - if query_heads != kv_heads: - key = repeat_kv(key, int(query_heads // kv_heads)) - value = repeat_kv(value, int(query_heads // kv_heads)) - softmax_mode = 'fast' - recompute_mode = True - attn_weights = FusedSDPA.apply(query, key, value, None, 0.0, True, - scale, softmax_mode, recompute_mode, - valid_seq_lengths, 'right') - attn_weights = attn_weights.transpose(1, 2) - return attn_weights - - -class LoraMask: - lora_mask = None - - @staticmethod - def setLoraMask(mask): - LoraMask.lora_mask = mask - - @staticmethod - def getLoraMask(): - return LoraMask.lora_mask - - -def dispatch_bgmv_linear( - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - indices: torch.LongTensor, - layer_idx: int, - scale: float, -): - """ - `wa_t_all` and `wb_t_all` contains all LoRA A and LoRA B weight matrices - stacked at dimension 0 into single tensors, assuming same rank. `wa` is the - reshaped and transposed version of `wa_t_all` of shape - (h_in, max_loras * lora_rank) and `wb` is the transposed and reshaped - version of `wb_t_all` of shape (max_loras * lora_rank, h_out). - - Matmul input `x` with `wa`. Multiply `x` with a mask to zero-out inputs of - inactive LoRA indices. Matmul masked output with `wb` and scale it to get - the final output. - """ - - assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' - mask = LoraMask.getLoraMask() - - wa = wa_t_all[:, 0, :, :] - wb = wb_t_all[:, 0, :, :].transpose(1, 2) - wa = wa.reshape(wa.shape[0] * wa.shape[1], wa.shape[2]).transpose(0, 1) - wb = wb.reshape(wb.shape[0] * wb.shape[1], wb.shape[2]) - - out = x @ wa - assert (out.shape == mask.shape) - out = out * mask - out = out @ wb - y += out * scale - - -def dispatch_bgmv_embedding( - y: torch.Tensor, - x: torch.Tensor, - wb_t_all: torch.Tensor, - indices: torch.LongTensor, - layer_idx: int, - scale: float, -): - """ - `wb_t_all` contains all LoRA-B weight matrices stacked at dimension 0 into - a single tensor, assuming same rank. 
`wb` is the transposed and reshaped - version of `wb_t_all` of shape (num_loras * lora_rank, embedding_dim). - - Output of LoRA-A embedding (tensor x) is repeated max_loras times to match - the shape of `wb`. Multiply `x` with a mask to zero-out inputs of inactive - LoRA indices. Matmul masked output with `wb` and scale it to get the final - output. - """ - - assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' - max_loras = wb_t_all.size(0) - - x = x.repeat(1, max_loras) - x = x * LoraMask.getLoraMask() - wb = wb_t_all[:, 0, :, :].transpose(1, 2) - wb = wb.reshape(wb.shape[0] * wb.shape[1], wb.shape[2]) - out = x @ wb - y += out * scale - - -class MoeMatmul(torch.nn.Module): - - def __init__(self): - super().__init__() - - def set_weight(self, w): - self.weight = w - - def calc(self, state, expert_id, w): - self.weight = w[expert_id].transpose(0, 1) - return self.forward(state) - - def forward(self, state): - return torch.matmul(state, self.weight) - - -class StaticFusedMOE(torch.nn.Module): - - def __init__(self, num_total_experts): - super().__init__() - self.w13_list = torch.nn.ModuleList( - [MoeMatmul() for _ in range(num_total_experts)]) - self.w2_list = torch.nn.ModuleList( - [MoeMatmul() for _ in range(num_total_experts)]) - self.num_total_experts = num_total_experts - - def forward(self, hidden_states, w1, w2, score, topk): - B, D = hidden_states.shape - routing_weights = F.softmax(score, dim=1, dtype=torch.float32) - routing_weights, selected_experts = torch.topk(routing_weights, - topk, - dim=-1) - routing_weights /= routing_weights.sum(dim=-1, keepdim=True) - routing_weights = routing_weights.to(hidden_states.dtype) - final_hidden_states = torch.zeros((1, B, D), - dtype=hidden_states.dtype, - device=hidden_states.device) - padded_weights = torch.zeros((B, self.num_total_experts), - dtype=hidden_states.dtype, - device=hidden_states.device) - padded_weights.scatter_(-1, selected_experts, routing_weights) - padded_weights = padded_weights.reshape(-1, B, self.num_total_experts) - padded_weights = padded_weights.permute(2, 0, 1).unsqueeze(-1) - htorch.core.mark_step() - - for expert_idx in range(self.num_total_experts): - padded_weight = padded_weights[expert_idx] - current_state_static = hidden_states.reshape(-1, D) - w_output = self.w13_list[expert_idx].calc(current_state_static, - expert_idx, w1) - w_output = silu_and_mul(w_output) - w_output = self.w2_list[expert_idx].calc(w_output, expert_idx, w2) - current_hidden_states_static = w_output * padded_weight - final_hidden_states += current_hidden_states_static - - return final_hidden_states.view(-1, D) diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py deleted file mode 100644 index 1857253f47f1b..0000000000000 --- a/vllm/hpu/rotary_embed.py +++ /dev/null @@ -1,123 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. -############################################################################### - -import torch -import torch.nn as nn - -from vllm.logger import init_logger -from vllm.utils import is_hpu - -logger = init_logger(__name__) - -if is_hpu(): - try: - from habana_frameworks.torch.hpex.kernels import ( - RotaryPosEmbeddingHelperV1 as FusedRoPE) - except ImportError: - logger.warning("Could not import HPU FusedRoPE kernel. 
" - "vLLM will use forward_native implementation of RoPE.") - FusedRoPE = None -else: - FusedRoPE = None - - -class HpuRotaryEmbedding(nn.Module): - - def __init__(self, - head_size, - rotary_dim, - max_position_embeddings=2048, - base=10000, - is_neox_style=None, - device='hpu', - RoPEFallback=None): - super().__init__() - - self.head_size = head_size - self.dim = rotary_dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base**( - torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache(seq_len=max_position_embeddings, - device=self.inv_freq.device, - dtype=torch.get_default_dtype()) - if FusedRoPE is None: - assert RoPEFallback is not None, ( - "HPU FusedRoPE kernel could not be imported, and " - "fallback RoPE implementation was not provided!") - self.fallback_impl = RoPEFallback(head_size, - rotary_dim, - max_position_embeddings, - base, - is_neox_style, - dtype=torch.get_default_dtype()) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, - device=device, - dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order - # to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", - emb.cos().to(dtype), - persistent=False) - self.register_buffer("sin_cached", - emb.sin().to(dtype), - persistent=False) - - def forward(self, positions: torch.Tensor, query: torch.Tensor, - key: torch.Tensor): - if FusedRoPE is None: - return self.fallback_impl(positions, query, key) - if query.dim() == 2: - query = query.unsqueeze(0) - if key.dim() == 2: - key = key.unsqueeze(0) - if positions.dim() == 1: - positions = positions.unsqueeze(0) - seq_len = key.shape[-2] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, - device=query.device, - dtype=query.dtype) - - cos, sin = self.cos_cached[:seq_len].to( - dtype=query.dtype), self.sin_cached[:seq_len].to(dtype=query.dtype) - query = query.reshape( - (query.shape[0], query.shape[1], query.shape[2] // self.head_size, - self.head_size)) - key = key.reshape((key.shape[0], key.shape[1], - key.shape[2] // self.head_size, self.head_size)) - query_rot = query[..., :self.dim] - key_rot = key[..., :self.dim] - if self.dim < self.head_size: - query_pass = query[..., self.dim:] - key_pass = key[..., self.dim:] - - if len(positions[0]) == 1: - cos = self.cos_cached[positions].unsqueeze(2).to(dtype=query.dtype) - sin = self.sin_cached[positions].unsqueeze(2).to(dtype=query.dtype) - else: - cos = cos[positions].unsqueeze(2) - sin = sin[positions].unsqueeze(2) - query, key = FusedRoPE.apply(query_rot, cos, sin, - 0), FusedRoPE.apply(key_rot, cos, sin, 0) - if self.dim < self.head_size: - query = torch.cat((query, query_pass), dim=-1) - key = torch.cat((key, key_pass), dim=-1) - return query.reshape( - (query.shape[0], query.shape[1], - query.shape[2] * query.shape[3])), key.reshape( - (key.shape[0], key.shape[1], key.shape[2] * key.shape[3])) diff --git a/vllm/hpu/utils.py b/vllm/hpu/utils.py deleted file mode 100644 index 13204b83d5742..0000000000000 --- a/vllm/hpu/utils.py +++ /dev/null @@ -1,61 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. -############################################################################### - -from functools import wraps - -import habana_frameworks.torch as htorch -import torch - -from vllm.hpu.cache_ops import insert_or_update_cache - - -def with_mark_steps(fn): - - @wraps(fn) - def wrapped(*args, **kwargs): - htorch.core.mark_step() - result = fn(*args, **kwargs) - del args - del kwargs - htorch.core.mark_step() - return result - - return wrapped - - -class Matmul(torch.nn.Module): - - def __init__(self): - super(Matmul, self).__init__() - - def forward(self, x, y): - return torch.matmul(x, y) - - -class Softmax(torch.nn.Module): - - def __init__(self): - super().__init__() - - def forward(self, x, dim=None, inv_head=None): - return torch.softmax(x, dim) - - -class VLLMKVCache(torch.nn.Module): - - def __init__(self): - super(VLLMKVCache, self).__init__() - - def forward(self, input, cache, num_kv_cache_passes, num_slots_available, - block_indices, block_offset): - insert_or_update_cache(input, cache, num_kv_cache_passes, - num_slots_available, block_indices, - block_offset) - return cache - - def fetch_from_cache(self, cache, blocks): - return cache.index_select(0, blocks) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index aa01e9fb77af2..59b7432b6e6eb 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -30,7 +30,8 @@ from vllm.utils import is_hpu if is_hpu(): - from vllm.hpu.ops import dispatch_bgmv_embedding, dispatch_bgmv_linear + from vllm_hpu_extension.ops import (dispatch_bgmv_embedding, + dispatch_bgmv_linear) if TYPE_CHECKING: pass diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index cf0d5f98f1b01..bda8a0622ef31 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -202,7 +202,7 @@ def __init__( self.num_expert_group = num_expert_group self.topk_group = topk_group if is_hpu(): - from vllm.hpu.ops import StaticFusedMOE + from vllm_hpu_extension.ops import StaticFusedMOE self.hpu_static_fused_moe = StaticFusedMOE(self.num_experts) if quant_config is None: diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index c12668c14887d..9ef532e61a7c0 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -75,7 +75,7 @@ def forward_hpu( x: torch.Tensor, residual: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - from vllm.hpu.ops import HPUFusedRMSNorm + from vllm_hpu_extension.ops import HPUFusedRMSNorm if HPUFusedRMSNorm is None: return self.forward_native(x, residual) if residual is not None: diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 4e3c840bede60..2581e3a74dc72 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -31,7 +31,7 @@ from vllm.utils import is_hpu, is_tpu if is_hpu(): - from vllm.hpu.rotary_embed import HpuRotaryEmbedding + from vllm_hpu_extension.rotary_embed import HpuRotaryEmbedding def _rotate_neox(x: torch.Tensor) -> torch.Tensor: diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 171ae0510d6c6..d1d4e783dfe80 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py 
@@ -19,13 +19,13 @@ import habana_frameworks.torch as htorch import habana_frameworks.torch.internal.bridge_config as bc import torch +from vllm_hpu_extension.ops import LoraMask as LoraMask from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, MultiModalConfig, ParallelConfig, SchedulerConfig) from vllm.distributed.parallel_state import get_world_group -from vllm.hpu.ops import LoraMask as LoraMask from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest From 18d633972d43444cdf9130edb9f960aa34f7fb8f Mon Sep 17 00:00:00 2001 From: Sun Choi Date: Tue, 17 Sep 2024 21:43:05 +0000 Subject: [PATCH 205/341] fix minor logging issue --- vllm/worker/habana_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index d1d4e783dfe80..d465d883898cd 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -92,7 +92,7 @@ def read_bucket_settings(phase: str, dim: str, **defaults): values = [ int(os.environ.get(e, d)) for e, d in zip(env_vars, default_values) ] - for e, v, d in zip(env_vars, values, defaults): + for e, v, d in zip(env_vars, values, default_values): logger.info('%s=%s (default:%s)', e, v, d) return values From b62fba85ac03326e9f466d8d37e91ae1b14a6511 Mon Sep 17 00:00:00 2001 From: Iryna Boiko Date: Wed, 18 Sep 2024 12:09:13 +0200 Subject: [PATCH 206/341] Fix blocks number calculation for Flat PA (#269) Fix blocks number calculation for Flat PA via adding empty table_block (https://github.com/HabanaAI/vllm-fork/issues/158) --- vllm/worker/habana_model_runner.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index d465d883898cd..73156ad6aea5b 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -173,11 +173,16 @@ def generate_prompt_buckets(bs_bucket_config, def generate_decode_buckets(bs_bucket_config, blocks_bucket_config, max_blocks): buckets = [] - for bs in warmup_range(bs_bucket_config): - for blocks in warmup_range(blocks_bucket_config): + bs_buckets = warmup_range(bs_bucket_config) + block_buckets = warmup_range(blocks_bucket_config) + bmin, bstep, bmax = blocks_bucket_config + last_bucket = max_blocks if (max_blocks // bstep + == 0) else (max_blocks // bstep + 1) * bstep + for bs in bs_buckets: + for blocks in block_buckets: if blocks < bs: continue - if blocks > max_blocks: + if blocks > last_bucket: break buckets.append((bs, blocks)) return list(sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))) @@ -964,10 +969,12 @@ def _prepare_decode( seq_lens.append(seq_len) block_table = seq_group_metadata.block_tables[seq_id] - block_number = block_table[position // self.block_size] - if block_number == _PAD_BLOCK_ID: + if len(block_table) == 0: + block_number = _PAD_BLOCK_ID + block_table = [] slot = next(dummy_slots) else: + block_number = block_table[position // self.block_size] block_offset = position % self.block_size slot = block_number * self.block_size + block_offset slot_mapping.append([slot]) @@ -992,7 +999,7 @@ def _prepare_decode( num_decode_tokens = sum(seq_lens) - blocks_used = [len(bt) for bt in block_tables] + blocks_used = [len(bt) for bt in block_tables if bt] block_list = list(itertools.chain(*block_tables)) 
block_mapping_nested: List[List[int]] = [ [i] * b_u for i, b_u in enumerate(blocks_used) @@ -1080,8 +1087,9 @@ def prepare_input_tensors( batch_size_padded = find_bucket(real_batch_size, bucket_cfg) batch_size_padding = batch_size_padded - real_batch_size seq_group_metadata_list = seq_group_metadata_list.copy() - seq_group_metadata_list.extend(seq_group_metadata_list[0] - for _ in range(batch_size_padding)) + seq_group_metadata_list.extend( + self.create_dummy_seq_group_metadata(0, 0, is_prompt) + for _ in range(batch_size_padding)) prefill_reqs = [] decode_reqs = [] From cd7b1c15a3e1a07bf38a9f29acaafc437024be4b Mon Sep 17 00:00:00 2001 From: Iryna Boiko Date: Fri, 20 Sep 2024 09:13:06 +0200 Subject: [PATCH 207/341] Remove dummy seq group data creation from loop (#301) Remove dummy seq metadata from loop for Flat PA fix --- vllm/worker/habana_model_runner.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 73156ad6aea5b..0d5df1f312ec9 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1087,9 +1087,11 @@ def prepare_input_tensors( batch_size_padded = find_bucket(real_batch_size, bucket_cfg) batch_size_padding = batch_size_padded - real_batch_size seq_group_metadata_list = seq_group_metadata_list.copy() - seq_group_metadata_list.extend( - self.create_dummy_seq_group_metadata(0, 0, is_prompt) - for _ in range(batch_size_padding)) + if batch_size_padding > 0: + dummy_seq_group_metadata = self.create_dummy_seq_group_metadata( + 0, 0, is_prompt) + seq_group_metadata_list.extend(dummy_seq_group_metadata + for _ in range(batch_size_padding)) prefill_reqs = [] decode_reqs = [] From 12d7033e768677e78b62b051f2dbe2ab8b994c77 Mon Sep 17 00:00:00 2001 From: Bob Zhu <41610754+czhu15@users.noreply.github.com> Date: Fri, 20 Sep 2024 15:31:57 +0800 Subject: [PATCH 208/341] optimize qwen2 model on Gaudi (#233) Add extra mark_step() on each decode layer to optimize the performance on Gaudi. 
Signed-off-by: Bob Zhu --- vllm/model_executor/models/qwen2.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 3deb3d8840cc4..1e4f62fcce7d6 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -47,6 +47,7 @@ default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.utils import is_hpu from .interfaces import SupportsLoRA @@ -260,6 +261,9 @@ def forward( else: hidden_states = self.embed_tokens(input_ids) residual = None + if is_hpu(): + import habana_frameworks.torch as htorch + htorch.core.mark_step() for i in range(len(self.layers)): layer = self.layers[i] hidden_states, residual = layer( @@ -269,6 +273,9 @@ def forward( attn_metadata, residual, ) + if is_hpu(): + htorch.core.mark_step() + hidden_states, _ = self.norm(hidden_states, residual) return hidden_states From bc39baa482dcfefeae6289e80cea63b4adc9beeb Mon Sep 17 00:00:00 2001 From: hlin99 <73271530+hlin99@users.noreply.github.com> Date: Fri, 20 Sep 2024 16:19:05 +0800 Subject: [PATCH 209/341] fix bug: device_str in initialize_ray_cluster requires uppercase string (#297) fix bug: device_str in initialize_ray_cluster requires uppercase string w/o the bug fix, multi HPUs will encounter "ValueError: The number of required hpus exceeds the total number of available hpus in the placement group" error, as the device_str is not expected as uppercase, then available hpus always returns 0. --- vllm/executor/ray_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 8f5bc30a9599c..ea81c313f2da9 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -98,7 +98,7 @@ def initialize_ray_cluster( if is_tpu(): device_str = "TPU" elif is_hpu(): - device_str = hpu_device_string() + device_str = hpu_device_string().upper() # Create placement group for worker processes current_placement_group = ray.util.get_current_placement_group() if current_placement_group: From b2653ab884da92a67da8c66b612a4dd33ac9efb2 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar <49579433+hlahkar@users.noreply.github.com> Date: Fri, 20 Sep 2024 14:38:36 +0530 Subject: [PATCH 210/341] Fix Lora Rebase (#290) Fixes Lora Related issues in vllm Rebase --- tests/lora/test_lora_hpu.py | 108 +++++++++++++++++++--------- vllm/hpu/ops.py | 2 - vllm/hpu/punica_hpu.py | 77 ++++++++++++++++++++ vllm/lora/layers.py | 109 ++--------------------------- vllm/lora/models.py | 25 ++----- vllm/lora/punica.py | 9 +-- vllm/utils.py | 5 ++ vllm/worker/habana_model_runner.py | 10 +-- 8 files changed, 181 insertions(+), 164 deletions(-) create mode 100644 vllm/hpu/punica_hpu.py diff --git a/tests/lora/test_lora_hpu.py b/tests/lora/test_lora_hpu.py index ddbab66e166b3..57bc19b2170db 100644 --- a/tests/lora/test_lora_hpu.py +++ b/tests/lora/test_lora_hpu.py @@ -1,7 +1,8 @@ import pytest import torch +from vllm.hpu.ops import LoraMask -from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice +from vllm.hpu.punica_hpu import GaudiPunicaWrapper from .utils import DummyLoRAManager @@ -19,7 +20,19 @@ torch.float16: (5e-3, 5e-3), torch.bfloat16: (3e-2, 2e-2), } -MAX_LORAS = 8 + + +def createLoraMask(indices, batch_size, seq_len, max_loras, max_lora_rank, + lora_dtype): + indices = indices.view(-1, 1) + mask = torch.arange(max_loras * 
max_lora_rank, device=indices.device) + mask = mask.view(1, -1) + mask = ((mask >= ((indices) * max_lora_rank)) * + (mask < ((indices + 1) * max_lora_rank))).to(dtype=lora_dtype) + mask = mask.view(batch_size, 1, + -1).expand(batch_size, seq_len, + -1).reshape(batch_size * seq_len, -1) + return mask @pytest.mark.parametrize("m", TENSOR_SIZES) @@ -39,32 +52,41 @@ def test_apply_lora(m, n, k, rank, dtype) -> None: input = torch.rand(k, n, device="hpu", dtype=dtype) expected = input @ lora.lora_a @ lora.lora_b * lora.scaling - lora_a_stack = torch.zeros(MAX_LORAS + 1, + lora_a_stack = torch.zeros(8, 1, lora.lora_a.shape[1], lora.lora_a.shape[0], device="hpu", dtype=dtype) - lora_b_stack = torch.zeros(MAX_LORAS + 1, + lora_b_stack = torch.zeros(8, 1, lora.lora_b.shape[1], lora.lora_b.shape[0], device="hpu", dtype=dtype) - for i in range(MAX_LORAS): + for i in range(lora_a_stack.shape[0]): lora_a_stack[i][0] = lora.lora_a.T lora_b_stack[i][0] = (lora.lora_b * lora.scaling).T output = torch.zeros(k, m, device="hpu", dtype=dtype) - _apply_lora(input, lora_a_stack, lora_b_stack, - torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), - output) + indices = torch.randint(0, + lora_a_stack.shape[0], (len(input), ), + device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + punica_wrapper = GaudiPunicaWrapper(4096, max_batches=256, device="hpu") + + punica_wrapper.add_lora(output, input, lora_a_stack, lora_b_stack, 1.0) + rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) output[:] = 0 - _apply_lora(input, lora_a_stack, lora_b_stack, - torch.full((len(input), ), -1, device="hpu"), output) + indices = torch.full((len(input), ), -1, device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + punica_wrapper.add_lora(output, input, lora_a_stack, lora_b_stack, 1.0) assert torch.allclose(torch.zeros_like(output), output) manager.reset_lora() @@ -99,7 +121,7 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: dim=1) lora_a_stacks = [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_1.lora_a.shape[1], lora_1.lora_a.shape[0], @@ -107,32 +129,40 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: dtype=dtype) for i in range(2) ] lora_b_stacks = [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_1.lora_b.shape[1], lora_1.lora_b.shape[0], device="hpu", dtype=dtype) for i in range(2) ] - for i in range(MAX_LORAS): + for i in range(lora_a_stacks[0].shape[0]): lora_a_stacks[0][i][0] = lora_1.lora_a.T lora_b_stacks[0][i][0] = (lora_1.lora_b * lora_1.scaling).T lora_a_stacks[1][i][0] = lora_2.lora_a.T lora_b_stacks[1][i][0] = (lora_2.lora_b * lora_2.scaling).T output = torch.zeros(k, m, device="hpu", dtype=dtype) - _apply_lora_packed_nslice( - input, lora_a_stacks, lora_b_stacks, - torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), output, - (m // 2, m // 2)) + indices = torch.randint(0, + lora_a_stacks[0].shape[0], (len(input), ), + device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + punica_wrapper = GaudiPunicaWrapper(4096, max_batches=256, device="hpu") + punica_wrapper.add_lora_packed_nslice(output, input, lora_a_stacks, + lora_b_stacks, 1.0, (m // 2, m // 2)) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) output[:] = 0 - _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, - torch.full((len(input), ), -1, device="hpu"), - output, 
(m // 2, m // 2)) + indices = torch.full((len(input), ), -1, device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + punica_wrapper.add_lora_packed_nslice(output, input, lora_a_stacks, + lora_b_stacks, 1.0, (m // 2, m // 2)) assert torch.allclose(torch.zeros_like(output), output) manager.reset_lora() @@ -166,14 +196,14 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: dim=1) lora_a_stacks = [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_q.lora_a.shape[1], lora_q.lora_a.shape[0], device="hpu", dtype=dtype) ] + [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_k.lora_a.shape[1], lora_k.lora_a.shape[0], @@ -181,21 +211,21 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: dtype=dtype) for i in range(2) ] lora_b_stacks = [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_q.lora_b.shape[1], lora_q.lora_b.shape[0], device="hpu", dtype=dtype) ] + [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_k.lora_b.shape[1], lora_k.lora_b.shape[0], device="hpu", dtype=dtype) for i in range(2) ] - for i in range(MAX_LORAS): + for i in range(lora_a_stacks[0].shape[0]): lora_a_stacks[0][i][0] = lora_q.lora_a.T lora_b_stacks[0][i][0] = (lora_q.lora_b * lora_q.scaling).T lora_a_stacks[1][i][0] = lora_k.lora_a.T @@ -204,18 +234,30 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: lora_b_stacks[2][i][0] = (lora_v.lora_b * lora_v.scaling).T output = torch.zeros(k, sum(qkv), device="hpu", dtype=dtype) - _apply_lora_packed_nslice( - input, lora_a_stacks, lora_b_stacks, - torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), output, - (qkv[0], qkv[1], qkv[2])) + indices = torch.randint(0, + lora_a_stacks[0].shape[0], (len(input), ), + device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + punica_wrapper = GaudiPunicaWrapper(4096, max_batches=256, device="hpu") + punica_wrapper.add_lora_packed_nslice(output, input, + lora_a_stacks, + lora_b_stacks, + 1.0, (qkv[0], qkv[1], qkv[2])) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) output[:] = 0 - _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, - torch.full((len(input), ), -1, device="hpu"), - output, (qkv[0], qkv[1], qkv[2])) + indices = torch.full((len(input), ), -1, device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + punica_wrapper.add_lora_packed_nslice(output, input, + lora_a_stacks, + lora_b_stacks, + 1.0, (qkv[0], qkv[1], qkv[2])) assert torch.allclose(torch.zeros_like(output), output) - manager.reset_lora() + manager.reset_lora() \ No newline at end of file diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index b2705429906c4..aaf863aff0cad 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -193,7 +193,6 @@ def dispatch_bgmv_linear( x: torch.Tensor, wa_t_all: torch.Tensor, wb_t_all: torch.Tensor, - indices: torch.LongTensor, layer_idx: int, scale: float, ): @@ -228,7 +227,6 @@ def dispatch_bgmv_embedding( y: torch.Tensor, x: torch.Tensor, wb_t_all: torch.Tensor, - indices: torch.LongTensor, layer_idx: int, scale: float, ): diff --git a/vllm/hpu/punica_hpu.py b/vllm/hpu/punica_hpu.py new file mode 100644 index 0000000000000..aed015ac4ae06 --- /dev/null +++ b/vllm/hpu/punica_hpu.py @@ -0,0 +1,77 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. +############################################################################### + +from typing import TYPE_CHECKING, Callable, List, Optional, Tuple, Union + +import torch +from vllm.lora.punica import PunicaWrapper +from vllm.hpu.ops import dispatch_bgmv_linear, dispatch_bgmv_embedding + +class GaudiPunicaWrapper(PunicaWrapper): + def __init__(self, max_num_batched_tokens: int, max_batches: int, + device: str): + super().__init__(max_num_batched_tokens, max_batches, device) + + def add_lora(self, + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + scale: float, + y_offset: Optional[int] = None, + y_slice_size: Optional[int] = None, + *, + buffer: Optional[torch.Tensor] = None) -> None: + y_org = y + x = x.view(-1, x.shape[-1]) + y = y.view(-1, y.shape[-1]) + dispatch_bgmv_linear(y, x, wa_t_all, wb_t_all, 0, 1.0) + y = y.view_as(y_org) + + def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, + lora_a_stacked: Tuple[torch.Tensor, + torch.Tensor, + torch.Tensor], + lora_b_stacked: Tuple[torch.Tensor, + torch.Tensor, + torch.Tensor], + scale: float, + output_slices: Tuple[int, ...]) -> None: + y_org = y + x = x.view(-1, x.shape[-1]) + y = y.view(-1, y.shape[-1]) + offset_left = 0 + + for slice_idx in range(len(output_slices)): + dispatch_bgmv_linear( + y[:, offset_left:offset_left + output_slices[slice_idx]], + x, lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], 0, 1.0) + offset_left += output_slices[slice_idx] + y = y.view_as(y_org) + + def add_lora_logits(self, + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + scale, + *, + buffer: Optional[torch.Tensor] = None) -> None: + y_org = y + y = y.view(-1, y.shape[-1]) + x = x.view(-1, x.shape[-1]) + dispatch_bgmv_linear(y, x, wa_t_all, wb_t_all, 0, 1.0) + y = y.view_as(y_org) + + def add_lora_embedding( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + add_input: bool = True, + ): + dispatch_bgmv_embedding(y, x, w_t_all, 0, 1.0) \ No newline at end of file diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 9e4a0098dc44e..e6be20edc8ce6 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -29,8 +29,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.platforms import current_platform -if current_platform.is_hpu(): - from vllm.hpu.ops import dispatch_bgmv_embedding, dispatch_bgmv_linear if TYPE_CHECKING: pass @@ -67,87 +65,6 @@ def dec(*args, **kwargs): return dec -def _apply_lora( - x: torch.Tensor, - lora_a_stacked: torch.Tensor, - lora_b_stacked: torch.Tensor, - indices: torch.Tensor, - output: torch.Tensor, -): - """Applies lora to each input. - - This method applies all loras to each input. It uses the - indices vector to determine which lora yields the - correct output. An index of -1 means no lora should be - applied. This method adds the final lora results to the - output. 
- - Input shapes: - x: (batch_size, hidden_dim) - lora_a_stacked: (num_loras, lora_rank, hidden_dim) - lora_b_stacked: (num_loras, output_dim, lora_rank) - indices: (batch_size) - output: (batch_size, output_dim) - """ - org_output = output - x = x.view(-1, x.shape[-1]) - output = output.view(-1, output.shape[-1]) - indices = indices.view(-1) - if current_platform.is_hpu(): - dispatch_bgmv_linear(output, x, lora_a_stacked, lora_b_stacked, - indices, 0, 1.0) - else: - add_lora(output, x, lora_a_stacked, lora_b_stacked, indices, 0, 1.0) - return output.view_as(org_output) - - -def _apply_lora_packed_nslice( - x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], - lora_b_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], - indices: torch.Tensor, - output: torch.Tensor, - output_slices: Tuple[int, ...], -): - """Applies lora to each input. - - This method applies all loras to each input. It uses the - indices vector to determine which lora yields the - correct output. An index of -1 means no lora should be - applied. This method adds the final lora results to the - output. - - This method is used for layers that are composed of multiple sublayers - (slices) packed together. - - Input shapes: - x: (batch_size, hidden_dim) - lora_a_stacked: 3 element tuple of (num_loras, lora_rank, hidden_dim) - lora_b_stacked: 3 element tuple of (num_loras, output_dim, lora_rank) - indices: (batch_size) - output: (batch_size, q_slice_size + 2*kv_slice_size) - output_slices: n-1 element tuple of (slice_size...), - where n is number of slices - """ - org_output = output - x = x.view(-1, x.shape[-1]) - output = output.view(-1, output.shape[-1]) - indices = indices.view(-1) - offset_left = 0 - for slice_idx in range(len(output_slices)): - if is_hpu(): - dispatch_bgmv_linear( - output[:, offset_left:offset_left + output_slices[slice_idx]], - x, lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], - indices, 0, 1.0) - else: - add_lora_slice(output, x, lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], indices, 0, 1.0, - offset_left, output_slices[slice_idx]) - offset_left += output_slices[slice_idx] - return output.view_as(org_output) - - @dataclass class LoRAMapping(AdapterMapping): is_prefill: bool = False @@ -309,22 +226,7 @@ def set_lora( def forward(self, x: torch.Tensor) -> torch.Tensor: added_tokens_mask = x > self.base_layer.org_vocab_size - 1 embeddings_indices = None - if current_platform.is_hpu(): - embedding_len = self.indices_len[3] - # NOTE(vgoel): These asserts can be skipped when upstreaming. - # Can be removed from vllm-fork also once lora functionality - # on Gaudi stabilizes. 
- if current_platform.is_hpu(): - emb_len = embedding_len - x_shape = x.shape - ind_shape = self.embeddings_indices[1].shape - assert embedding_len == x.shape[0] * x.shape[1], \ - f"Extra Info: {emb_len}, {x_shape}, {ind_shape}" - assert embedding_len <= self.embeddings_indices[1].shape[0], \ - f"Extra Info: {emb_len}, {x.shape}, {ind_shape}" - indices = self.embeddings_indices[1][:embedding_len].view_as(x) - else: - embeddings_indices = self.punica_wrapper.embeddings_indices + embeddings_indices = self.punica_wrapper.embeddings_indices indices = embeddings_indices[1].view_as(x) full_lora_a_embeddings = F.embedding( x + indices, @@ -342,12 +244,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings = full_lora_a_embeddings.view( full_lora_a_embeddings.shape[0] * full_lora_a_embeddings.shape[1], -1) + # Embedding layer only need expand op if current_platform.is_hpu(): - dispatch_bgmv_embedding(full_output, full_lora_a_embeddings, - self.lora_b_stacked, - self.indices[:self.indices_len[0]], 0, 1.0) + self.punica_wrapper.add_lora_embedding(full_output, + full_lora_a_embeddings, + self.lora_b_stacked, + add_input=True) else: - # Embedding layer only need expand op self.punica_wrapper.add_expand(full_output, full_lora_a_embeddings, self.lora_b_stacked, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index e3abf0fc96196..c29660eb3bda5 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -26,9 +26,12 @@ parse_fine_tuned_lora_name, replace_submodule) from vllm.model_executor.models.interfaces import SupportsLoRA from vllm.model_executor.models.utils import PPMissingLayer -from vllm.utils import is_pin_memory_available +from vllm.utils import is_pin_memory_available, get_device from vllm.platforms import current_platform +if current_platform.is_hpu(): + from vllm.hpu.punica_hpu import GaudiPunicaWrapper + logger = init_logger(__name__) _GLOBAL_LORA_ID = 0 @@ -428,23 +431,9 @@ def __init__( self.vocab_size = vocab_size self.long_lora_context: Optional[LongContextLoRAContext] = None if current_platform.is_hpu(): - self.base_indices = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device=get_device()) - self.sampler_indices = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device=get_device()) - self.sampler_indices_padded = torch.empty( - self.max_num_batched_tokens, - dtype=torch.long, - device=get_device()) - self.embeddings_indices = torch.empty(2, - self.max_num_batched_tokens, - dtype=torch.long, - device=get_device()) - self.long_lora_indices = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device=get_device()) + self.punica_wrapper = GaudiPunicaWrapper(max_num_batched_tokens, + max_batches=self.max_num_seqs, + device="hpu") else: self.punica_wrapper = PunicaWrapper(max_num_batched_tokens, max_batches=self.max_num_seqs, diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 6d5c834299961..d9c074b6144a1 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -10,6 +10,7 @@ import torch from vllm.triton_utils import HAS_TRITON +from vllm.utils import get_device if HAS_TRITON: from vllm.lora.ops.bgmv_expand import bgmv_expand @@ -104,7 +105,7 @@ def convert_mapping( long_lora_offsets: Optional[torch.Tensor] = None if long_lora_context: long_lora_offsets = torch.zeros(len(index_mapping_indices), - device="cuda", + device=get_device(), dtype=torch.long) prompt_mapping: List[int] = [ lora_index_to_id.index(x) if x > 0 else -1 @@ -131,9 +132,9 @@ def convert_mapping( if long_lora_context: assert 
long_lora_offsets is not None indices_list.append(long_lora_offsets) - indices = torch.tensor(indices_list, dtype=torch.long, device="cuda") + indices = torch.tensor(indices_list, dtype=torch.long, device=get_device()) prompt_mapping_tensor = torch.tensor(prompt_mapping, - device="cuda", + device=get_device(), dtype=torch.long) embeddings_indices = torch.stack([ indices[2] * extra_vocab_size, @@ -145,7 +146,7 @@ def convert_mapping( sampler_indices_padded = sampler_indices.clone() sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 sampler_indices_padded = torch.arange( - 0, len(sampler_indices_padded), device="cuda", dtype=torch.long) + ( + 0, len(sampler_indices_padded), device=get_device(), dtype=torch.long) + ( sampler_indices_padded * len(sampler_indices_padded)) long_lora_indices = None long_lora_indices_len: Optional[int] = None diff --git a/vllm/utils.py b/vllm/utils.py index 6409a2de7b142..ed565d3244541 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -741,6 +741,11 @@ def is_hpu() -> bool: from importlib import util return util.find_spec('habana_frameworks') is not None +def get_device() -> str: + if is_hpu(): + return "hpu" + return "cuda" + class HabanaMemoryProfiler: diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index aefe3508fecb5..5336ad3ed4da9 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -242,11 +242,12 @@ def pad_list(list, k, v): class HpuModelAdapter(): - def __init__(self, model, block_size, enforce_eager): + def __init__(self, model, block_size, dtype, enforce_eager): self.model = model self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', '0').lower() in ['1', 'true'] self.block_size = block_size + self.dtype = dtype if not htorch.utils.internal.is_lazy() and not enforce_eager: self.model = torch.compile(self.model, backend='hpu_backend', @@ -308,7 +309,7 @@ def forward(self, *args, **kwargs): input_ids = kwargs['input_ids'] kwargs['attn_metadata'] = self._update_metadata( kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), - input_ids.device, torch.bfloat16) + input_ids.device, self.dtype) LoraMask.setLoraMask(kwargs.pop('lora_mask')) hidden_states = self.model(*args, **kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) @@ -607,6 +608,7 @@ def load_model(self) -> None: self.model = _maybe_wrap_in_hpu_graph( self.model, self.block_size, + dtype=self.model_config.dtype, enforce_eager=self.enforce_eager) msg = f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}" logger.info(msg) @@ -1844,8 +1846,8 @@ def execute_model( modules = unwrap_model(self.model.model) for module in modules: if isinstance(module, VocabParallelEmbeddingWithLoRA): - for i in range(0, len(module.indices_len)): - module.indices_len[ + for i in range(0, len(module.punica_wrapper.indices_len)): + module.punica_wrapper.indices_len[ i] = sampling_metadata.selected_token_indices.numel( ) lora_logits_mask: torch.Tensor = model_input.lora_logits_mask From 9f8b8e72e9fcb6b7a8cce40e147bbfef57d05883 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 20 Sep 2024 13:26:20 +0300 Subject: [PATCH 211/341] add missing files --- vllm/engine/arg_utils.py | 1 + vllm/platforms/interface.py | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 837332da7efd7..f8b544c6bde4d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -35,6 +35,7 @@ "openvino", "tpu", "xpu", + 
"hpu", ] diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 31bf5268c1f19..fea4358953745 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -42,13 +42,10 @@ def is_rocm(self) -> bool: def is_tpu(self) -> bool: return self._enum == PlatformEnum.TPU -<<<<<<< HEAD def is_hpu(self) -> bool: return self._enum == PlatformEnum.HPU @staticmethod - def get_device_capability(device_id: int = 0) -> Tuple[int, int]: -======= def is_cpu(self) -> bool: return self._enum == PlatformEnum.CPU @@ -89,7 +86,6 @@ def has_device_capability( @classmethod def get_device_name(cls, device_id: int = 0) -> str: ->>>>>>> upstream/main raise NotImplementedError @classmethod From 346139dd6ca1db44e6a7b8f649306fbf8800a5a7 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 20 Sep 2024 16:05:35 +0300 Subject: [PATCH 212/341] format.sh --- format.sh | 29 +++++++++------------- tests/samplers/test_sampler.py | 1 + vllm/hpu/punica_hpu.py | 8 +++--- vllm/lora/layers.py | 2 +- vllm/lora/models.py | 7 +++--- vllm/lora/punica.py | 5 ++-- vllm/model_executor/model_loader/loader.py | 1 + vllm/platforms/hpu.py | 7 +++--- vllm/platforms/interface.py | 1 - vllm/utils.py | 6 +---- vllm/worker/habana_model_runner.py | 2 +- vllm/worker/model_runner_base.py | 2 +- 12 files changed, 34 insertions(+), 37 deletions(-) diff --git a/format.sh b/format.sh index adaed1a51c343..6563d89b192ea 100755 --- a/format.sh +++ b/format.sh @@ -96,23 +96,18 @@ echo 'vLLM yapf: Done' # Run mypy echo 'vLLM mypy:' -mypy tests --config-file pyproject.toml -mypy vllm/*.py --config-file pyproject.toml -mypy vllm/attention --config-file pyproject.toml -mypy vllm/core --config-file pyproject.toml -mypy vllm/distributed --config-file pyproject.toml -mypy vllm/engine --config-file pyproject.toml -mypy vllm/entrypoints --config-file pyproject.toml -mypy vllm/executor --config-file pyproject.toml -mypy vllm/logging --config-file pyproject.toml -mypy vllm/lora --config-file pyproject.toml -mypy vllm/model_executor --config-file pyproject.toml -mypy vllm/multimodal --config-file pyproject.toml -mypy vllm/prompt_adapter --config-file pyproject.toml -mypy vllm/spec_decode --config-file pyproject.toml -mypy vllm/transformers_utils --config-file pyproject.toml -mypy vllm/usage --config-file pyproject.toml -mypy vllm/worker --config-file pyproject.toml +mypy --follow-imports skip # Note that this is less strict than CI +mypy tests --follow-imports skip +mypy vllm/attention --follow-imports skip +mypy vllm/distributed --follow-imports skip +mypy vllm/engine --follow-imports skip +mypy vllm/executor --follow-imports skip +mypy vllm/lora --follow-imports skip +mypy vllm/model_executor --follow-imports skip +mypy vllm/prompt_adapter --follow-imports skip +mypy vllm/spec_decode --follow-imports skip +mypy vllm/worker --follow-imports skip +echo 'vLLM mypy: Done' # If git diff returns a file that is in the skip list, the file may be checked anyway: diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 3cb46dbc213d9..65bd7b09acdc3 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -782,6 +782,7 @@ def test_sampler_include_gpu_probs_tensor(device: str): assert sampler_output.logprobs is not None assert sampler_output.sampled_token_ids is not None + @pytest.mark.parametrize("device", CUDA_DEVICES) def test_topk_topk_scalar(): obj1 = ApplyToppTopkScalar(2) diff --git a/vllm/hpu/punica_hpu.py b/vllm/hpu/punica_hpu.py index 3c37558831bb5..8f732a98c3e29 100644 --- 
a/vllm/hpu/punica_hpu.py +++ b/vllm/hpu/punica_hpu.py @@ -11,7 +11,9 @@ from vllm.lora.punica import PunicaWrapper from vllm_hpu_extension.ops import dispatch_bgmv_linear, dispatch_bgmv_embedding + class GaudiPunicaWrapper(PunicaWrapper): + def __init__(self, max_num_batched_tokens: int, max_batches: int, device: str): super().__init__(max_num_batched_tokens, max_batches, device) @@ -48,8 +50,8 @@ def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, for slice_idx in range(len(output_slices)): dispatch_bgmv_linear( - y[:, offset_left:offset_left + output_slices[slice_idx]], - x, lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], 0, 1.0) + y[:, offset_left:offset_left + output_slices[slice_idx]], x, + lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], 0, 1.0) offset_left += output_slices[slice_idx] y = y.view_as(y_org) @@ -74,4 +76,4 @@ def add_lora_embedding( w_t_all: torch.Tensor, add_input: bool = True, ): - dispatch_bgmv_embedding(y, x, w_t_all, 0, 1.0) \ No newline at end of file + dispatch_bgmv_embedding(y, x, w_t_all, 0, 1.0) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index b6e7e6783a328..461f4d435d67d 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -253,7 +253,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: self.punica_wrapper.add_lora_embedding(full_output, full_lora_a_embeddings, self.lora_b_stacked, - add_input=True) + add_input=True) else: self.punica_wrapper.add_expand(full_output, full_lora_a_embeddings, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index c29660eb3bda5..6d6fd05c55e93 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -431,9 +431,10 @@ def __init__( self.vocab_size = vocab_size self.long_lora_context: Optional[LongContextLoRAContext] = None if current_platform.is_hpu(): - self.punica_wrapper = GaudiPunicaWrapper(max_num_batched_tokens, - max_batches=self.max_num_seqs, - device="hpu") + self.punica_wrapper = GaudiPunicaWrapper( + max_num_batched_tokens, + max_batches=self.max_num_seqs, + device="hpu") else: self.punica_wrapper = PunicaWrapper(max_num_batched_tokens, max_batches=self.max_num_seqs, diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index d9c074b6144a1..5a2f02ee91456 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -146,8 +146,9 @@ def convert_mapping( sampler_indices_padded = sampler_indices.clone() sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 sampler_indices_padded = torch.arange( - 0, len(sampler_indices_padded), device=get_device(), dtype=torch.long) + ( - sampler_indices_padded * len(sampler_indices_padded)) + 0, len(sampler_indices_padded), device=get_device(), + dtype=torch.long) + (sampler_indices_padded * + len(sampler_indices_padded)) long_lora_indices = None long_lora_indices_len: Optional[int] = None if long_lora_context: diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 2d51ed6e50bf2..d8f0f68f1c02e 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -87,6 +87,7 @@ def device_loading_context(module: torch.nn.Module, p.data = p.data.to(original_device) # New parameters or parameters already on target device are untouched + from vllm.utils import is_fake_hpu logger = init_logger(__name__) diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 45f2b95e704d6..ceb3934f29342 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -1,15 +1,16 @@ -from typing import Tuple +from typing import Optional, 
Tuple import torch -from .interface import Platform, PlatformEnum +from .interface import DeviceCapability, Platform, PlatformEnum class HpuPlatform(Platform): _enum = PlatformEnum.HPU @staticmethod - def get_device_capability(device_id: int = 0) -> Tuple[int, int]: + def get_device_capability( + device_id: int = 0) -> Optional[DeviceCapability]: raise RuntimeError("HPU does not have device capability.") @staticmethod diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index fea4358953745..3b00a9fd98da5 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -45,7 +45,6 @@ def is_tpu(self) -> bool: def is_hpu(self) -> bool: return self._enum == PlatformEnum.HPU - @staticmethod def is_cpu(self) -> bool: return self._enum == PlatformEnum.CPU diff --git a/vllm/utils.py b/vllm/utils.py index 80bbed7cac1a5..377c933b466a6 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -796,11 +796,6 @@ def format_bytes(size): return f'{size:.4g} {power_labels[n]+"B"}' -@lru_cache(maxsize=None) -def is_hpu() -> bool: - from importlib import util - return util.find_spec('habana_frameworks') is not None - def get_device() -> str: if is_hpu(): return "hpu" @@ -1425,6 +1420,7 @@ def dec(self, num=1): def value(self): return self._value + def migrate_to_cpu(): import importlib from unittest.mock import MagicMock diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index e8bf5dfb34628..d92b34c92ea29 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1096,7 +1096,7 @@ def prepare_input_tensors( if batch_size_padding > 0: dummy_seq_group_metadata = self.create_dummy_seq_group_metadata( 0, 0, is_prompt) - seq_group_metadata_list.extend(dummy_seq_group_metadata + seq_group_metadata_list.extend(seq_group_metadata_list[0] for _ in range(batch_size_padding)) prefill_reqs = [] diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index 9013e9d251cb6..89613c91ac543 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -4,7 +4,7 @@ from datetime import datetime from functools import wraps from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, - Optional, Type, TypeVar, Union, get_args, get_origin) + Optional, Type, TypeVar, Union, get_args, get_origin) import torch from torch import is_tensor From 6d4544343c0f6121750931ca1da79954c33f6524 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 20 Sep 2024 16:19:29 +0300 Subject: [PATCH 213/341] more format.sh --- vllm/engine/arg_utils.py | 6 +- vllm/executor/habana_executor.py | 2 +- vllm/executor/ray_habana_executor.py | 4 +- vllm/executor/ray_utils.py | 2 +- vllm/hpu/punica_hpu.py | 6 +- vllm/lora/layers.py | 6 +- vllm/lora/models.py | 2 +- vllm/model_executor/layers/fused_moe/layer.py | 58 ------------------- vllm/model_executor/layers/sampler.py | 2 +- vllm/model_executor/model_loader/loader.py | 4 +- vllm/model_executor/models/gpt_bigcode.py | 2 +- vllm/model_executor/models/mixtral.py | 2 +- vllm/model_executor/models/qwen2.py | 3 +- vllm/model_executor/sampling_metadata.py | 1 - vllm/platforms/hpu.py | 2 +- vllm/worker/habana_model_runner.py | 12 ++-- vllm/worker/habana_worker.py | 4 +- 17 files changed, 28 insertions(+), 90 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f8b544c6bde4d..2b1667023a1fa 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -16,9 +16,9 @@ from vllm.executor.executor_base import ExecutorBase from 
vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.platforms import current_platform from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import FlexibleArgumentParser -from vllm.platforms import current_platform if TYPE_CHECKING: from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup @@ -1023,8 +1023,8 @@ def create_engine_config(self) -> EngineConfig: self.model_loader_extra_config[ "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path - load_device = device_config.device if self.weights_load_device is None else \ - self.weights_load_device + load_device = device_config.device if self.weights_load_device is \ + None else self.weights_load_device load_config = self.create_load_config(load_device) prompt_adapter_config = PromptAdapterConfig( diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index 6b362e6f1e326..6e92da0245836 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -9,9 +9,9 @@ from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest -from vllm.model_executor.layers.sampler import SamplerOutput from vllm.utils import (HabanaMemoryProfiler, get_distributed_init_method, get_ip, get_open_port, make_async) from vllm.worker.worker_base import WorkerWrapperBase diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index f0822283296dd..66b6e76e92004 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -9,11 +9,11 @@ DistributedGPUExecutor, DistributedGPUExecutorAsync) from vllm.executor.ray_utils import RayWorkerWrapper, ray from vllm.logger import init_logger -from vllm.sequence import ExecuteModelRequest from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import ExecuteModelRequest from vllm.utils import (_run_task_with_lock, get_distributed_init_method, get_ip, get_open_port, get_vllm_instance_id, - make_async, is_fake_hpu) + is_fake_hpu, make_async) if ray is not None: from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 34b002514c27a..8971f5aac626e 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -10,7 +10,7 @@ from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.sequence import ExecuteModelRequest, IntermediateTensors -from vllm.utils import get_ip, is_hip, is_xpu, hpu_device_string +from vllm.utils import get_ip, hpu_device_string, is_hip, is_xpu from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) diff --git a/vllm/hpu/punica_hpu.py b/vllm/hpu/punica_hpu.py index 8f732a98c3e29..9b7261564e629 100644 --- a/vllm/hpu/punica_hpu.py +++ b/vllm/hpu/punica_hpu.py @@ -5,11 +5,13 @@ # LICENSE file in the root directory of this source tree. 
############################################################################### -from typing import TYPE_CHECKING, Callable, List, Optional, Tuple, Union +from typing import Optional, Tuple import torch +from vllm_hpu_extension.ops import (dispatch_bgmv_embedding, + dispatch_bgmv_linear) + from vllm.lora.punica import PunicaWrapper -from vllm_hpu_extension.ops import dispatch_bgmv_linear, dispatch_bgmv_embedding class GaudiPunicaWrapper(PunicaWrapper): diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 461f4d435d67d..cc55d4afc7d6f 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -17,6 +17,7 @@ tensor_model_parallel_all_reduce, tensor_model_parallel_gather) from vllm.distributed.utils import divide +from vllm.hpu.punica_hpu import GaudiPunicaWrapper from vllm.lora.punica import PunicaWrapper from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, @@ -30,10 +31,6 @@ VocabParallelEmbedding) from vllm.platforms import current_platform -if current_platform.is_hpu(): - from vllm_hpu_extension.ops import (dispatch_bgmv_embedding, - dispatch_bgmv_linear) - if TYPE_CHECKING: pass @@ -250,6 +247,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings.shape[1], -1) # Embedding layer only need expand op if current_platform.is_hpu(): + assert isinstance(self.punica_wrapper, GaudiPunicaWrapper) self.punica_wrapper.add_lora_embedding(full_output, full_lora_a_embeddings, self.lora_b_stacked, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 6d6fd05c55e93..4ec7a6815a755 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -26,8 +26,8 @@ parse_fine_tuned_lora_name, replace_submodule) from vllm.model_executor.models.interfaces import SupportsLoRA from vllm.model_executor.models.utils import PPMissingLayer -from vllm.utils import is_pin_memory_available, get_device from vllm.platforms import current_platform +from vllm.utils import is_pin_memory_available if current_platform.is_hpu(): from vllm.hpu.punica_hpu import GaudiPunicaWrapper diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 9e4e7233c1eba..179a8609a17f4 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -579,61 +579,3 @@ def _load_fp8_scale(self, param: torch.nn.Parameter, else: raise ValueError( f"Shard id must be in [0,1,2] but got {shard_id}") - - def forward(self, hidden_states: torch.Tensor, - router_logits: torch.Tensor): - assert self.quant_method is not None - - # Matrix multiply. 
- final_hidden_states = self.quant_method.apply( - self, - x=hidden_states, - router_logits=router_logits, - top_k=self.top_k, - renormalize=self.renormalize, - use_grouped_topk=self.use_grouped_topk, - num_expert_group=self.num_expert_group, - topk_group=self.topk_group) - - if self.reduce_results and self.tp_size > 1: - final_hidden_states = tensor_model_parallel_all_reduce( - final_hidden_states) - - return final_hidden_states - - @classmethod - def make_expert_params_mapping( - cls, ckpt_gate_proj_name: str, ckpt_down_proj_name: str, - ckpt_up_proj_name: str, - num_experts: int) -> List[Tuple[str, str, int, int]]: - - gate_up = [ckpt_gate_proj_name, ckpt_up_proj_name] - gate_down_up = [ - ckpt_gate_proj_name, ckpt_down_proj_name, ckpt_up_proj_name - ] - - return [ - # These are the weight scales for the experts - # (param_name, weight_name, expert_id, shard_id) - ("experts.w13_scale" - if weight_name in gate_up else "experts.w2_scale", - f"experts.{expert_id}.{weight_name}.weight_scale", expert_id, - shard_id) for expert_id in range(num_experts) - for shard_id, weight_name in enumerate(gate_down_up) - ] + [ - # These are the weights for the experts - # (param_name, weight_name, expert_id, shard_id) - ("experts.w13_weight" - if weight_name in gate_up else "experts.w2_weight", - f"experts.{expert_id}.{weight_name}.weight", expert_id, shard_id) - for expert_id in range(num_experts) - for shard_id, weight_name in enumerate(gate_down_up) - ] + [ - # These are the weight scales for the experts - # (param_name, weight_name, expert_id, shard_id) - ("experts.a13_scale" - if weight_name in gate_up else "experts.a2_scale", - f"experts.{expert_id}.{weight_name}.input_scale", expert_id, - shard_id) for expert_id in range(num_experts) - for shard_id, weight_name in enumerate(gate_down_up) - ] diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 9dda63f9768c6..6da6199a01962 100755 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -1,9 +1,9 @@ """A layer that samples the next tokens from the model's outputs.""" import itertools +import math import warnings from dataclasses import dataclass from importlib.util import find_spec -import math from math import inf from typing import Dict, List, Optional, Tuple, Union diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index d8f0f68f1c02e..b03e6aca48c0e 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -44,7 +44,7 @@ supports_multimodal) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils import is_pin_memory_available +from vllm.utils import is_fake_hpu, is_pin_memory_available @contextmanager @@ -88,8 +88,6 @@ def device_loading_context(module: torch.nn.Module, # New parameters or parameters already on target device are untouched -from vllm.utils import is_fake_hpu - logger = init_logger(__name__) diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index f01cace91d2ab..a8567f32958be 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -39,8 +39,8 @@ ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors from vllm.platforms import 
current_platform +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index a5ef454ee80e0..7a075162d579f 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -45,8 +45,8 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors from vllm.platforms import current_platform +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA from .utils import is_pp_missing_parameter, make_layers diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 649caba5d9424..9801b218ddb83 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -46,8 +46,9 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors from vllm.platform import current_platform +from vllm.sequence import IntermediateTensors + from .interfaces import SupportsLoRA from .utils import is_pp_missing_parameter, make_layers diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 829e3fd6d8eb5..97d36d31f2b11 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -9,7 +9,6 @@ SequenceGroupMetadata) from vllm.utils import (PyObjectCache, async_tensor_h2d, is_pin_memory_available, make_tensor_with_pad) -from vllm.platforms import current_platform _SAMPLING_EPS = 1e-5 diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index ceb3934f29342..feddce69ac5b4 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -1,4 +1,4 @@ -from typing import Optional, Tuple +from typing import Optional import torch diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index d92b34c92ea29..04674e505b01f 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -2,7 +2,6 @@ # Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company ############################################################################### -from array import array import collections import contextlib import dataclasses @@ -13,6 +12,7 @@ import operator import os import time +from array import array from enum import IntEnum from typing import (TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, Optional, Set, Tuple, Type, TypeVar, Union) @@ -24,23 +24,21 @@ from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ObservabilityConfig, - ParallelConfig, PromptAdapterConfig, SchedulerConfig) + ModelConfig, ObservabilityConfig, ParallelConfig, + PromptAdapterConfig, SchedulerConfig) from vllm.distributed.parallel_state import get_world_group -from vllm.inputs.registry import InputRegistry from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model -from vllm.multimodal.registry import MultiModalRegistry from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, MultiModalInputs) from vllm.sampling_params import SamplingParams from vllm.sequence import (IntermediateTensors, SequenceData, SequenceGroupMetadata) -from vllm.model_executor.layers.sampler import SamplerOutput from vllm.utils import (HabanaMemoryProfiler, format_bytes, is_fake_hpu, is_pin_memory_available, make_tensor_with_pad) from vllm.worker.model_runner_base import ( @@ -1096,7 +1094,7 @@ def prepare_input_tensors( if batch_size_padding > 0: dummy_seq_group_metadata = self.create_dummy_seq_group_metadata( 0, 0, is_prompt) - seq_group_metadata_list.extend(seq_group_metadata_list[0] + seq_group_metadata_list.extend(dummy_seq_group_metadata for _ in range(batch_size_padding)) prefill_reqs = [] diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 89c796068bac4..f2678c5e405dc 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -11,8 +11,8 @@ import torch.distributed from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ObservabilityConfig, - ParallelConfig, PromptAdapterConfig, SchedulerConfig, + ModelConfig, ObservabilityConfig, ParallelConfig, + PromptAdapterConfig, SchedulerConfig, SpeculativeConfig) from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) From 3a0ff3b8dfeebc0b891a3e87ffd9fdf72268782e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 20 Sep 2024 16:23:03 +0300 Subject: [PATCH 214/341] gha update --- .github/workflows/mypy.yaml | 30 +++++++++++------------------- .github/workflows/yapf.yml | 1 + 2 files changed, 12 insertions(+), 19 deletions(-) diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 42c141237fb15..6ebe512c5dbf6 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -32,23 +32,15 @@ jobs: pip install types-setuptools - name: Mypy run: | - mypy tests --config-file pyproject.toml - mypy vllm/*.py --config-file pyproject.toml - mypy vllm/attention --config-file pyproject.toml - mypy vllm/core --config-file pyproject.toml - mypy vllm/distributed --config-file pyproject.toml - mypy vllm/engine --config-file pyproject.toml - mypy 
vllm/entrypoints --config-file pyproject.toml - mypy vllm/executor --config-file pyproject.toml - mypy vllm/inputs --config-file pyproject.toml - mypy vllm/logging --config-file pyproject.toml - mypy vllm/lora --config-file pyproject.toml - mypy vllm/model_executor --config-file pyproject.toml - mypy vllm/multimodal --config-file pyproject.toml - mypy vllm/platforms --config-file pyproject.toml - mypy vllm/spec_decode --config-file pyproject.toml - mypy vllm/transformers_utils --config-file pyproject.toml - mypy vllm/usage --config-file pyproject.toml - mypy vllm/worker --config-file pyproject.toml - + mypy + mypy tests --follow-imports skip + mypy vllm/attention --follow-imports skip + mypy vllm/distributed --follow-imports skip + mypy vllm/engine --follow-imports skip + mypy vllm/executor --follow-imports skip + mypy vllm/lora --follow-imports skip + mypy vllm/model_executor --follow-imports skip + mypy vllm/prompt_adapter --follow-imports skip + mypy vllm/spec_decode --follow-imports skip + mypy vllm/worker --follow-imports skip diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml index 448c52a3b49dc..b1002578610d4 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -9,6 +9,7 @@ on: pull_request: branches: - habana_main + jobs: yapf: runs-on: ubuntu-latest From 6502b9150d695eda96deec497b54a6751965ac7a Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 20 Sep 2024 16:47:35 +0300 Subject: [PATCH 215/341] Separate LoRA algorithms --- requirements-hpu.txt | 2 +- vllm/hpu/punica_hpu.py | 81 ------------------------------------------ vllm/lora/layers.py | 4 ++- vllm/lora/models.py | 2 +- 4 files changed, 5 insertions(+), 84 deletions(-) delete mode 100644 vllm/hpu/punica_hpu.py diff --git a/requirements-hpu.txt b/requirements-hpu.txt index d451200aa1144..56caa4ba03862 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -6,4 +6,4 @@ ray == 2.32.0 triton pandas tabulate -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@30ee2d1 +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@bdd4f2b diff --git a/vllm/hpu/punica_hpu.py b/vllm/hpu/punica_hpu.py deleted file mode 100644 index 9b7261564e629..0000000000000 --- a/vllm/hpu/punica_hpu.py +++ /dev/null @@ -1,81 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. 
-############################################################################### - -from typing import Optional, Tuple - -import torch -from vllm_hpu_extension.ops import (dispatch_bgmv_embedding, - dispatch_bgmv_linear) - -from vllm.lora.punica import PunicaWrapper - - -class GaudiPunicaWrapper(PunicaWrapper): - - def __init__(self, max_num_batched_tokens: int, max_batches: int, - device: str): - super().__init__(max_num_batched_tokens, max_batches, device) - - def add_lora(self, - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - scale: float, - y_offset: Optional[int] = None, - y_slice_size: Optional[int] = None, - *, - buffer: Optional[torch.Tensor] = None) -> None: - y_org = y - x = x.view(-1, x.shape[-1]) - y = y.view(-1, y.shape[-1]) - dispatch_bgmv_linear(y, x, wa_t_all, wb_t_all, 0, 1.0) - y = y.view_as(y_org) - - def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, - torch.Tensor, - torch.Tensor], - lora_b_stacked: Tuple[torch.Tensor, - torch.Tensor, - torch.Tensor], - scale: float, - output_slices: Tuple[int, ...]) -> None: - y_org = y - x = x.view(-1, x.shape[-1]) - y = y.view(-1, y.shape[-1]) - offset_left = 0 - - for slice_idx in range(len(output_slices)): - dispatch_bgmv_linear( - y[:, offset_left:offset_left + output_slices[slice_idx]], x, - lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], 0, 1.0) - offset_left += output_slices[slice_idx] - y = y.view_as(y_org) - - def add_lora_logits(self, - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - scale, - *, - buffer: Optional[torch.Tensor] = None) -> None: - y_org = y - y = y.view(-1, y.shape[-1]) - x = x.view(-1, x.shape[-1]) - dispatch_bgmv_linear(y, x, wa_t_all, wb_t_all, 0, 1.0) - y = y.view_as(y_org) - - def add_lora_embedding( - self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - add_input: bool = True, - ): - dispatch_bgmv_embedding(y, x, w_t_all, 0, 1.0) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index cc55d4afc7d6f..b3758ad883d56 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -17,7 +17,6 @@ tensor_model_parallel_all_reduce, tensor_model_parallel_gather) from vllm.distributed.utils import divide -from vllm.hpu.punica_hpu import GaudiPunicaWrapper from vllm.lora.punica import PunicaWrapper from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, @@ -31,6 +30,9 @@ VocabParallelEmbedding) from vllm.platforms import current_platform +if current_platform.is_hpu(): + from vllm_hpu_extension.punica_hpu import GaudiPunicaWrapper + if TYPE_CHECKING: pass diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 4ec7a6815a755..546a4c402aedc 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -30,7 +30,7 @@ from vllm.utils import is_pin_memory_available if current_platform.is_hpu(): - from vllm.hpu.punica_hpu import GaudiPunicaWrapper + from vllm_hpu_extension.punica_hpu import GaudiPunicaWrapper logger = init_logger(__name__) From 7057da5f76e465e4735490787b543cf3d2b5ad3b Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 20 Sep 2024 16:54:18 +0300 Subject: [PATCH 216/341] yapf is being a headache --- tests/lora/test_lora_hpu.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/lora/test_lora_hpu.py b/tests/lora/test_lora_hpu.py index 1e0e728ae7240..06b687282391b 100644 --- a/tests/lora/test_lora_hpu.py +++ b/tests/lora/test_lora_hpu.py @@ -254,10 +254,9 @@ def 
test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: mask = createLoraMask(indices, k, 1, 8, rank, dtype) LoraMask.setLoraMask(mask) - punica_wrapper.add_lora_packed_nslice(output, input, - lora_a_stacks, - lora_b_stacks, - 1.0, (qkv[0], qkv[1], qkv[2])) + punica_wrapper.add_lora_packed_nslice(output, input, lora_a_stacks, + lora_b_stacks, 1.0, + (qkv[0], qkv[1], qkv[2])) assert torch.allclose(torch.zeros_like(output), output) manager.reset_lora() \ No newline at end of file From 43df76205de572c7c1cd86321ceb72f1f9759633 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 20 Sep 2024 17:00:33 +0300 Subject: [PATCH 217/341] oh come on now --- tests/lora/test_lora_hpu.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tests/lora/test_lora_hpu.py b/tests/lora/test_lora_hpu.py index 06b687282391b..a59cfe875ef9c 100644 --- a/tests/lora/test_lora_hpu.py +++ b/tests/lora/test_lora_hpu.py @@ -241,10 +241,9 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: LoraMask.setLoraMask(mask) punica_wrapper = GaudiPunicaWrapper(4096, max_batches=256, device="hpu") - punica_wrapper.add_lora_packed_nslice(output, input, - lora_a_stacks, - lora_b_stacks, - 1.0, (qkv[0], qkv[1], qkv[2])) + qkvs = (qkv[0], qkv[1], qkv[2]) + punica_wrapper.add_lora_packed_nslice(output, input, lora_a_stacks, + lora_b_stacks, 1.0, qkvs) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) @@ -253,10 +252,9 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: indices = torch.full((len(input), ), -1, device="hpu") mask = createLoraMask(indices, k, 1, 8, rank, dtype) LoraMask.setLoraMask(mask) - + qkvs = (qkv[0], qkv[1], qkv[2]) punica_wrapper.add_lora_packed_nslice(output, input, lora_a_stacks, - lora_b_stacks, 1.0, - (qkv[0], qkv[1], qkv[2])) + lora_b_stacks, 1.0, qkvs) assert torch.allclose(torch.zeros_like(output), output) - manager.reset_lora() \ No newline at end of file + manager.reset_lora() From 3134b8a0f40753534c211da2bbc4cca0413904fe Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 20 Sep 2024 17:18:32 +0300 Subject: [PATCH 218/341] fix fakehpu mode --- vllm/platforms/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index b717e256e43f5..e3b7dd3bb216e 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -44,8 +44,11 @@ is_hpu = False try: + import os from importlib import util - is_hpu = util.find_spec('habana_frameworks') is not None + is_hpu = util.find_spec('habana_frameworks') is not None or os.environ.get( + 'VLLM_USE_FAKE_HPU', '0') != '0' + except Exception: pass From f92ffc15d2268149beb90bbb1e19f539d77928ad Mon Sep 17 00:00:00 2001 From: Michal Adamczyk Date: Mon, 23 Sep 2024 09:27:18 +0200 Subject: [PATCH 219/341] Fix calculating slots for warmup (#310) Recent changes broke slot sparsity for warmup slots. This commit restores the functionality. 
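A minimal sketch of the slot-selection rule this fix restores is shown below. `_PAD_BLOCK_ID` and `dummy_slots` follow the hunk that follows; the block size, the cycling generator, and the standalone helper are illustrative assumptions, not the model runner's actual code.

```python
import itertools

_PAD_BLOCK_ID = 0   # assumption: block 0 is reserved for padding
BLOCK_SIZE = 128    # assumption: illustrative block size

# Spread padded writes across the slots of the padding block so that
# warmup padding stays sparse instead of hammering a single slot.
dummy_slots = itertools.cycle(
    range(_PAD_BLOCK_ID * BLOCK_SIZE, (_PAD_BLOCK_ID + 1) * BLOCK_SIZE))


def slot_for(position: int, block_table: list) -> int:
    """Resolve the KV-cache slot for a single decode position."""
    if len(block_table) == 0:
        block_number = _PAD_BLOCK_ID
    else:
        block_number = block_table[position // BLOCK_SIZE]
    if block_number == _PAD_BLOCK_ID:
        # Padding block: hand out the next sparse dummy slot.
        return next(dummy_slots)
    return block_number * BLOCK_SIZE + position % BLOCK_SIZE
```

The point of the change is that a dummy slot is used whenever the resolved block is the padding block, not only when the block table is empty.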
--- vllm/worker/habana_model_runner.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 0d5df1f312ec9..1d8566e0edff4 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -971,10 +971,11 @@ def _prepare_decode( block_table = seq_group_metadata.block_tables[seq_id] if len(block_table) == 0: block_number = _PAD_BLOCK_ID - block_table = [] - slot = next(dummy_slots) else: block_number = block_table[position // self.block_size] + if block_number == _PAD_BLOCK_ID: + slot = next(dummy_slots) + else: block_offset = position % self.block_size slot = block_number * self.block_size + block_offset slot_mapping.append([slot]) From 63fae5106748b9ec86bec436ce275d878c5dfe02 Mon Sep 17 00:00:00 2001 From: Tomasz Zielinski <85164140+tzielinski-habana@users.noreply.github.com> Date: Mon, 23 Sep 2024 09:27:33 +0200 Subject: [PATCH 220/341] Removed padding block from a list of available blocks in allocators (#313) Block 0 is used for padding. This PR removes the padding block from a list of available blocks in block allocators v1 and v2 --- vllm/core/block/cpu_gpu_block_allocator.py | 8 ++++++-- vllm/core/block_manager_v1.py | 6 ++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 5287cd9c1bfb3..6ade639fc3ade 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -4,7 +4,7 @@ DeviceAwareBlockAllocator) from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator -from vllm.utils import Device +from vllm.utils import Device, is_hpu class CpuGpuBlockAllocator(DeviceAwareBlockAllocator): @@ -52,7 +52,11 @@ def create( - The block IDs are assigned contiguously, with GPU block IDs coming before CPU block IDs. """ - block_ids = list(range(num_gpu_blocks + num_cpu_blocks)) + # For HPU, block id 0 is used only for padding + reserved_blocks = 1 if is_hpu() else 0 + block_ids = list( + range(reserved_blocks, num_gpu_blocks + num_cpu_blocks)) + num_gpu_blocks -= reserved_blocks gpu_block_ids = block_ids[:num_gpu_blocks] cpu_block_ids = block_ids[num_gpu_blocks:] diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index e29eba375f4dd..2a3cbe2e642cb 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -13,7 +13,7 @@ from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.logger import init_logger from vllm.sequence import Sequence, SequenceGroup, SequenceStatus -from vllm.utils import Device +from vllm.utils import Device, is_hpu logger = init_logger(__name__) @@ -171,7 +171,9 @@ def __init__( # Initialize the free blocks. self.free_blocks: BlockTable = [] - for i in range(num_blocks): + # For HPU, block id 0 is used only for padding + reserved_blocks = 1 if is_hpu() else 0 + for i in range(reserved_blocks, num_blocks): block = PhysicalTokenBlock(device=device, block_number=i, block_size=block_size, From aa507d41ffc1900ba3420b6574b14ebc8d63c031 Mon Sep 17 00:00:00 2001 From: Michal Adamczyk Date: Mon, 23 Sep 2024 12:48:02 +0200 Subject: [PATCH 221/341] Fix seq_len for padding sequences (#318) Before the fix we used seq_len=0 for padding samples. 
This was later translated to an empty attention_mask (since we don't have any tokens that we should include in calculations) and in turn caused NaNs in prompt attention (0 divided by 0). Those NaNs later got propagated to kv-cache causing issues in flat_pa. --- vllm/worker/habana_model_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 1d8566e0edff4..f7a3c8569e229 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1268,6 +1268,7 @@ def create_dummy_seq_group_metadata(self, lora_request=None): sampling_params = SamplingParams(temperature=0) num_blocks = math.ceil(seq_len / self.block_size) + seq_len = max(seq_len, 1) if is_prompt: input_len = seq_len output_len = 0 From a844837032a30ee6f0b76ace9d1937975d20cf6c Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Mon, 23 Sep 2024 11:24:46 +0300 Subject: [PATCH 222/341] Fix lora specific conditions in profile-run --- vllm/worker/habana_model_runner.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 1d8566e0edff4..7efe71ea6ba19 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1294,7 +1294,8 @@ def profile_run(self) -> None: max_seq_len = min(self.prompt_seq_bucket_cfg[-1], self.max_num_batched_tokens // max_batch_size) - self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches) + self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches, + False, True) return def warmup_scenario(self, @@ -1302,7 +1303,8 @@ def warmup_scenario(self, seq_len, is_prompt, kv_caches, - is_profile_run=False) -> None: + is_pt_profiler_run=False, + is_lora_profile_run=False) -> None: use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) scenario_name = ("warmup_" f"{'prompt' if is_prompt else 'decode'}_" @@ -1316,7 +1318,7 @@ def warmup_scenario(self, # passed in, which contains a lora from the lora warmup path. dummy_lora_requests: List[LoRARequest] = [] dummy_lora_requests_per_seq: List[LoRARequest] = [] - if self.lora_config and is_profile_run: + if self.lora_config and is_lora_profile_run: assert self.lora_manager is not None with self.lora_manager.dummy_lora_cache(): for idx in range(self.lora_config.max_loras): @@ -1334,8 +1336,8 @@ def warmup_scenario(self, for idx in range(max_num_seqs) ] self.profiler.start('internal', scenario_name) - times = 3 if use_graphs or is_profile_run else 1 - if self.lora_config and not is_profile_run: + times = 3 if use_graphs or is_pt_profiler_run else 1 + if self.lora_config and not is_lora_profile_run: lora_mapping = LoRAMapping( [0] * batch_size * seq_len, [0] * batch_size * seq_len, @@ -1366,7 +1368,7 @@ def warmup_scenario(self, ] torch.hpu.synchronize() profiler = None - if is_profile_run and self.is_driver_worker: + if is_pt_profiler_run and self.is_driver_worker: profiler = setup_profiler() profiler.start() for _ in range(times): From 9bb65b727b37030ed0d2ae489a7d46b8ab72d217 Mon Sep 17 00:00:00 2001 From: Michal Adamczyk Date: Mon, 23 Sep 2024 15:04:17 +0200 Subject: [PATCH 223/341] Run with HPU graphs even when warmup was skipped (#320) Before that PR we relied on stored information which configuration should have HPU graphs enabled. Unfortunately that set was computed during warmup. If we skipped warmup we didn't had that information. This PR allows to run all buckets with HPU graphs enabled when warmup is skipped. 
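For reference, a standalone sketch of the resulting graph-selection logic: the `VLLM_SKIP_WARMUP` flag and the `graphed_buckets` set mirror the hunks that follow, while the free function itself is only an illustration of the decision, not the runner's actual method.

```python
import os
from typing import Set, Tuple


def use_graphs(batch_size: int, seq_len: int, is_prompt: bool,
               graphed_buckets: Set[Tuple[int, int, bool]],
               enforce_eager: bool) -> bool:
    """Decide whether a (batch_size, seq_len, is_prompt) bucket uses HPU graphs."""
    if enforce_eager:
        return False
    skip_warmup = os.environ.get('VLLM_SKIP_WARMUP', 'false').lower() == 'true'
    if skip_warmup:
        # graphed_buckets is only populated during warmup, so when warmup is
        # skipped every bucket falls back to running with HPU graphs enabled.
        return True
    return (batch_size, seq_len, is_prompt) in graphed_buckets
```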
--- vllm/worker/habana_model_runner.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 57cab468bef3a..f7f9990c1370f 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -574,6 +574,9 @@ def _set_gc_threshold(self) -> None: self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \ .create_input_mapper(self.model_config) + self.skip_warmup = os.environ.get('VLLM_SKIP_WARMUP', + 'false').lower() == 'true' + def load_model(self) -> None: import habana_frameworks.torch.core as htcore if self.model_config.quantization == 'inc': @@ -647,6 +650,8 @@ def load_model(self) -> None: def _use_graphs(self, batch_size, seq_len, is_prompt): if self.enforce_eager: return False + if self.skip_warmup: + return True return (batch_size, seq_len, is_prompt) in self.graphed_buckets def _is_valid_bucket(self, bucket): @@ -1501,7 +1506,7 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: self.warmup_scenario(int(bs), int(seq_len), is_prompt, kv_caches, True) raise AssertionError("Finished profiling") - if os.environ.get('VLLM_SKIP_WARMUP', 'false').lower() == 'true': + if self.skip_warmup: logger.info("Skipping warmup...") return self.profiler.start('internal', 'warmup') From 2a499c7bb9bd458f01597d2de0a4e512fba6b8ab Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 23 Sep 2024 16:41:22 +0300 Subject: [PATCH 224/341] mixtral api fixes --- vllm/model_executor/layers/fused_moe/layer.py | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 179a8609a17f4..421fbe7187dfa 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -120,24 +120,23 @@ def forward_cuda( inplace=True) def forward_hpu( - self, - x: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool, - num_expert_group: Optional[int], - topk_group: Optional[int], - layer: Optional[torch.nn.Module], + self, + layer: torch.nn.Module, + x: torch.Tensor, + use_grouped_topk: bool, + top_k: int, + router_logits: torch.Tensor, + renormalize: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None ): assert not use_grouped_topk, 'use_grouped_topk must be False on HPU' assert num_expert_group is None, ('num_expert_group is ' 'not supported on HPU') assert topk_group is None, 'topk_group is not supported on HPU' if layer is not None: - return layer.hpu_static_fused_moe(x, w1, w2, router_logits, top_k) + return layer.hpu_static_fused_moe(x, layer.w13_weight, layer.w2_weight, router_logits, top_k) def forward_cpu(self, *args, **kwargs): raise NotImplementedError( From 93727344e3e8218575cbe72c7b035c3585382539 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 23 Sep 2024 16:44:04 +0300 Subject: [PATCH 225/341] revert debug prints --- vllm/worker/habana_worker.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 8ba47373fb47b..f2678c5e405dc 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -345,7 +345,6 @@ def init_worker_distributed_environment( distributed_init_method: Optional[str] = None, local_rank: int = -1, ) -> None: - print("Initializing 
TP...") """Initialize the distributed environment.""" backend = hpu_backend_string() init_distributed_environment(parallel_config.world_size, @@ -353,15 +352,12 @@ def init_worker_distributed_environment( distributed_init_method, local_rank, backend=backend) - print(f"init_distributed_environment with backend {backend} and distributed_init_method {distributed_init_method} done!") ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) - print("ensure_model_parallel_initialized done!") if torch.distributed.is_initialized(): torch_world_size = torch.distributed.get_world_size() - print(f"torch.distributed is already initialized, torch_world_size: {torch_world_size}") if torch_world_size != parallel_config.world_size: raise RuntimeError( "torch.distributed is already initialized but the torch world " @@ -372,7 +368,6 @@ def init_worker_distributed_environment( "distributed_init_method must be set if torch.distributed " "is not already initialized") else: - print(f"torch.distributed is not initialized, initializing world_size: {parallel_config.world_size}") backend = hpu_backend_string() torch.distributed.init_process_group( backend=backend, @@ -380,24 +375,14 @@ def init_worker_distributed_environment( rank=rank, init_method=distributed_init_method, ) - print(f"torch.distributed initialized!") # A small all_reduce for warmup & checking conformance. device = hpu_device_string() dummy_tensor_hpu = torch.ones(1).to(device) - torch.hpu.synchronize() - print(f"testing allreduce...") - htorch.core.mark_step() - print(f"testing allreduce...") torch.distributed.all_reduce(dummy_tensor_hpu) - htorch.core.mark_step() - torch.hpu.synchronize() - print(f"allreduce done, checking result...") assert dummy_tensor_hpu.item() == parallel_config.world_size - print(f"allreduce works fine!!") ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) - print("TP initialized successfully!!") def raise_if_cache_size_invalid(num_gpu_blocks, block_size, From c15ddd22fc965e85738faf5e0d377a9b56770898 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 23 Sep 2024 16:48:03 +0300 Subject: [PATCH 226/341] format.sh --- vllm/executor/ray_habana_executor.py | 31 +++++++------------ vllm/model_executor/layers/fused_moe/layer.py | 26 ++++++++-------- 2 files changed, 25 insertions(+), 32 deletions(-) diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index 887cd0f1029ce..645bceb1af446 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -2,7 +2,8 @@ import os from collections import defaultdict from itertools import islice, repeat -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Type +from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, + Type) import msgspec @@ -15,8 +16,8 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest from vllm.utils import (_run_task_with_lock, get_distributed_init_method, - get_ip, get_open_port, get_vllm_instance_id, is_fake_hpu, - make_async) + get_ip, get_open_port, get_vllm_instance_id, + is_fake_hpu, make_async) from vllm.worker.worker_base import WorkerBase if ray is not None: @@ -78,12 +79,16 @@ def shutdown(self) -> None: self.forward_dag = None def _get_worker_module_and_class( - self) -> Tuple[str, str, Optional[Callable[[], Type[WorkerBase]]]]: # noqa: F821 + self + ) -> Tuple[str, str, 
Optional[Callable[[], + Type[WorkerBase]]]]: # noqa: F821 worker_class_fn = None if self.scheduler_config.is_multi_step: - raise NotImplementedError("Multi-step execution is not implemented for HPU") + raise NotImplementedError( + "Multi-step execution is not implemented for HPU") elif self.speculative_config: - raise NotImplementedError("Speculative decoding is not implemented for HPU") + raise NotImplementedError( + "Speculative decoding is not implemented for HPU") else: worker_module_name = "vllm.worker.habana_worker" worker_class_name = "HabanaWorker" @@ -100,10 +105,6 @@ def _get_worker_wrapper_args(self) -> Dict[str, Any]: trust_remote_code=self.model_config.trust_remote_code, ) - # child class could overwrite this to return actual env vars. - def _get_env_vars_to_be_updated(self): - return self._env_vars_for_all_workers - def _init_workers_ray(self, placement_group: "PlacementGroup", **ray_remote_kwargs): # Otherwise, the ray workers are allocated with a full GPU. @@ -120,10 +121,6 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # the TP group of workers for a PP rank. self.pp_tp_workers: List[List[RayWorkerWrapper]] = [] - if self.parallel_config.ray_workers_use_nsight: - ray_remote_kwargs = self._configure_ray_workers_use_nsight( - ray_remote_kwargs) - logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker) # Create the workers. @@ -443,8 +440,6 @@ def _compiled_ray_dag(self, enable_asyncio: bool): from ray.dag import InputNode, MultiOutputNode from ray.experimental.channel.torch_tensor_type import TorchTensorType - logger.info("VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL = %s", - envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL) with InputNode() as input_data: # Example DAG: PP=2, TP=4 # (ExecuteModelReq, None) -> 0 -> (ExecuteModelReq, IntermediateOutput) -> 4 -> SamplerOutput # noqa: E501 @@ -469,9 +464,7 @@ def _compiled_ray_dag(self, enable_asyncio: bool): # Specify how intermediate tensors should be passed # between pp stages, no need to specify for the last # pp stage. 
- transport = "nccl" \ - if envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL \ - else "auto" + transport = "auto" outputs = [ output.with_type_hint( TorchTensorType(transport=transport)) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 421fbe7187dfa..da374de26a991 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -119,24 +119,24 @@ def forward_cuda( topk_ids=topk_ids, inplace=True) - def forward_hpu( - self, - layer: torch.nn.Module, - x: torch.Tensor, - use_grouped_topk: bool, - top_k: int, - router_logits: torch.Tensor, - renormalize: bool, - topk_group: Optional[int] = None, - num_expert_group: Optional[int] = None, - custom_routing_function: Optional[Callable] = None - ): + def forward_hpu(self, + layer: torch.nn.Module, + x: torch.Tensor, + use_grouped_topk: bool, + top_k: int, + router_logits: torch.Tensor, + renormalize: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None): assert not use_grouped_topk, 'use_grouped_topk must be False on HPU' assert num_expert_group is None, ('num_expert_group is ' 'not supported on HPU') assert topk_group is None, 'topk_group is not supported on HPU' if layer is not None: - return layer.hpu_static_fused_moe(x, layer.w13_weight, layer.w2_weight, router_logits, top_k) + return layer.hpu_static_fused_moe(x, layer.w13_weight, + layer.w2_weight, router_logits, + top_k) def forward_cpu(self, *args, **kwargs): raise NotImplementedError( From 3bb593a137c4c2c05890451d735629925c6909ac Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 23 Sep 2024 17:02:09 +0300 Subject: [PATCH 227/341] use ray for hpu distributed inference --- vllm/config.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/config.py b/vllm/config.py index f019736d1dc45..b8ec23e030ac9 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -855,6 +855,13 @@ def __init__( raise ValueError( "TPU backend only supports Ray for distributed inference.") + if current_platform.is_hpu() and self.world_size > 1: + if self.distributed_executor_backend is None: + self.distributed_executor_backend = "ray" + if self.distributed_executor_backend != "ray": + raise ValueError( + "HPU backend only supports Ray for distributed inference.") + if self.distributed_executor_backend is None and self.world_size > 1: # We use multiprocessing by default if world_size fits on the # current node and we aren't in a ray placement group. 
From 2f23cb7ea0850509ade5148041b792d44a38e8b2 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 23 Sep 2024 17:36:42 +0300 Subject: [PATCH 228/341] prune the easy parts --- .github/workflows/clang-format.yml | 6 +- .github/workflows/mypy.yaml | 6 +- .github/workflows/ruff.yml | 6 +- .github/workflows/yapf.yml | 7 +- README.md | 3 +- README_GAUDI.md | 515 ----------------------------- tests/conftest.py | 8 - tests/lora/conftest.py | 8 +- tests/lora/utils.py | 11 +- tests/samplers/test_sampler.py | 62 +--- 10 files changed, 20 insertions(+), 612 deletions(-) delete mode 100644 README_GAUDI.md diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index 77fb678c8acc8..d5f37396e69d7 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -2,13 +2,13 @@ name: clang-format on: # Trigger the workflow on push or pull request, - # but only for the habana_main branch + # but only for the main branch push: branches: - - habana_main + - main pull_request: branches: - - habana_main + - main jobs: clang-format: diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 6ebe512c5dbf6..ea767f4c3e264 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -2,13 +2,13 @@ name: mypy on: # Trigger the workflow on push or pull request, - # but only for the habana_main branch + # but only for the main branch push: branches: - - habana_main + - main pull_request: branches: - - habana_main + - main jobs: ruff: diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index c9ef4a36745b0..90735d6e2bbf9 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -2,13 +2,13 @@ name: ruff on: # Trigger the workflow on push or pull request, - # but only for the habana_main branch + # but only for the main branch push: branches: - - habana_main + - main pull_request: branches: - - habana_main + - main jobs: ruff: diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml index b1002578610d4..c89f82dfaaaf6 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -2,14 +2,13 @@ name: yapf on: # Trigger the workflow on push or pull request, - # but only for the habana_main branch + # but only for the main branch push: branches: - - habana_main + - main pull_request: branches: - - habana_main - + - main jobs: yapf: runs-on: ubuntu-latest diff --git a/README.md b/README.md index adaa3dc26783f..53749cb36b972 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Easy, fast, and cheap LLM serving for everyone

-| Intel® Gaudi® README | Documentation | Blog | Paper | Discord | Twitter/X | +| Documentation | Blog | Paper | Discord | Twitter/X |

@@ -30,7 +30,6 @@ Register [here](https://lu.ma/db5ld9n5) and be a part of the event! - [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing). - [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html). - [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing). -- [2024/05] vLLM-fork specific: Added Intel® Gaudi® 2 support with SynapseAI 1.16.0. For more information, please refer to Intel® Gaudi® README. - [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing). - [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing). - [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing). diff --git a/README_GAUDI.md b/README_GAUDI.md deleted file mode 100644 index 0ef30d5f96e64..0000000000000 --- a/README_GAUDI.md +++ /dev/null @@ -1,515 +0,0 @@ -vLLM with Intel® Gaudi® AI Accelerators -======================================= - -This README provides instructions on running vLLM with Intel Gaudi -devices. - -Requirements and Installation -============================= - -Please follow the instructions provided in the [Gaudi Installation -Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) -to set up the environment. To achieve the best performance, please -follow the methods outlined in the [Optimizing Training Platform -Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). - -Requirements ------------- - -- OS: Ubuntu 22.04 LTS -- Python: 3.10 -- Intel Gaudi accelerator -- Intel Gaudi software version 1.17.0 - -To verify that the Intel Gaudi software was correctly installed, run: - -``` {.console} -$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible -$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core and habanalabs-thunk are installed -$ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed -$ pip list | grep neural # verify that neural-compressor is installed -``` - -Refer to [Intel Gaudi Software Stack -Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) -for more details. - -Run Docker Image ----------------- - -It is highly recommended to use the latest Docker image from Intel Gaudi -vault. Refer to the [Intel Gaudi -documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) -for more details. 
- -Use the following commands to run a Docker image: - -``` {.console} -$ docker pull vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest -$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest -``` - -Build and Install vLLM ----------------------- - -Currently, the latest features and performance optimizations are -developed in Gaudi\'s [vLLM-fork](https://github.com/HabanaAI/vllm-fork) -and we periodically upstream them to vLLM main repo. To install latest -[HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the -following: - -``` {.console} -$ git clone https://github.com/HabanaAI/vllm-fork.git -$ cd vllm-fork -$ git checkout habana_main -$ pip install -e . -``` - -Supported Features -================== - -- [Offline batched - inference](https://github.com/HabanaAI/vllm-fork/blob/habana_main/docs/source/getting_started/quickstart.rst#offline-batched-inference) -- Online inference via [OpenAI-Compatible - Server](https://github.com/HabanaAI/vllm-fork/blob/habana_main/docs/source/getting_started/quickstart.rst#openai-compatible-server) -- HPU autodetection - no need to manually select device within vLLM -- Paged KV cache with algorithms enabled for Intel Gaudi accelerators -- Custom Intel Gaudi implementations of Paged Attention, KV cache ops, - prefill attention, Root Mean Square Layer Normalization, Rotary - Positional Encoding -- Tensor parallelism support for multi-card inference -- Inference with [HPU - Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) - for accelerating low-batch latency and throughput -- INC quantization - -Unsupported Features -==================== - -- Beam search -- LoRA adapters -- Attention with Linear Biases (ALiBi) -- AWQ quantization -- Prefill chunking (mixed-batch inferencing) - -Supported Configurations -======================== - -The following configurations have been validated to be function with -Gaudi2 devices. Configurations that are not listed may or may not work. 
- -- [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b) - with tensor parallelism on 8x HPU, BF16 datatype with random or - greedy sampling -- [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) - with tensor parallelism on 8x HPU, BF16 datatype with random or - greedy sampling -- [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) - with tensor parallelism on 8x HPU, BF16 datatype with random or - greedy sampling -- [meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) - with tensor parallelism on 8x HPU, BF16 datatype with random or - greedy sampling -- [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) - with tensor parallelism on 8x HPU, BF16 datatype with random or - greedy sampling -- [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) - with tensor parallelism on 8x HPU, BF16 datatype with random or - greedy sampling -- [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) - on single HPU or with tensor parallelism on 2x HPU, BF16 datatype - with random or greedy sampling -- [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) - with tensor parallelism on 2x HPU, BF16 datatype with random or - greedy sampling - -Performance Tuning -================ - -Execution modes ------------------------------ - -Currently in vLLM for HPU we support four execution modes, depending on -selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment -variable), and `--enforce-eager` flag. - -| `PT_HPU_LAZY_MODE` | `enforce_eager` | execution mode | -|--- |--- |--- | -| 0 | 0 | torch.compile | -| 0 | 1 | PyTorch eager mode | -| 1 | 0 | HPU Graphs | -| 1 | 1 | PyTorch lazy mode | - - -> [!WARNING] -> In 1.17.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly -> experimental and should be only used for validating functional -> correctness. Their performance will be improved in the next releases. -> For obtaining the best performance in 1.17.0, please use HPU Graphs, or -> PyTorch lazy mode. 
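A minimal sketch of selecting one of the modes above from Python, assuming the standard `LLM` constructor; `PT_HPU_LAZY_MODE` is read when the HPU PyTorch bridge initializes, so in practice it is safest to export it in the shell before launching:

``` {.python}
import os

# HPU Graphs row of the table: lazy bridge (PT_HPU_LAZY_MODE=1, the default)
# with graphs enabled (enforce_eager=False). Set the variable before the HPU
# bridge is imported.
os.environ["PT_HPU_LAZY_MODE"] = "1"

from vllm import LLM

llm = LLM(model="meta-llama/Llama-2-7b-hf",  # example model
          dtype="bfloat16",
          enforce_eager=False)  # True would select PyTorch lazy mode instead
```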
- -Bucketing mechanism ------------------------------ - -Intel Gaudi accelerators work best when operating on models with fixed -tensor shapes. [Intel Gaudi Graph -Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) -is responsible for generating optimized binary code that implements the -given model topology on Gaudi. In its default configuration, the -produced binary code may be heavily dependent on input and output tensor -shapes, and can require graph recompilation when encountering -differently shaped tensors within the same topology. While the resulting -binaries utilize Gaudi efficiently, the compilation itself may introduce -a noticeable overhead in end-to-end execution. In a dynamic inference -serving scenario, there is a need to minimize the number of graph -compilations and reduce the risk of graph compilation occurring during -server runtime. Currently it is achieved by \"bucketing\" model\'s -forward pass across two dimensions - `batch_size` and `sequence_length`. - -> [!NOTE] -> Bucketing allows us to reduce the number of required graphs -> significantly, but it does not handle any graph compilation and device -> code generation - this is done in warmup and HPUGraph capture phase. - -Bucketing ranges are determined with 3 parameters - `min`, `step` and -`max`. They can be set separately for prompt and decode phase, and for -batch size and sequence length dimension. These parameters can be -observed in logs during vLLM startup: - -``` {.} -INFO 08-01 21:37:59 habana_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] -INFO 08-01 21:37:59 habana_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] -INFO 08-01 21:37:59 habana_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] -INFO 08-01 21:37:59 habana_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] -``` - -`min` determines the lowest value of the bucket. `step` determines the -interval between buckets, and `max` determines the upper bound of the -bucket. Furthermore, interval between `min` and `step` has special -handling - `min` gets multiplied by consecutive powers of two, until -`step` gets reached. We call this the ramp-up phase and it is used for -handling lower batch sizes with minimum wastage, while allowing larger -padding on larger batch sizes. 
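A simplified sketch of that rule (the actual helper in habana_model_runner.py may differ in name and edge-case handling); the worked examples below cover the same two configurations:

``` {.python}
def bucket_range(min_value: int, step: int, max_value: int) -> list:
    """Expand a (min, step, max) bucket config into concrete bucket sizes."""
    # Ramp-up phase: min is doubled until it reaches step.
    ramp_up = []
    value = min_value
    while value < step:
        ramp_up.append(value)
        value *= 2
    # Stable phase: multiples of step, up to and including max.
    stable = list(range(step, max_value + 1, step))
    return ramp_up + stable

assert bucket_range(2, 32, 64) == [2, 4, 8, 16, 32, 64]
assert bucket_range(128, 128, 512) == [128, 256, 384, 512]
```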
- -Example (with ramp-up) - -``` {.} -min = 2, step = 32, max = 64 -=> ramp_up = (2, 4, 8, 16) -=> stable = (32, 64) -=> buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64) -``` - -Example (without ramp-up) - -``` {.} -min = 128, step = 128, max = 512 -=> ramp_up = () -=> stable = (128, 256, 384, 512) -=> buckets = ramp_up + stable => (128, 256, 384, 512) -``` - -In the logged scenario, 24 buckets were generated for prompt (prefill) -runs, and 48 buckets for decode runs. Each bucket corresponds to a -separate optimized device binary for a given model with specified tensor -shapes. Whenever a batch of requests is processed, it is padded across -batch and sequence length dimension to the smallest possible bucket. - -> [!WARNING] -> If a request exceeds maximum bucket size in any dimension, it will be -> processed without padding, and its processing may require a graph -> compilation, potentially significantly increasing end-to-end latency. -> The boundaries of the buckets are user-configurable via environment -> variables, and upper bucket boundaries can be increased to avoid such -> scenario. - -As an example, if a request of 3 sequences, with max sequence length of -412 comes in to an idle vLLM server, it will be padded executed as -`(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be -padded to 4 (closest batch\_size dimension higher than 3), and max -sequence length will be padded to 512 (closest sequence length dimension -higher than 412). After prefill stage, it will be executed as `(4, 512)` -decode bucket and will continue as that bucket until either batch -dimension changes (due to request being finished) - in which case it -will become a `(2, 512)` bucket, or context length increases above 512 -tokens, in which case it will become `(4, 640)` bucket. - -> [!NOTE] -> Bucketing is transparent to a client - padding in sequence length -> dimension is never returned to the client, and padding in batch -> dimension does not create new requests. - -Warmup ------- - -Warmup is an optional, but highly recommended step occurring before vLLM -server starts listening. It executes a forward pass for each bucket with -dummy data. The goal is to pre-compile all graphs and not incur any -graph compilation overheads within bucket boundaries during server -runtime. Each warmup step is logged during vLLM startup: - -``` {.} -INFO 08-01 22:26:47 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB -INFO 08-01 22:26:47 habana_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB -INFO 08-01 22:26:48 habana_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB -... -INFO 08-01 22:26:59 habana_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB -INFO 08-01 22:27:00 habana_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB -INFO 08-01 22:27:00 habana_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB -INFO 08-01 22:27:01 habana_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB -... -INFO 08-01 22:27:16 habana_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB -INFO 08-01 22:27:16 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB -``` - -This example uses the same buckets as in *Bucketing mechanism* section. 
-Each output line corresponds to execution of a single bucket. When -bucket is executed for the first time, its graph is compiled and can be -reused later on, skipping further graph compilations. - -> [!TIP] -> Compiling all the buckets might take some time and can be turned off -> with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if -> you do that, you may face graph compilations once executing a given -> bucket for the first time. It is fine to disable warmup for development, -> but it\'s highly recommended to enable it in deployment. - -HPU Graph capture ------------------------------ - -[HPU -Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) -are currently the most performant execution method of vLLM on Intel -Gaudi. When HPU Graphs are enabled, execution graphs will be traced -(recorded) ahead of time (after performing warmup), to be later replayed -during inference, significantly reducing host overheads. Recording can -take large amounts of memory, which needs to be taken into account when -allocating KV cache. Enabling HPU Graphs will impact the number of -available KV cache blocks, but vLLM provides user-configurable variables -to control memory management. - -When HPU Graphs are being used, they share the common memory pool -(\"usable memory\") as KV cache, determined by `gpu_memory_utilization` -flag (`0.9` by default). Before KV cache gets allocated, model weights -are loaded onto the device, and a forward pass of the model is executed -on dummy data, to estimate memory usage. Only after that, -`gpu_memory_utilization` flag is utilized - at its default value, will -mark 90% of free device memory at that point as usable. Next, KV cache -gets allocated, model is warmed up, and HPU Graphs are captured. -Environment variable `VLLM_GRAPH_RESERVED_MEM` defines the ratio of -memory reserved for HPU Graphs capture. With its default value -(`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory will be reserved -for graph capture (later referred to as \"usable graph memory\"), and -the remaining 90% will be utilized for KV cache. Environment variable -`VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory -reserved for prefill and decode graphs. By default -(`VLLM_GRAPH_PROMPT_RATIO=0.5`), both stages have equal memory -constraints. Lower value corresponds to less usable graph memory -reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will -reserve 20% of usable graph memory for prefill graphs, and 80% of usable -graph memory for decode graphs. - -> [!NOTE] -> `gpu_memory_utilization` does not correspond to the absolute memory -> usage across HPU. It specifies the memory margin after loading the model -> and performing a profile run. If device has 100 GiB of total memory, and -> 50 GiB of free memory after loading model weights and executing -> profiling run, `gpu_memory_utilization` at its default value will mark -> 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total -> device memory. - -User can also configure the strategy for capturing HPU Graphs for prompt -and decode stages separately. Strategy affects the order of capturing -graphs. There are two strategies implemented: - `max_bs` - graph capture -queue will sorted in descending order by their batch sizes. Buckets with -equal batch sizes are sorted by sequence length in ascending order (e.g. 
-`(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, -`(1,256)`), default strategy for decode - `min_tokens` - graph capture -queue will be sorted in ascending order by the number of tokens each -graph processes (`batch_size*sequence_length`), default strategy for -prompt - -When there\'s large amount of requests pending, vLLM scheduler will -attempt to fill the maximum batch size for decode as soon as possible. -When a request is finished, decode batch size decreases. When that -happens, vLLM will attempt to schedule a prefill iteration for requests -in the waiting queue, to fill the decode batch size to its previous -state. This means that in a full load scenario, decode batch size is -often at its maximum, which makes large batch size HPU Graphs crucial to -capture, as reflected by `max_bs` strategy. On the other hand, prefills -will be executed most frequently with very low batch sizes (1-4), which -is reflected in `min_tokens` strategy. - -> [!NOTE] -> `VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by -> graphs for each stage (prefill and decode). vLLM will first attempt to -> use up entirety of usable prefill graph memory (usable graph memory \* -> `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it -> will attempt do the same for decode graphs and usable decode graph -> memory pool. If one stage is fully captured, and there is unused memory -> left within usable graph memory pool, vLLM will attempt further graph -> capture for the other stage, until no more HPU Graphs can be captured -> without exceeding reserved memory pool. The behavior on that mechanism -> can be observed in the example below. - -Each described step is logged by vLLM server, as follows (negative -values correspond to memory being released): - -``` {.} -INFO 08-02 17:37:44 habana_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] -INFO 08-02 17:37:44 habana_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] -INFO 08-02 17:37:44 habana_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] -INFO 08-02 17:37:44 habana_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] -INFO 08-02 17:37:52 habana_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) -INFO 08-02 17:37:52 habana_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) -INFO 08-02 17:37:52 habana_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) -INFO 
08-02 17:37:54 habana_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) -INFO 08-02 17:37:54 habana_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache -INFO 08-02 17:37:54 habana_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 -INFO 08-02 17:37:54 habana_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) -INFO 08-02 17:37:54 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB -... -INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB -INFO 08-02 17:38:22 habana_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.5) -INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB -... -INFO 08-02 17:38:26 habana_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB -INFO 08-02 17:38:27 habana_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB -... -INFO 08-02 17:38:41 habana_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB -INFO 08-02 17:38:41 habana_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB -INFO 08-02 17:38:42 habana_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB -INFO 08-02 17:38:42 habana_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB -INFO 08-02 17:38:43 habana_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB -INFO 08-02 17:38:43 habana_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] -INFO 08-02 17:38:43 habana_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] -INFO 08-02 17:38:43 habana_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory -INFO 08-02 17:38:43 habana_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) -``` - -Recommended vLLM Parameters ------------------------------ - -- We recommend running inference on Gaudi 2 with `block_size` of 128 - for BF16 data type. 
Using default values (16, 32) might lead to - sub-optimal performance due to Matrix Multiplication Engine - under-utilization (see [Gaudi - Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). -- For max throughput on Llama 7B, we recommend running with batch size - of 128 or 256 and max context length of 2048 with HPU Graphs - enabled. If you encounter out-of-memory issues, see troubleshooting - section. - -Environment variables ------------------------------ - -**Diagnostic and profiling knobs:** - -- `VLLM_PROFILER_ENABLED`: if `true`, high level profiler will be - enabled. Resulting JSON traces can be viewed in - [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). Disabled - by default. -- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: if `true`, will log graph - compilations per each vLLM engine step, only when there was any - - highly recommended to use alongside `PT_HPU_METRICS_GC_DETAILS=1`. - Disabled by default. -- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true`, will log graph - compilations per each vLLM engine step, always, even if there were - none. Disabled by default. -- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true`, will log cpu fallbacks - per each vLLM engine step, only when there was any. Disabled by - default. -- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, will log cpu - fallbacks per each vLLM engine step, always, even if there were - none. Disabled by default. - -**Performance tuning knobs:** - -- `VLLM_SKIP_WARMUP`: if `true`, warmup will be skipped, `false` by - default -- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for - HPUGraph capture, `0.1` by default -- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory - dedicated for prompt graphs, `0.5` by default -- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt - graph capture, `min_tokens` or `max_bs`, `min_tokens` by default -- `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode - graph capture, `min_tokens` or `max_bs`, `max_bs` by default -- `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment - variables configuring ranges of bucketing mechanism - - `{phase}` is either `PROMPT` or `DECODE` - - `{dim}` is either `BS`, `SEQ` or `BLOCK` - - `{param}` is either `MIN`, `STEP` or `MAX` - - Default values: - - Prompt: - - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` - - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` - - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): - `min(max_num_seqs, 64)` - - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): - `block_size` - - sequence length step - (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` - - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): - `max_model_len` - - - Decode: - - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `min(max_num_seqs, 32)` - - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): - `min(max_num_seqs, 32)` - - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): - `max_num_seqs` - - block size min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): - `128` - - block size step - (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `128` - - block size max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): - `max(128, (max_num_seqs*max_model_len)/block_size)` - -Additionally, there are HPU PyTorch Bridge environment variables -impacting vLLM execution: - -- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be - used, if `1` PyTorch Lazy backend for Gaudi will be used, `1` is - default -- `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor - 
parallel inference with HPU Graphs - -Troubleshooting: Tweaking HPU Graphs -==================================== - -If you experience device out-of-memory issues or want to attempt -inference at higher batch sizes, try tweaking HPU Graphs by following -the below: - -- Tweak `gpu_memory_utilization` knob. It will decrease the allocation - of KV cache, leaving some headroom for capturing graphs with larger - batch size. By default `gpu_memory_utilization` is set to 0.9. It - attempts to allocate \~90% of HBM left for KV cache after short - profiling run. Note that decreasing reduces the number of KV cache - blocks you have available, and therefore reduces the effective - maximum number of tokens you can handle at a given time. -- If this method is not efficient, you can disable `HPUGraph` - completely. With HPU Graphs disabled, you are trading latency and - throughput at lower batches for potentially higher throughput on - higher batches. You can do that by adding `--enforce-eager` flag to - server (for online inference), or by passing `enforce_eager=True` - argument to LLM constructor (for offline inference). diff --git a/tests/conftest.py b/tests/conftest.py index f799d1bc11afc..c2616bcf7091c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -849,19 +849,11 @@ def caplog_vllm(temporary_enable_log_propagate, caplog): yield caplog -def is_hpu(): - from importlib import util - return util.find_spec('habana_frameworks') is not None - - @pytest.fixture(scope="session") def num_gpus_available(): """Get number of GPUs without initializing the CUDA context in current process.""" - if is_hpu(): - return torch.hpu.device_count() - return cuda_device_count_stateless() diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index d3ebd15510284..4834a9d35a3ee 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -48,19 +48,13 @@ class ContextInfo(TypedDict): }] -def is_hpu(): - from importlib import util - return util.find_spec('habana_frameworks') is not None - - def cleanup(): destroy_model_parallel() destroy_distributed_environment() with contextlib.suppress(AssertionError): torch.distributed.destroy_process_group() gc.collect() - if not is_hpu(): - torch.cuda.empty_cache() + torch.cuda.empty_cache() ray.shutdown() diff --git a/tests/lora/utils.py b/tests/lora/utils.py index d544d50ccb469..00f8e26d1041f 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -3,7 +3,6 @@ import torch from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights -from vllm.utils import get_device class DummyLoRAManager: @@ -29,16 +28,16 @@ def init_random_lora(self, lora_alpha=1, lora_a=torch.rand([weight.shape[1], rank], dtype=weight.dtype, - device=get_device()), + device="cuda"), lora_b=torch.rand([rank, weight.shape[0]], dtype=weight.dtype, - device=get_device()), + device="cuda"), ) if generate_embeddings_tensor: lora.embeddings_tensor = torch.rand(5, generate_embeddings_tensor, dtype=weight.dtype, - device=get_device()) + device="cuda") self.set_module_lora(module_name, lora) return lora @@ -54,8 +53,8 @@ def init_lora(self, module_name, rank=rank, lora_alpha=1, - lora_a=torch.rand([input_dim, rank], device=get_device()), - lora_b=torch.rand([rank, output_dim], device=get_device()), + lora_a=torch.rand([input_dim, rank], device="cuda"), + lora_b=torch.rand([rank, output_dim], device="cuda"), embeddings_tensor=embeddings_tensor, ) self.set_module_lora(module_name, lora) diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 
3607f2f943738..308b708feab71 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -8,7 +8,7 @@ from transformers import GenerationConfig, GenerationMixin import vllm.envs as envs -from vllm.model_executor.layers.sampler import ApplyToppTopkScalar, Sampler +from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_random_seed from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata @@ -768,63 +768,3 @@ def test_sampler_include_gpu_probs_tensor(device: str): assert sampler_output.sampled_token_probs is not None assert sampler_output.logprobs is not None assert sampler_output.sampled_token_ids is not None - - -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_topk_topk_scalar(): - obj1 = ApplyToppTopkScalar(2) - assert ApplyToppTopkScalar._padded_k == 0 - x = torch.tensor([[9, 9, 8, 8, 8, 8, 7, 7, 7.0], - [10, 10, 9, 9, 9, 8, 5, 5, 5]]) - - retval1 = obj1(x, p=0.9, k=5) - ninf = -float("inf") - expected1 = torch.tensor([[9., 9., 8., 8., 8., 8., ninf, ninf, ninf], - [10., 10., 9., 9., 9., ninf, ninf, ninf, ninf]]) - assert torch.all(retval1 == expected1).item() - assert ApplyToppTopkScalar._padded_k == 9 - - obj2 = ApplyToppTopkScalar(2) - assert obj2._padded_k == 9 - - x = torch.tensor([[2, 2, 9, 9, 2, 2, 1, 1, 1.0], - [10, 9, 9, 5, 9, 9, 5, 9, 10]]) - retval2 = obj2(x, p=0.9, k=5) - expected2 = torch.tensor( - [[ninf, ninf, 9., 9., ninf, ninf, ninf, ninf, ninf], - [10., ninf, 9., ninf, 9., 9., ninf, 9., 10.]]) - assert torch.all(retval2 == expected2).item() - assert obj2._padded_k == 9 - - retval3 = obj2(x, p=1.0, k=5) - expected3 = torch.tensor([[2., 2., 9., 9., 2., 2., ninf, ninf, ninf], - [10., 9., 9., ninf, 9., 9., ninf, 9., 10.]]) - - assert torch.all(retval3 == expected3).item() - - # this should not be done in general, doing it here for testing purposes - ApplyToppTopkScalar._padded_k = 0 - x = torch.tensor([[1, 1, 1, 9, 8, 1, 1, 1, 1.0], - [2, 1, 2, 2, 1, 1, 1, 1, 1]]) - obj3 = ApplyToppTopkScalar(2) - retval4 = obj3(x, p=0.9, k=2) - expected4 = torch.tensor( - [[ninf, ninf, ninf, 9., 8., ninf, ninf, ninf, ninf], - [2., ninf, 2., 2., ninf, ninf, ninf, ninf, ninf]]) - assert torch.all(retval4 == expected4).item() - assert obj3._padded_k == 4 - y = torch.tensor([[8, 8, 8, 9, 8, 1, 1, 1, 1.0], - [2, 1, 2, 2, 1, 1, 1, 1, 1]]) - retval5 = obj3(y, p=0.9, k=2) - assert obj3._padded_k == 8 - expected5 = torch.tensor([[8., 8., 8., 9., 8., ninf, ninf, ninf, ninf], - [2., ninf, 2., 2., ninf, ninf, ninf, ninf, - ninf]]) - assert torch.all(retval5 == expected5).item() - y = torch.tensor([[8, 8, 8, 9, 8, 8, 1, 1, 1.0], - [2, 1, 2, 2, 3, 1, 1, 1, 1]]) - retval6 = obj3(y, p=0.9, k=2) - expected6 = torch.tensor([[8., 8., 8., 9., 8., 8., ninf, ninf, ninf], - [2., ninf, 2., 2., 3., ninf, ninf, ninf, ninf]]) - assert torch.all(retval6 == expected6).item() - assert obj3._padded_k == 8 From 28df6fd061a47b988dd1acd83f6d0e3cfeb521f8 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 23 Sep 2024 17:41:45 +0300 Subject: [PATCH 229/341] prune more easy parts --- .github/workflows/cpu-test.yml | 34 ---- examples/lora_inference_hpu.py | 47 ----- examples/offline_inference_fakehpu.py | 38 ---- tests/lora/test_llama_hpu.py | 100 ---------- tests/lora/test_lora_hpu.py | 260 -------------------------- tests/lora/test_multilora_hpu.py | 130 ------------- 6 files changed, 609 deletions(-) delete mode 100644 .github/workflows/cpu-test.yml delete mode 
100644 examples/lora_inference_hpu.py delete mode 100644 examples/offline_inference_fakehpu.py delete mode 100644 tests/lora/test_llama_hpu.py delete mode 100644 tests/lora/test_lora_hpu.py delete mode 100644 tests/lora/test_multilora_hpu.py diff --git a/.github/workflows/cpu-test.yml b/.github/workflows/cpu-test.yml deleted file mode 100644 index 89a702f9751d9..0000000000000 --- a/.github/workflows/cpu-test.yml +++ /dev/null @@ -1,34 +0,0 @@ -name: cpu-test - -on: - # Trigger the workflow on push or pull request, - # but only for the habana_main branch - push: - branches: - - habana_main - pull_request: - branches: - - habana_main - - -jobs: - cputest: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.10"] - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install torch --extra-index-url https://download.pytorch.org/whl/cpu - pip install -r requirements-hpu.txt - VLLM_TARGET_DEVICE=hpu python setup.py develop - - name: cpu-test - run: | - VLLM_SKIP_WARMUP=true VLLM_PROMPT_SEQ_BUCKET_MAX=128 VLLM_USE_FAKE_HPU=1 python examples/offline_inference_fakehpu.py diff --git a/examples/lora_inference_hpu.py b/examples/lora_inference_hpu.py deleted file mode 100644 index b8154a29a82bb..0000000000000 --- a/examples/lora_inference_hpu.py +++ /dev/null @@ -1,47 +0,0 @@ -from huggingface_hub import snapshot_download - -from vllm import LLM, SamplingParams -from vllm.lora.request import LoRARequest - -sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") - -llm = LLM(model="meta-llama/Llama-2-7b-hf", - enable_lora=True, - max_num_seqs=2, - dtype='bfloat16') - -sampling_params = SamplingParams(temperature=0, - max_tokens=1024, - stop=["[/assistant]"]) - -prompts = [ - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
[/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 -] - -expected_output = [ - " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 - " SELECT nationality FROM table_name_11 WHERE elector = 'Anchero Pantaleone' ", # noqa: E501 - " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 - " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 - " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 - " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 -] - -outputs = llm.generate(prompts, - sampling_params, - lora_request=LoRARequest("sql_adapter", 1, - sql_lora_path)) - -for i, output in enumerate(outputs): - prompt = output.prompt - generated_text = output.outputs[0].text - match = expected_output[i] == generated_text - if not match: - print( - f"Comparison failed for request_id::{i}\n\t[PROMPT]{prompt!r}\n\t[GENERATED]{generated_text!r}\n\t[EXPECTED]{expected_output[i]!r}" # noqa: E501 - ) diff --git a/examples/offline_inference_fakehpu.py b/examples/offline_inference_fakehpu.py deleted file mode 100644 index 972d84b60b318..0000000000000 --- a/examples/offline_inference_fakehpu.py +++ /dev/null @@ -1,38 +0,0 @@ -import os - -from vllm import LLM, SamplingParams - -if os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0': - from vllm.utils import migrate_to_cpu - migrate_to_cpu() - -# Sample prompts. -prompts = [ - "Berlin is the capital city of ", - "Louvre is located in the city of ", - "Barack Obama was the 44th president of ", - "Warsaw is the capital city of ", - "Gniezno is a city in ", - "San Francisco is located in the state of ", - "Llanfairpwllgwyngyll is located in country of ", -] -ref_answers = [ - "Germany", "Paris", "United States", "Poland", "Poland", "California", - "Wales" -] -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0, n=1, use_beam_search=False) - -# Create an LLM. -llm = LLM(model="facebook/opt-125m", max_model_len=32, max_num_seqs=4) -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts, sampling_params) -# Print the outputs. 
-for output, answer in zip(outputs, ref_answers): - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - assert answer in generated_text, ( - f"The generated text does not contain the correct answer: {answer}") -print('PASSED') diff --git a/tests/lora/test_llama_hpu.py b/tests/lora/test_llama_hpu.py deleted file mode 100644 index dfd551f2ca043..0000000000000 --- a/tests/lora/test_llama_hpu.py +++ /dev/null @@ -1,100 +0,0 @@ -from multiprocessing import Process -from typing import List - -from conftest import cleanup - -import vllm -from vllm.lora.request import LoRARequest - -MODEL_PATH = "meta-llama/Llama-2-7b-hf" - - -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: - prompts = [ - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 - ] - sampling_params = vllm.SamplingParams(temperature=0, - max_tokens=256, - stop=["[/assistant]"]) - outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None) - # Print the outputs. 
- generated_texts: List[str] = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - return generated_texts - - -def _test_llama_lora(sql_lora_files, tp_size): - llm = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - dtype='float32', - tensor_parallel_size=tp_size) - - expected_no_lora_output = [ - "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", # noqa: E501 - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", # noqa: E501 - "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", # noqa: E501 - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", # noqa: E501 - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
", # noqa: E501 - "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", # noqa: E501 - ] - expected_lora_output = [ - " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 - " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 - " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 - " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 - " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 - " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 - ] - - print("lora adapter created") - assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output - - print("lora 1") - assert do_sample(llm, sql_lora_files, lora_id=1) == expected_lora_output - - print("no lora") - assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output - - print("lora 2") - assert do_sample(llm, sql_lora_files, lora_id=2) == expected_lora_output - - print("removing lora") - cleanup() - - -def test_llama_lora_1x(sql_lora_files): - p = Process(target=_test_llama_lora, args=(sql_lora_files, 1)) - p.start() - p.join() - assert p.exitcode == 0 - - -def test_llama_lora_2x(sql_lora_files): - # Work-around to resolve stalling issue in multi-card scenario - p = Process(target=_test_llama_lora, args=(sql_lora_files, 2)) - p.start() - p.join() - assert p.exitcode == 0 - - -def test_llama_lora_4x(sql_lora_files): - # Work-around to resolve stalling issue in multi-card scenario - p = Process(target=_test_llama_lora, args=(sql_lora_files, 4)) - p.start() - p.join() - assert p.exitcode == 0 diff --git a/tests/lora/test_lora_hpu.py b/tests/lora/test_lora_hpu.py deleted file mode 100644 index a59cfe875ef9c..0000000000000 --- a/tests/lora/test_lora_hpu.py +++ /dev/null @@ -1,260 +0,0 @@ -import pytest -import torch -from vllm_hpu_extension.ops import LoraMask - -from vllm.hpu.punica_hpu import GaudiPunicaWrapper - -from .utils import DummyLoRAManager - -TENSOR_SIZES = [128, 1024, 2048, 4096, 8192, 11008, 11008 // 2, 11008 // 4] -QKV_TENSOR_SIZES = [ - (8192, 1024, 1024), - (8192 // 8, 1024 // 8, 1024 // 8), - (4096, 4096, 4096), - (4096 // 2, 4096 // 2, 4096 // 2), -] -BATCH_SIZES = [8, 32, 256] -RANKS = [8] -DTYPES = [torch.bfloat16] -TOLERANCES = { - torch.float16: (5e-3, 5e-3), - torch.bfloat16: (3e-2, 2e-2), -} - - -def createLoraMask(indices, batch_size, seq_len, max_loras, max_lora_rank, - lora_dtype): - indices = indices.view(-1, 1) - 
mask = torch.arange(max_loras * max_lora_rank, device=indices.device) - mask = mask.view(1, -1) - mask = ((mask >= ((indices) * max_lora_rank)) * - (mask < ((indices + 1) * max_lora_rank))).to(dtype=lora_dtype) - mask = mask.view(batch_size, 1, - -1).expand(batch_size, seq_len, - -1).reshape(batch_size * seq_len, -1) - return mask - - -@pytest.mark.parametrize("m", TENSOR_SIZES) -@pytest.mark.parametrize("n", TENSOR_SIZES) -@pytest.mark.parametrize("k", BATCH_SIZES) -@pytest.mark.parametrize("rank", RANKS) -@pytest.mark.parametrize("dtype", DTYPES) -def test_apply_lora(m, n, k, rank, dtype) -> None: - manager = DummyLoRAManager() - - module_name = "module" - weight = torch.rand([m, n], device="hpu", dtype=dtype) - - manager.init_random_lora(module_name, weight, rank=rank) - lora = manager.get_module_lora(module_name) - - input = torch.rand(k, n, device="hpu", dtype=dtype) - expected = input @ lora.lora_a @ lora.lora_b * lora.scaling - - lora_a_stack = torch.zeros(8, - 1, - lora.lora_a.shape[1], - lora.lora_a.shape[0], - device="hpu", - dtype=dtype) - lora_b_stack = torch.zeros(8, - 1, - lora.lora_b.shape[1], - lora.lora_b.shape[0], - device="hpu", - dtype=dtype) - for i in range(lora_a_stack.shape[0]): - lora_a_stack[i][0] = lora.lora_a.T - lora_b_stack[i][0] = (lora.lora_b * lora.scaling).T - - output = torch.zeros(k, m, device="hpu", dtype=dtype) - indices = torch.randint(0, - lora_a_stack.shape[0], (len(input), ), - device="hpu") - mask = createLoraMask(indices, k, 1, 8, rank, dtype) - LoraMask.setLoraMask(mask) - punica_wrapper = GaudiPunicaWrapper(4096, max_batches=256, device="hpu") - - punica_wrapper.add_lora(output, input, lora_a_stack, lora_b_stack, 1.0) - - rtol, atol = TOLERANCES[dtype] - assert torch.allclose(expected, output, rtol=rtol, atol=atol) - - output[:] = 0 - indices = torch.full((len(input), ), -1, device="hpu") - mask = createLoraMask(indices, k, 1, 8, rank, dtype) - LoraMask.setLoraMask(mask) - - punica_wrapper.add_lora(output, input, lora_a_stack, lora_b_stack, 1.0) - assert torch.allclose(torch.zeros_like(output), output) - - manager.reset_lora() - - -@pytest.mark.parametrize("m", TENSOR_SIZES) -@pytest.mark.parametrize("n", TENSOR_SIZES) -@pytest.mark.parametrize("k", BATCH_SIZES) -@pytest.mark.parametrize("rank", RANKS) -@pytest.mark.parametrize("dtype", DTYPES) -def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: - if m % 2 != 0: - pytest.skip("m must be divisible by 2") - if m // 2 not in TENSOR_SIZES: - pytest.skip("m//2 must be in TENSOR_SIZES") - - manager = DummyLoRAManager() - - module_name = "module" - weight = torch.rand([m // 2, n], device="hpu", dtype=dtype) - - manager.init_random_lora(module_name + "1", weight, rank=rank) - lora_1 = manager.get_module_lora(module_name + "1") - manager.init_random_lora(module_name + "2", weight, rank=rank) - lora_2 = manager.get_module_lora(module_name + "2") - - input = torch.rand(k, n, device="hpu", dtype=dtype) - expected = torch.cat([ - input @ lora_1.lora_a @ lora_1.lora_b * lora_1.scaling, - input @ lora_2.lora_a @ lora_2.lora_b * lora_2.scaling - ], - dim=1) - - lora_a_stacks = [ - torch.zeros(8, - 1, - lora_1.lora_a.shape[1], - lora_1.lora_a.shape[0], - device="hpu", - dtype=dtype) for i in range(2) - ] - lora_b_stacks = [ - torch.zeros(8, - 1, - lora_1.lora_b.shape[1], - lora_1.lora_b.shape[0], - device="hpu", - dtype=dtype) for i in range(2) - ] - for i in range(lora_a_stacks[0].shape[0]): - lora_a_stacks[0][i][0] = lora_1.lora_a.T - lora_b_stacks[0][i][0] = (lora_1.lora_b * lora_1.scaling).T - 
lora_a_stacks[1][i][0] = lora_2.lora_a.T - lora_b_stacks[1][i][0] = (lora_2.lora_b * lora_2.scaling).T - - output = torch.zeros(k, m, device="hpu", dtype=dtype) - indices = torch.randint(0, - lora_a_stacks[0].shape[0], (len(input), ), - device="hpu") - mask = createLoraMask(indices, k, 1, 8, rank, dtype) - LoraMask.setLoraMask(mask) - - punica_wrapper = GaudiPunicaWrapper(4096, max_batches=256, device="hpu") - punica_wrapper.add_lora_packed_nslice(output, input, lora_a_stacks, - lora_b_stacks, 1.0, (m // 2, m // 2)) - - rtol, atol = TOLERANCES[dtype] - assert torch.allclose(expected, output, rtol=rtol, atol=atol) - - output[:] = 0 - indices = torch.full((len(input), ), -1, device="hpu") - mask = createLoraMask(indices, k, 1, 8, rank, dtype) - LoraMask.setLoraMask(mask) - - punica_wrapper.add_lora_packed_nslice(output, input, lora_a_stacks, - lora_b_stacks, 1.0, (m // 2, m // 2)) - assert torch.allclose(torch.zeros_like(output), output) - - manager.reset_lora() - - -@pytest.mark.parametrize("qkv", QKV_TENSOR_SIZES) -@pytest.mark.parametrize("n", TENSOR_SIZES) -@pytest.mark.parametrize("k", BATCH_SIZES) -@pytest.mark.parametrize("rank", RANKS) -@pytest.mark.parametrize("dtype", DTYPES) -def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: - manager = DummyLoRAManager() - - module_name = "module" - weight_q = torch.empty(qkv[0], n, device="hpu", dtype=dtype) - weight_kv = torch.empty(qkv[1], n, device="hpu", dtype=dtype) - - manager.init_random_lora(module_name + "q", weight_q, rank=rank) - lora_q = manager.get_module_lora(module_name + "q") - manager.init_random_lora(module_name + "k", weight_kv, rank=rank) - lora_k = manager.get_module_lora(module_name + "k") - manager.init_random_lora(module_name + "v", weight_kv, rank=rank) - lora_v = manager.get_module_lora(module_name + "v") - - input = torch.rand(k, n, device="hpu", dtype=dtype) - expected = torch.cat([ - input @ lora_q.lora_a @ lora_q.lora_b * lora_q.scaling, - input @ lora_k.lora_a @ lora_k.lora_b * lora_k.scaling, - input @ lora_v.lora_a @ lora_v.lora_b * lora_v.scaling - ], - dim=1) - - lora_a_stacks = [ - torch.zeros(8, - 1, - lora_q.lora_a.shape[1], - lora_q.lora_a.shape[0], - device="hpu", - dtype=dtype) - ] + [ - torch.zeros(8, - 1, - lora_k.lora_a.shape[1], - lora_k.lora_a.shape[0], - device="hpu", - dtype=dtype) for i in range(2) - ] - lora_b_stacks = [ - torch.zeros(8, - 1, - lora_q.lora_b.shape[1], - lora_q.lora_b.shape[0], - device="hpu", - dtype=dtype) - ] + [ - torch.zeros(8, - 1, - lora_k.lora_b.shape[1], - lora_k.lora_b.shape[0], - device="hpu", - dtype=dtype) for i in range(2) - ] - for i in range(lora_a_stacks[0].shape[0]): - lora_a_stacks[0][i][0] = lora_q.lora_a.T - lora_b_stacks[0][i][0] = (lora_q.lora_b * lora_q.scaling).T - lora_a_stacks[1][i][0] = lora_k.lora_a.T - lora_b_stacks[1][i][0] = (lora_k.lora_b * lora_k.scaling).T - lora_a_stacks[2][i][0] = lora_v.lora_a.T - lora_b_stacks[2][i][0] = (lora_v.lora_b * lora_v.scaling).T - - output = torch.zeros(k, sum(qkv), device="hpu", dtype=dtype) - indices = torch.randint(0, - lora_a_stacks[0].shape[0], (len(input), ), - device="hpu") - mask = createLoraMask(indices, k, 1, 8, rank, dtype) - LoraMask.setLoraMask(mask) - - punica_wrapper = GaudiPunicaWrapper(4096, max_batches=256, device="hpu") - qkvs = (qkv[0], qkv[1], qkv[2]) - punica_wrapper.add_lora_packed_nslice(output, input, lora_a_stacks, - lora_b_stacks, 1.0, qkvs) - - rtol, atol = TOLERANCES[dtype] - assert torch.allclose(expected, output, rtol=rtol, atol=atol) - - output[:] = 0 - indices = 
torch.full((len(input), ), -1, device="hpu") - mask = createLoraMask(indices, k, 1, 8, rank, dtype) - LoraMask.setLoraMask(mask) - qkvs = (qkv[0], qkv[1], qkv[2]) - punica_wrapper.add_lora_packed_nslice(output, input, lora_a_stacks, - lora_b_stacks, 1.0, qkvs) - assert torch.allclose(torch.zeros_like(output), output) - - manager.reset_lora() diff --git a/tests/lora/test_multilora_hpu.py b/tests/lora/test_multilora_hpu.py deleted file mode 100644 index 64eda037ff059..0000000000000 --- a/tests/lora/test_multilora_hpu.py +++ /dev/null @@ -1,130 +0,0 @@ -from multiprocessing import Process -from typing import List, Optional, Tuple - -from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams -from vllm.lora.request import LoRARequest - - -def create_test_prompts( - lora_path: str -) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]: - """Create a list of test prompts with their sampling parameters. - - 2 requests for base model, 4 requests for the LoRA. We define 2 - different LoRA adapters (using the same model for demo purposes). - """ - return [ - ("A robot may not injure a human being", - SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, - max_tokens=128), None), - ("To be or not to be,", - SamplingParams(temperature=0.8, - top_k=5, - presence_penalty=0.2, - max_tokens=128), None), - ( - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 - SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, - max_tokens=128, - stop_token_ids=[32003]), - LoRARequest("sql-lora", 1, lora_path)), - ( - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 - SamplingParams(temperature=0, - max_tokens=128, - stop_token_ids=[32003]), - LoRARequest("sql-lora", 1, lora_path)), - ( - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 - SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, - max_tokens=128, - stop_token_ids=[32003]), - LoRARequest("sql-lora2", 2, lora_path)), - ( - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? 
[/user] [assistant]", # noqa: E501 - SamplingParams(temperature=0, - max_tokens=128, - stop_token_ids=[32003]), - LoRARequest("sql-lora", 1, lora_path)), - ] - - -def process_requests(engine: LLMEngine, - test_prompts: List[Tuple[str, SamplingParams, - Optional[LoRARequest]]]): - """Continuously process a list of prompts and handle the outputs.""" - request_id = 0 - result = {} - - while test_prompts or engine.has_unfinished_requests(): - if test_prompts: - prompt, sampling_params, lora_request = test_prompts.pop(0) - engine.add_request(str(request_id), - prompt, - sampling_params, - lora_request=lora_request) - request_id += 1 - - request_outputs: List[RequestOutput] = engine.step() - - for request_output in request_outputs: - if request_output.finished: - result[ - request_output.request_id] = request_output.outputs[0].text - return result - - -expected_output = [ - " or, through inaction, allow a human being to come to harm.\nA robot must obey the orders given it by human beings except where such orders would conflict with the First Law.\nA robot must protect its own existence as long as such protection does not conflict with the First or Second Law.\nThe Three Laws of Robotics were created by Isaac Asimov in 1942. They are the foundation of robotics and artificial intelligence.\nThe Three Laws of Robotics are the foundation of robotics and artificial intelligence. They were created by Isaac Asimov in 194", # noqa: E501 - " that is the question.\nIt is the most famous line in all of Shakespeare's plays and one of the most famous in English literature. The question is not whether or not to be, but rather the question of who to be.\nIn Hamlet's case, the question is whether or not to be a good person. He is torn between the goodness of his father and the evil of his mother.\nThe question is a difficult one, and one that has been asked many times before. 
In Hamlet's case, the question is whether or not to be a good person, and he is torn between the", # noqa: E501 - " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 - " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 - " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 - " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' " # noqa: E501 -] - - -def _test_llama_multilora(sql_lora_files, tp_size): - """Main function that sets up and runs the prompt processing.""" - engine_args = EngineArgs(model="meta-llama/Llama-2-7b-hf", - enable_lora=True, - max_loras=2, - max_lora_rank=8, - max_num_seqs=256, - dtype='float32', - tensor_parallel_size=tp_size) - engine = LLMEngine.from_engine_args(engine_args) - test_prompts = create_test_prompts(sql_lora_files) - results = process_requests(engine, test_prompts) - generated_texts = [results[key] for key in sorted(results)] - assert generated_texts == expected_output - - -def test_llama_multilora_1x(sql_lora_files): - # Work-around to resolve stalling issue in multi-card scenario - p = Process(target=_test_llama_multilora, args=(sql_lora_files, 1)) - p.start() - p.join() - assert p.exitcode == 0 - - -def test_llama_multilora_2x(sql_lora_files): - # Work-around to resolve stalling issue in multi-card scenario - p = Process(target=_test_llama_multilora, args=(sql_lora_files, 2)) - p.start() - p.join() - assert p.exitcode == 0 - - -def test_llama_multilora_4x(sql_lora_files): - # Work-around to resolve stalling issue in multi-card scenario - p = Process(target=_test_llama_multilora, args=(sql_lora_files, 4)) - p.start() - p.join() - assert p.exitcode == 0 From c6d2d5abac6d95e8ff76f9d4f31aba4d4d3fc0ea Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 23 Sep 2024 17:48:21 +0300 Subject: [PATCH 230/341] prune lora files --- vllm/lora/layers.py | 25 +++------ vllm/lora/models.py | 128 ++------------------------------------------ vllm/lora/punica.py | 12 ++--- 3 files changed, 17 insertions(+), 148 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index b3758ad883d56..b9ac498b23a7b 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -28,10 +28,6 @@ LinearScalingRotaryEmbedding, RotaryEmbedding) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) -from vllm.platforms import current_platform - -if current_platform.is_hpu(): - from vllm_hpu_extension.punica_hpu import GaudiPunicaWrapper if TYPE_CHECKING: pass @@ -228,7 +224,6 @@ def set_lora( def forward(self, x: torch.Tensor) -> torch.Tensor: added_tokens_mask = x > self.base_layer.org_vocab_size - 1 - embeddings_indices = None embeddings_indices = self.punica_wrapper.embeddings_indices indices = embeddings_indices[1].view_as(x) full_lora_a_embeddings = F.embedding( @@ -246,19 +241,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: if full_lora_a_embeddings.ndim == 3: full_lora_a_embeddings = full_lora_a_embeddings.view( full_lora_a_embeddings.shape[0] * - full_lora_a_embeddings.shape[1], -1) + full_lora_a_embeddings.shape[1], + -1, + ) + # Embedding layer only need expand op - if current_platform.is_hpu(): - assert isinstance(self.punica_wrapper, GaudiPunicaWrapper) - self.punica_wrapper.add_lora_embedding(full_output, - full_lora_a_embeddings, - self.lora_b_stacked, - add_input=True) - else: - self.punica_wrapper.add_expand(full_output, - full_lora_a_embeddings, - 
self.lora_b_stacked, - add_input=True) + self.punica_wrapper.add_expand(full_output, + full_lora_a_embeddings, + self.lora_b_stacked, + add_input=True) return full_output.view_as(full_output_org) @classmethod diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 546a4c402aedc..bc4cab1470f44 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -4,7 +4,7 @@ import os import re from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union +from typing import Any, Callable, Dict, List, Optional, Type import safetensors.torch import torch @@ -26,12 +26,8 @@ parse_fine_tuned_lora_name, replace_submodule) from vllm.model_executor.models.interfaces import SupportsLoRA from vllm.model_executor.models.utils import PPMissingLayer -from vllm.platforms import current_platform from vllm.utils import is_pin_memory_available -if current_platform.is_hpu(): - from vllm_hpu_extension.punica_hpu import GaudiPunicaWrapper - logger = init_logger(__name__) _GLOBAL_LORA_ID = 0 @@ -49,116 +45,6 @@ class LongContextLoRAContext: offsets_by_lora_id: Dict[int, int] = field(default_factory=dict) -def convert_mapping( - mapping: LoRAMapping, - lora_index_to_id: List[Optional[int]], - max_loras: int, - vocab_size: int, - extra_vocab_size: int, - long_lora_context: Optional[LongContextLoRAContext] = None, -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, - Optional[torch.Tensor], List[int]]: - """Converts LoRAMapping to index tensors. - - Args: - mapping: LoRAMapping mapping rows in a batch to LoRA ids. - lora_index_to_id: List mapping LoRA ids to LoRA indices. - max_loras: Maximum number of LoRAs. - vocab_size: Model vocab size. - extra_vocab_size: Extra vocab size each LoRA can have. - long_lora_context: Passed if there are long context lora in a batch. - - Returns: - A tuple of tensors: - base_indices: Tensor of shape [batch_size] mapping batch rows to - LoRA indices. - sampler_indices: Tensor of shape [batch_size] mapping requests to - LoRA indices for sampler. For generation, this will be the - same as base_indicies. For prefill, this will map requests - to LoRA indices. - sampler_indices_padded: Tensor of shape [batch_size] mapping - requests to LoRA indices for sampler with padding. - Same as sampler_indicies, but -1 is replaced with - max_loras. - embeddings_indices: Tensor of shape [2, batch_size] mapping - requests to embedding indices. First row is for embeddings - added by the LoRAs, second row is for the LoRA.lora_a - embeddings. - long_lora_indices: Tensor of shape [batch_size] mapping - requests to RoPE offsets and rot dims for long LoRAs. - None if long context lora doesn't exist. - indices_len: List of lengths of the above tensors. - Used to index into each tensor. It contains length for - (base_indices, sampler_indices, sampler_indices_padded, - embeddings_indices, long_lora_indices). If long_lora doesn't - exist, it only contains first 4 entries. 
- """ - index_mapping_indices: List[int] = list(mapping.index_mapping).copy() - embedding_indices = index_mapping_indices.copy() - lora_indices = index_mapping_indices.copy() - long_lora_offsets: Optional[torch.Tensor] = None - device = "hpu" if current_platform.is_hpu() else "cuda" - if long_lora_context: - long_lora_offsets = torch.zeros(len(index_mapping_indices), - device=device, - dtype=torch.long) - prompt_mapping: List[int] = [ - lora_index_to_id.index(x) if x > 0 else -1 - for x in mapping.prompt_mapping - ] - lora_idx = None - for i in range(len(index_mapping_indices)): - # TODO index can be slow. optimize - lora_idx = (lora_index_to_id.index(index_mapping_indices[i]) - if index_mapping_indices[i] > 0 else -1) - embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0 - lora_indices[i] = lora_idx - if long_lora_context: - assert long_lora_offsets is not None - lora_offset: int = long_lora_context.offsets_by_lora_id.get( - index_mapping_indices[i], 0) - long_lora_offsets[i] = lora_offset - - indices_list: List[Union[List[int], torch.Tensor]] = [ - index_mapping_indices, lora_indices, embedding_indices - ] - if long_lora_context: - assert long_lora_offsets is not None - indices_list.append(long_lora_offsets) - indices = torch.tensor(indices_list, dtype=torch.long, device=device) - prompt_mapping_tensor = torch.tensor(prompt_mapping, - device=device, - dtype=torch.long) - embeddings_indices = torch.stack([ - indices[2] * extra_vocab_size, - indices[2] * (vocab_size + extra_vocab_size) - ]) - embeddings_indices[embeddings_indices == -1] = max_loras - 1 - base_indices = indices[1] - sampler_indices = prompt_mapping_tensor - sampler_indices_padded = sampler_indices.clone() - sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 - sampler_indices_padded = ( - torch.arange( - 0, len(sampler_indices_padded), device=device, dtype=torch.long) + - (sampler_indices_padded * len(sampler_indices_padded))) - long_lora_indices = None - long_lora_indices_len: Optional[int] = None - if long_lora_context: - long_lora_indices = indices[3] - long_lora_indices_len = long_lora_indices.shape[-1] - # Contain length of indices tensors. Used to index into each tensor. - indices_len = [ - base_indices.shape[-1], sampler_indices.shape[-1], - sampler_indices_padded.shape[-1], embeddings_indices.shape[-1] - ] - if long_lora_indices_len is not None: - indices_len.append(long_lora_indices_len) - - return (base_indices, sampler_indices, sampler_indices_padded, - embeddings_indices, long_lora_indices, indices_len) - - def get_lora_id(): global _GLOBAL_LORA_ID _GLOBAL_LORA_ID += 1 @@ -430,15 +316,9 @@ def __init__( self.lora_index_to_id: List[Optional[int]] = [None] * self.lora_slots self.vocab_size = vocab_size self.long_lora_context: Optional[LongContextLoRAContext] = None - if current_platform.is_hpu(): - self.punica_wrapper = GaudiPunicaWrapper( - max_num_batched_tokens, - max_batches=self.max_num_seqs, - device="hpu") - else: - self.punica_wrapper = PunicaWrapper(max_num_batched_tokens, - max_batches=self.max_num_seqs, - device="cuda") + self.punica_wrapper = PunicaWrapper(max_num_batched_tokens, + max_batches=self.max_num_seqs, + device="cuda") # Scaling factor -> offset to the sin_cos_cache to it. # Used for long context lora. 
self.scaling_factor_to_offset: Dict[float, int] = {} diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 5a2f02ee91456..6d5c834299961 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -10,7 +10,6 @@ import torch from vllm.triton_utils import HAS_TRITON -from vllm.utils import get_device if HAS_TRITON: from vllm.lora.ops.bgmv_expand import bgmv_expand @@ -105,7 +104,7 @@ def convert_mapping( long_lora_offsets: Optional[torch.Tensor] = None if long_lora_context: long_lora_offsets = torch.zeros(len(index_mapping_indices), - device=get_device(), + device="cuda", dtype=torch.long) prompt_mapping: List[int] = [ lora_index_to_id.index(x) if x > 0 else -1 @@ -132,9 +131,9 @@ def convert_mapping( if long_lora_context: assert long_lora_offsets is not None indices_list.append(long_lora_offsets) - indices = torch.tensor(indices_list, dtype=torch.long, device=get_device()) + indices = torch.tensor(indices_list, dtype=torch.long, device="cuda") prompt_mapping_tensor = torch.tensor(prompt_mapping, - device=get_device(), + device="cuda", dtype=torch.long) embeddings_indices = torch.stack([ indices[2] * extra_vocab_size, @@ -146,9 +145,8 @@ def convert_mapping( sampler_indices_padded = sampler_indices.clone() sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 sampler_indices_padded = torch.arange( - 0, len(sampler_indices_padded), device=get_device(), - dtype=torch.long) + (sampler_indices_padded * - len(sampler_indices_padded)) + 0, len(sampler_indices_padded), device="cuda", dtype=torch.long) + ( + sampler_indices_padded * len(sampler_indices_padded)) long_lora_indices = None long_lora_indices_len: Optional[int] = None if long_lora_context: From 97c398efdf497c9bd58cd77d45a10134f875c6f1 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 23 Sep 2024 17:49:44 +0300 Subject: [PATCH 231/341] prune unnecessary docs --- docs/source/getting_started/quickstart.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst index 71f4e4a1b6656..80b19ac672936 100644 --- a/docs/source/getting_started/quickstart.rst +++ b/docs/source/getting_started/quickstart.rst @@ -9,7 +9,7 @@ This guide shows how to use vLLM to: * build an API server for a large language model; * start an OpenAI-compatible API server. -Be sure to complete the `Gaudi installation instructions `_ before continuing with this guide. +Be sure to complete the :ref:`installation instructions ` before continuing with this guide. .. 
note:: From 6a913b3a90426b41c9fad5ffbb478decfb374d96 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 23 Sep 2024 17:50:17 +0300 Subject: [PATCH 232/341] revert requirements-build.txt changes --- requirements-build.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/requirements-build.txt b/requirements-build.txt index 6ec80356fdbca..3f08f5d67b6da 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1,8 +1,8 @@ -# Should be mirrored in pyproject.toml -cmake>=3.26 -ninja -packaging -setuptools>=49.4.0 -torch==2.4.0 -wheel -jinja2 +# Should be mirrored in pyproject.toml +cmake>=3.26 +ninja +packaging +setuptools>=49.4.0 +torch==2.4.0 +wheel +jinja2 From c64dc8359b717b9ceed84ee9feb65f1fb739ab60 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 23 Sep 2024 17:37:59 +0200 Subject: [PATCH 233/341] Move profilers to vllm-hpu-extension (#323) Continuation of https://github.com/HabanaAI/vllm-hpu-extension/pull/4 I've also removed is_tpu, as it got mistakenly restored in the rebase. It's not in the upstream. --- requirements-hpu.txt | 2 +- vllm/executor/habana_executor.py | 6 +- vllm/utils.py | 104 ------------------------ vllm/worker/habana_model_runner.py | 10 +-- vllm/worker/habana_worker.py | 4 +- vllm/worker/profiler.py | 126 ----------------------------- 6 files changed, 12 insertions(+), 240 deletions(-) delete mode 100644 vllm/worker/profiler.py diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 56caa4ba03862..1ab81898b5f7e 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -6,4 +6,4 @@ ray == 2.32.0 triton pandas tabulate -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@bdd4f2b +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@0e05e25 diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index 6e92da0245836..44226fc898218 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -6,14 +6,16 @@ import os from typing import Any, Dict, List, Optional, Set, Tuple +from vllm_hpu_extension.profiler import HabanaMemoryProfiler + from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest -from vllm.utils import (HabanaMemoryProfiler, get_distributed_init_method, - get_ip, get_open_port, make_async) +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + make_async) from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) diff --git a/vllm/utils.py b/vllm/utils.py index f7e7a64619b1f..e5cef9b4419c0 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -374,15 +374,6 @@ def _is_built_for_hpu() -> bool: return False -@lru_cache(maxsize=None) -def is_tpu() -> bool: - try: - import libtpu - except ImportError: - libtpu = None - return libtpu is not None - - @lru_cache(maxsize=None) def is_xpu() -> bool: from importlib.metadata import PackageNotFoundError, version @@ -785,107 +776,12 @@ def print_warning_once(msg: str) -> None: logger.warning(msg) -# Adapted from https://stackoverflow.com/a/49361727 -def format_bytes(size): - # 2**10 = 1024 - power = 2**10 - n = 0 - power_labels = {0: '', 1: 'Ki', 2: 'Mi', 3: 'Gi', 4: 'Ti'} - while abs(size) > power: - size /= power - n += 1 - return f'{size:.4g} 
{power_labels[n]+"B"}' - - def get_device() -> str: if is_hpu(): return "hpu" return "cuda" -class HabanaMemoryProfiler: - - def __init__(self, device=None): - self.device = device - - @staticmethod - def current_device_memory_usage() -> float: - if is_fake_hpu(): - return 0 - # Return the device memory usage in bytes. - free_hpu_memory, total_hpu_memory = torch.hpu.mem_get_info() - return total_hpu_memory - free_hpu_memory - - @staticmethod - def current_free_device_memory() -> float: - if is_fake_hpu(): - return 0 - # Return the device memory usage in bytes. - free_hpu_memory, _ = torch.hpu.mem_get_info() - return free_hpu_memory - - @staticmethod - def total_device_memory() -> float: - if is_fake_hpu(): - return 0 - # Return the device memory usage in bytes. - _, total_hpu_memory = torch.hpu.mem_get_info() - return total_hpu_memory - - @staticmethod - def current_host_memory_usage() -> float: - # Return the host memory usage in bytes. - return HabanaMemoryProfiler.total_host_memory( - ) - HabanaMemoryProfiler.current_free_host_memory() - - @staticmethod - def current_free_host_memory() -> float: - # Return the host memory usage in bytes. - return psutil.virtual_memory().available - - @staticmethod - def total_host_memory() -> float: - # Return the host memory usage in bytes. - return psutil.virtual_memory().total - - def get_summary_string(self): - if getattr(self, 'final_device_memory', None) is None or getattr( - self, 'final_host_memory', None) is None: - raise RuntimeError( - "HabanaMemoryProfiler.get_summary_string() can only be called " - "after closing context manager") - return ( - f"{format_bytes(self.consumed_device_memory)} of device memory " - f"({format_bytes(self.final_device_memory)}/" - f"{format_bytes(HabanaMemoryProfiler.total_device_memory())} used)" - f" and {format_bytes(self.consumed_host_memory)} of host memory " - f"({format_bytes(self.final_host_memory)}/" - f"{format_bytes(HabanaMemoryProfiler.total_host_memory())} used)") - - def __enter__(self): - # Force garbage collection - gc.collect() - self.initial_device_memory = \ - HabanaMemoryProfiler.current_device_memory_usage() - self.initial_host_memory = \ - HabanaMemoryProfiler.current_host_memory_usage() - # This allows us to call methods of the context manager if needed - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - # Force garbage collection - gc.collect() - self.final_device_memory = \ - HabanaMemoryProfiler.current_device_memory_usage( - ) - self.final_host_memory = HabanaMemoryProfiler.current_host_memory_usage( - ) - self.consumed_device_memory = \ - self.final_device_memory - self.initial_device_memory - self.consumed_host_memory = \ - self.final_host_memory - self.initial_host_memory - - @lru_cache(maxsize=None) def is_pin_memory_available() -> bool: diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 6cd80eb15107e..c99500ef1296b 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -21,6 +21,8 @@ import habana_frameworks.torch.internal.bridge_config as bc import torch from vllm_hpu_extension.ops import LoraMask as LoraMask +from vllm_hpu_extension.profiler import (HabanaHighLevelProfiler, + HabanaMemoryProfiler, format_bytes) from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, @@ -39,8 +41,8 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import (IntermediateTensors, SequenceData, 
SequenceGroupMetadata) -from vllm.utils import (HabanaMemoryProfiler, format_bytes, is_fake_hpu, - is_pin_memory_available, make_tensor_with_pad) +from vllm.utils import (is_fake_hpu, is_pin_memory_available, + make_tensor_with_pad) from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, _add_attn_metadata_broadcastable_dict, @@ -48,8 +50,6 @@ _init_attn_metadata_from_tensor_dict, _init_sampling_metadata_from_tensor_dict) -from .profiler import Profiler - if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionBackend @@ -517,7 +517,7 @@ def __init__( self.prompt_adapter_config = prompt_adapter_config self.return_hidden_states = return_hidden_states self.observability_config = observability_config - self.profiler = Profiler() + self.profiler = HabanaHighLevelProfiler() self.sliding_window = (model_config.get_sliding_window() if model_config is not None else None) diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index f2678c5e405dc..8cdbba02fbb33 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -9,6 +9,7 @@ import habana_frameworks.torch as htorch # noqa:F401 import torch import torch.distributed +from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ObservabilityConfig, ParallelConfig, @@ -21,8 +22,7 @@ from vllm.model_executor import set_random_seed from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest -from vllm.utils import (HabanaMemoryProfiler, format_bytes, hpu_backend_string, - hpu_device_string, is_fake_hpu) +from vllm.utils import hpu_backend_string, hpu_device_string, is_fake_hpu from vllm.worker.cache_engine import CacheEngine from vllm.worker.habana_model_runner import HabanaModelRunner from vllm.worker.model_runner_base import ModelRunnerBase diff --git a/vllm/worker/profiler.py b/vllm/worker/profiler.py deleted file mode 100644 index 48348de41f520..0000000000000 --- a/vllm/worker/profiler.py +++ /dev/null @@ -1,126 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company -############################################################################### - -import json -import os -import queue -import threading -import time -from contextlib import contextmanager -from typing import Any, List - -from vllm.logger import init_logger -from vllm.utils import get_vllm_instance_id - -logger = init_logger(__name__) - - -class FileWriter(threading.Thread): - - def __init__(self, filename, event_queue): - super().__init__() - self.filename = filename - self.event_queue = event_queue - self.daemon = True - self.timer_event = threading.Event() - - def _drain_event_queue(self): - content = '' - while True: - try: - element = self.event_queue.get_nowait() - content += element - except queue.Empty: - break - return content - - def run(self): - # don't check the queue too often - while not self.timer_event.wait(1): - # Block and wait for the next item in the queue - content = self.event_queue.get() - # Collect any other items in the queue - content += self._drain_event_queue() - - with open(self.filename, 'a') as outfile: - outfile.write(content) - - -class Profiler: - profiling_trace_events: queue.Queue = queue.Queue() - event_tid = {'counter': 1, 'external': 2, 'internal': 3} - vllm_instance_id = get_vllm_instance_id() - filename = f'server_events_{vllm_instance_id}.json' - event_cache: List[Any] = [] - - def __init__(self): - self.enabled = os.getenv('VLLM_PROFILER_ENABLED', - 'false').lower() == 'true' and int( - os.getenv('RANK', '0')) == 0 - msg = f'Profiler enabled for: {self.vllm_instance_id}' - logger.info(msg) - if self.enabled: - # initialize the trace file (JSON Array Format) - with open(self.filename, 'w') as outfile: - outfile.write('[') - file_writer = FileWriter(self.filename, - self.profiling_trace_events) - file_writer.start() - - def _dump_with_sep(self, entry): - entry = json.dumps(entry) + ',' - self.profiling_trace_events.put(entry) - - def get_timestamp_us(self): - return time.time() * 1000000.0 - - def record_counter(self, ts, counter): - if self.enabled: - self._dump_with_sep({ - 'pid': 1, - 'tid': self.event_tid['counter'], - 'ph': 'C', - 'name': 'utils', - 'ts': ts, - 'args': counter - }) - - def start(self, type, name, args=None): - if self.enabled: - ts = self.get_timestamp_us() - if args is not None and 'counter' in args: - self.record_counter(ts, args['counter']) - del args['counter'] - event = { - 'pid': 1, - 'tid': self.event_tid[type], - 'ph': 'X', - 'name': name, - 'ts': ts, - 'dur': None, - 'args': args - } - self.event_cache.append(event) - - def end(self): - if self.enabled: - ts = self.get_timestamp_us() - if not self.event_cache: - logger.warning( - 'Profiler: end() call does not have matching start() call. ' - 'Disabling profiler.') - self.enabled = False - return - event = self.event_cache.pop() - event['dur'] = ts - event['ts'] - self._dump_with_sep(event) - - @contextmanager - def record_event(self, type, name, args=None): - if self.enabled: - self.start(type, name, args) - yield - self.end() - else: - yield From c562b0274e447023475e654533ea8d5cb044ffe2 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 23 Sep 2024 18:44:43 +0300 Subject: [PATCH 234/341] Revert "Add fake HPU mode to Habana components with dummy habana_frameworks module. (#250)" This reverts commit a9de5ba2385d5c332a2610a055465234905ff334. 
--- vllm/__init__.py | 4 ---- vllm/executor/ray_habana_executor.py | 12 +++++------- vllm/model_executor/model_loader/loader.py | 7 ++----- vllm/model_executor/models/opt.py | 1 + vllm/utils.py | 22 ---------------------- vllm/worker/cache_engine.py | 4 ++-- vllm/worker/habana_model_runner.py | 11 ++++------- vllm/worker/habana_worker.py | 16 +++------------- 8 files changed, 17 insertions(+), 60 deletions(-) diff --git a/vllm/__init__.py b/vllm/__init__.py index 0c4cd03210898..59af68fb493e5 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -1,8 +1,4 @@ """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" -from vllm.utils import is_fake_hpu, migrate_to_cpu - -if is_fake_hpu(): - migrate_to_cpu() from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index 645bceb1af446..15294e78824d0 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -17,7 +17,7 @@ from vllm.sequence import ExecuteModelRequest from vllm.utils import (_run_task_with_lock, get_distributed_init_method, get_ip, get_open_port, get_vllm_instance_id, - is_fake_hpu, make_async) + make_async) from vllm.worker.worker_base import WorkerBase if ray is not None: @@ -127,20 +127,18 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", driver_ip = get_ip() worker_wrapper_kwargs = self._get_worker_wrapper_args() for bundle_id, bundle in enumerate(placement_group.bundle_specs): - resource_name = "HPU" if not is_fake_hpu() else "CPU" - if not bundle.get(resource_name, 0): + if not bundle.get("HPU", 0): continue scheduling_strategy = PlacementGroupSchedulingStrategy( placement_group=placement_group, placement_group_capture_child_tasks=True, placement_group_bundle_index=bundle_id, ) - resources = {'HPU': num_gpus} if not is_fake_hpu() else {} - num_cpus = 0 if not is_fake_hpu() else num_gpus + worker = ray.remote( - num_cpus=num_cpus, + num_cpus=0, num_gpus=0, - resources=resources, + resources={'HPU': num_gpus}, scheduling_strategy=scheduling_strategy, **ray_remote_kwargs, )(RayWorkerWrapper).remote(**worker_wrapper_kwargs) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index b03e6aca48c0e..98c9826a3e3fb 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -44,7 +44,7 @@ supports_multimodal) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils import is_fake_hpu, is_pin_memory_available +from vllm.utils import is_pin_memory_available @contextmanager @@ -356,10 +356,7 @@ def load_model(self, *, model_config: ModelConfig, cache_config: CacheConfig) -> nn.Module: target_device = torch.device(device_config.device) with set_default_torch_dtype(model_config.dtype): - _device = torch.device( - device_config.device) if is_fake_hpu() else torch.device( - self.load_config.device) - with _device: + with torch.device(self.load_config.device): model = _initialize_model(model_config, self.load_config, lora_config, cache_config, scheduler_config) diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 47ec718a43420..88d2bcb9f0c9d 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -254,6 +254,7 @@ def forward( if self.project_in is not None: inputs_embeds, _ = 
self.project_in(inputs_embeds) hidden_states = inputs_embeds + pos_embeds + for i in range(len(self.layers)): layer = self.layers[i] hidden_states = layer(hidden_states, kv_caches[i], attn_metadata) diff --git a/vllm/utils.py b/vllm/utils.py index e5cef9b4419c0..daeed88114c47 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -339,28 +339,6 @@ def is_neuron() -> bool: @lru_cache(maxsize=None) def is_hpu() -> bool: - return _is_habana_frameworks_installed() or _is_built_for_hpu() - - -@lru_cache(maxsize=None) -def is_fake_hpu() -> bool: - return os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0' - - -@lru_cache(maxsize=None) -def hpu_device_string(): - device_string = 'hpu' if not is_fake_hpu() else 'cpu' - return device_string - - -@lru_cache(maxsize=None) -def hpu_backend_string(): - backend_string = 'hccl' if not is_fake_hpu() else 'gloo' - return backend_string - - -@lru_cache(maxsize=None) -def _is_habana_frameworks_installed() -> bool: from importlib import util return util.find_spec('habana_frameworks') is not None diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index f678d44f71dd3..ec0b8c2369210 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -6,7 +6,7 @@ from vllm.attention import get_attn_backend from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_fake_hpu, +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_pin_memory_available) logger = init_logger(__name__) @@ -78,7 +78,7 @@ def _allocate_kv_cache( pin_memory = is_pin_memory_available() if device == "cpu" else False kv_cache: List[torch.Tensor] = [] for _ in range(self.num_attention_layers): - if device == 'hpu' or is_fake_hpu(): + if device == 'hpu': key_cache = torch.zeros(kv_cache_shape, dtype=self.dtype, device=device) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index c99500ef1296b..f9dd1597f6615 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -41,7 +41,7 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import (IntermediateTensors, SequenceData, SequenceGroupMetadata) -from vllm.utils import (is_fake_hpu, is_pin_memory_available, +from vllm.utils import (is_pin_memory_available, make_tensor_with_pad) from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, @@ -253,8 +253,7 @@ def __init__(self, model, block_size, dtype, enforce_eager): '0').lower() in ['1', 'true'] self.block_size = block_size self.dtype = dtype - if not is_fake_hpu() and not htorch.utils.internal.is_lazy( - ) and not enforce_eager: + if not htorch.utils.internal.is_lazy() and not enforce_eager: self.model = torch.compile(self.model, backend='hpu_backend', dynamic=False) @@ -523,9 +522,7 @@ def __init__( if model_config is not None else None) self.device_config = (device_config if device_config is not None else DeviceConfig()) - if is_fake_hpu(): - device_config.device = torch.device('cpu') - device_config.device_type = 'cpu' + self.device = self.device_config.device self.enforce_eager = self.model_config.enforce_eager self.max_num_seqs = self.scheduler_config.max_num_seqs @@ -634,7 +631,7 @@ def load_model(self) -> None: mark_only_scales_as_const=True) logger.info("Preparing model with INC took %s", m_inc.get_summary_string()) - elif not is_fake_hpu(): + else: self.model = self.model.to("hpu") htcore.mark_step() 
torch.hpu.synchronize() diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 8cdbba02fbb33..55e9b7bf7e744 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -22,7 +22,6 @@ from vllm.model_executor import set_random_seed from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest -from vllm.utils import hpu_backend_string, hpu_device_string, is_fake_hpu from vllm.worker.cache_engine import CacheEngine from vllm.worker.habana_model_runner import HabanaModelRunner from vllm.worker.model_runner_base import ModelRunnerBase @@ -110,8 +109,6 @@ def init_device(self) -> None: if self.device_config.device.type == "hpu": self.device = torch.device("hpu") torch.hpu.set_device(self.device) - elif self.device_config.device_type == "cpu": - self.device = torch.device("cpu") else: raise RuntimeError( f"Not support device type: {self.device_config.device}") @@ -145,10 +142,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Execute a forward pass with dummy inputs to profile the memory usage # of the model. - if is_fake_hpu(): - cache_block_size = self.get_cache_block_size_bytes() - fake_hpu_cache_alloc = 4 * 2**30 # take 4 GiB flat on fake hpu - return fake_hpu_cache_alloc // cache_block_size, 0 with HabanaMemoryProfiler() as m: self.model_runner.profile_run() torch.hpu.synchronize() @@ -346,12 +339,11 @@ def init_worker_distributed_environment( local_rank: int = -1, ) -> None: """Initialize the distributed environment.""" - backend = hpu_backend_string() init_distributed_environment(parallel_config.world_size, rank, distributed_init_method, local_rank, - backend=backend) + backend='hccl') ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) @@ -368,17 +360,15 @@ def init_worker_distributed_environment( "distributed_init_method must be set if torch.distributed " "is not already initialized") else: - backend = hpu_backend_string() torch.distributed.init_process_group( - backend=backend, + backend="hccl", world_size=parallel_config.world_size, rank=rank, init_method=distributed_init_method, ) # A small all_reduce for warmup & checking conformance. 
- device = hpu_device_string() - dummy_tensor_hpu = torch.ones(1).to(device) + dummy_tensor_hpu = torch.ones(1).to('hpu') torch.distributed.all_reduce(dummy_tensor_hpu) assert dummy_tensor_hpu.item() == parallel_config.world_size ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, From cf3bbd2c19a524d824c4eb1d89d2520294715384 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 23 Sep 2024 18:47:10 +0300 Subject: [PATCH 235/341] fix revert --- vllm/executor/ray_utils.py | 4 ++-- vllm/worker/habana_model_runner.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 8971f5aac626e..45b40dee884ab 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -10,7 +10,7 @@ from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.sequence import ExecuteModelRequest, IntermediateTensors -from vllm.utils import get_ip, hpu_device_string, is_hip, is_xpu +from vllm.utils import get_ip, is_hip, is_xpu from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -241,7 +241,7 @@ def initialize_ray_cluster( if current_platform.is_tpu(): device_str = "TPU" elif current_platform.is_hpu(): - device_str = hpu_device_string().upper() + device_str = 'HPU' # Create placement group for worker processes current_placement_group = ray.util.get_current_placement_group() if current_placement_group: diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index f9dd1597f6615..5eb0a3f35b393 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -41,8 +41,7 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import (IntermediateTensors, SequenceData, SequenceGroupMetadata) -from vllm.utils import (is_pin_memory_available, - make_tensor_with_pad) +from vllm.utils import is_pin_memory_available, make_tensor_with_pad from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, _add_attn_metadata_broadcastable_dict, From 09357b427287ad17795291e11a52a5f743035f2a Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 23 Sep 2024 18:54:03 +0300 Subject: [PATCH 236/341] Revert "Initial commit" This reverts commit 2ab316db5f9f5f2944cbac68132769411e4833de. 
--- tests/samplers/test_sampler.py | 25 ------ vllm/model_executor/layers/sampler.py | 112 +------------------------- 2 files changed, 2 insertions(+), 135 deletions(-) mode change 100755 => 100644 vllm/model_executor/layers/sampler.py diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 308b708feab71..49bbc206d3daa 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -743,28 +743,3 @@ def test_sampling_params(sampling_params: List[SamplingParams]): assert tokens1[0] == tokens2[1] assert tokens1[1] == tokens2[0] - - -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_include_gpu_probs_tensor(device: str): - set_random_seed(42) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - sampler.include_gpu_probs_tensor = True - sampler.should_modify_greedy_probs_inplace = False - - sampling_params = SamplingParams(temperature=0) - - mock_inplace = Mock() - with patch( - "vllm.model_executor.layers.sampler._modify_greedy_probs_inplace", - mock_inplace): - - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - mock_inplace.assert_not_called() - - assert sampler_output.sampled_token_probs is not None - assert sampler_output.logprobs is not None - assert sampler_output.sampled_token_ids is not None diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py old mode 100755 new mode 100644 index 6da6199a01962..2ca86a4653cf4 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -1,6 +1,5 @@ """A layer that samples the next tokens from the model's outputs.""" import itertools -import math import warnings from dataclasses import dataclass from importlib.util import find_spec @@ -199,13 +198,6 @@ def _init_sampling_tensors( self._do_penalties = do_penalties self._do_top_p_top_k = do_top_p_top_k self._do_min_p = do_min_p - self._top_p_scalar = sampling_tensors.top_ps[0].item() - self._top_k_scalar = sampling_tensors.top_ks[0].item() - scalar_p = torch.all(sampling_tensors.top_ps == self._top_p_scalar) - scalar_k = torch.all(sampling_tensors.top_ks == self._top_k_scalar) - self._scalar_p_and_k = (scalar_p and scalar_k).item() - if self._scalar_p_and_k and self._do_top_p_top_k: - self._apply_top_k_top_p_opt = ApplyToppTopkScalar(5) def forward( self, @@ -265,13 +257,8 @@ def forward( logits.div_(sampling_tensors.temperatures.unsqueeze(dim=1)) if do_top_p_top_k and flashinfer_top_k_top_p_sampling is None: - if self._scalar_p_and_k: - logits = self._apply_top_k_top_p_opt(logits, - self._top_p_scalar, - self._top_k_scalar) - else: - logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, - sampling_tensors.top_ks) + logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, + sampling_tensors.top_ks) if do_min_p: logits = _apply_min_p(logits, sampling_tensors.min_ps) @@ -353,101 +340,6 @@ def _get_bin_counts_and_mask( return bin_counts, mask -class ApplyToppTopkScalar(): - """ - The original implementation of _apply_top_k_top_p is more general - as it uses vector topp, topk - However in a lot of cases, topp and topk is same for all batch elements - For such "scalar" topp, topk cases, we can use this class - - The main optimizations in this class is: - Use topk instead of sort, which is much faster especially for small k. 
- However just using topk might not suffice in cases as shown below - Consider a tensor: 9 9 8 8 8 8 7 7 7 - Topk, with k=5, on this yields 9 9 8 8 8 - The value "8" is on the boundary, hence the last "8" gets snipped off - However the original implementation accepts all the "8"s, - so it should output: - 9 9 8 8 8 8 (6 values, even though k=5) - To ensure these semantics, we perform topk with _padded_k elements - If we find more boundary elements left over, - then we keep incrementing _padded_k - and in future calls use the expanded value of __padded_k - - The increments to _padded_k should be done - with value > 1 to prevent excessive recompilations - due to dynamic shapes (the output shape of the topk) - - The main logic of this is in __call__ - This is a class instead of a function, just to keep track of - the monotonic non-decreasing state _padded_k - """ - _padded_k = 0 - - def __init__(self, increment: int): - self._increment = increment - - def __call__(self, logits: torch.Tensor, p: float, k: int): - if k > ApplyToppTopkScalar._padded_k: - ApplyToppTopkScalar._padded_k = min(k + self._increment, - logits.shape[1]) - - vals, idx = torch.topk(logits, k=ApplyToppTopkScalar._padded_k, \ - dim=1, sorted=True) - - # this "if" checks if we have bucketed so much that - # we have padded k upto shape of logits - if ApplyToppTopkScalar._padded_k != logits.shape[1]: - smallest_of_top_k = vals[:, k - 1] - num_duplicates_of_smallest_of_topk = torch.sum( - logits == smallest_of_top_k.unsqueeze(1), 1) - max_num_duplicates_of_smallest_of_topk = torch.max( - num_duplicates_of_smallest_of_topk).item() - - # there are n repeats for a border - # (border meaning the smallest value of the top k). - # we do not know if only 1 or 2 or (n-1) - # of them lie outside the kth border, - # so we choose to conservatively increase by n-1 - # when num_duplicates > _padded_k - k - if max_num_duplicates_of_smallest_of_topk - 1 > ( - ApplyToppTopkScalar._padded_k - k): - incr = int( - math.ceil((max_num_duplicates_of_smallest_of_topk - 1) / - self._increment) * self._increment) - # this while loop should be traversed at most twice, - # because we dont increment by self._increment and retry - # instead we compute incr in one go - ApplyToppTopkScalar._padded_k = min( - ApplyToppTopkScalar._padded_k + incr, logits.shape[1]) - - # recompute topk with expanded padded_k - vals, idx = torch.topk(logits, \ - k=ApplyToppTopkScalar._padded_k, \ - dim=1, sorted=True) - - idx = torch.fliplr(idx) - vals = torch.fliplr(vals) - - top_k_smallest_val_idx = vals.size(1) - k - top_k_mask = vals[:, top_k_smallest_val_idx].unsqueeze(1) - top_k_mask = vals < top_k_mask - vals.masked_fill_(top_k_mask, -float("inf")) - - probs_sort = vals.softmax(dim=-1) - probs_sum = probs_sort.cumsum(dim=-1) - top_p_mask = probs_sum <= (1 - p) - top_p_mask[:, -1] = False - vals.masked_fill_(top_p_mask, -float("inf")) - - new_logits = torch.full(logits.shape, - -float("inf"), - device=logits.device) - new_logits.scatter_(1, idx, vals.to(new_logits.dtype)) - - return new_logits - - def _apply_min_tokens_penalty( logits: torch.Tensor, sampling_metadata: SamplingMetadata, From 3713da8dfa2d4ac05951c5821ccc1fe2f689ff27 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 23 Sep 2024 19:31:50 +0300 Subject: [PATCH 237/341] cleanup --- tests/samplers/test_sampler.py | 25 +++++++++++++++++++++++++ vllm/engine/llm_engine.py | 3 --- vllm/entrypoints/llm.py | 3 --- vllm/platforms/__init__.py | 3 +-- vllm/utils.py | 26 -------------------------- 5 files changed, 26 
insertions(+), 34 deletions(-) diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 49bbc206d3daa..308b708feab71 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -743,3 +743,28 @@ def test_sampling_params(sampling_params: List[SamplingParams]): assert tokens1[0] == tokens2[1] assert tokens1[1] == tokens2[0] + + +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_include_gpu_probs_tensor(device: str): + set_random_seed(42) + torch.set_default_device(device) + batch_size = random.randint(1, 256) + _, fake_logits, sampler = _prepare_test(batch_size) + sampler.include_gpu_probs_tensor = True + sampler.should_modify_greedy_probs_inplace = False + + sampling_params = SamplingParams(temperature=0) + + mock_inplace = Mock() + with patch( + "vllm.model_executor.layers.sampler._modify_greedy_probs_inplace", + mock_inplace): + + sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + mock_inplace.assert_not_called() + + assert sampler_output.sampled_token_probs is not None + assert sampler_output.logprobs is not None + assert sampler_output.sampled_token_ids is not None diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 8030dca8c3fd5..92d638a8653ed 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1108,9 +1108,6 @@ def _advance_to_next_step( seq = seq_group.seqs[0] seq.append_token_id(sample.output_token, sample.logprobs) - def finish_measurements(self): - self.model_executor.finish_measurements() - def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index c8c231f97cd9c..a86c51d23b34d 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -196,9 +196,6 @@ def set_tokenizer(self, tokenizer: AnyTokenizer) -> None: else: tokenizer_group.tokenizer = get_cached_tokenizer(tokenizer) - def finish_measurements(self): - self.llm_engine.finish_measurements() - @overload # LEGACY: single (prompt + optional token ids) def generate( self, diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index e3b7dd3bb216e..eabcf4cec41bb 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -46,8 +46,7 @@ try: import os from importlib import util - is_hpu = util.find_spec('habana_frameworks') is not None or os.environ.get( - 'VLLM_USE_FAKE_HPU', '0') != '0' + is_hpu = util.find_spec('habana_frameworks') is not None except Exception: pass diff --git a/vllm/utils.py b/vllm/utils.py index daeed88114c47..d2e8b7630a2f9 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1347,29 +1347,3 @@ def dec(self, num=1): @property def value(self): return self._value - - -def migrate_to_cpu(): - import importlib - from unittest.mock import MagicMock - - torch.hpu = MagicMock(name="torch.hpu") - - # Adding dummy submodules to habana_frameworks.torch for cpu-test, - # functions from dummy modules will do nothing by default - spec = importlib.util.spec_from_loader('habana_frameworks', loader=None) - sys.modules['habana_frameworks'] = MagicMock() - sys.modules['habana_frameworks'].__spec__ = spec - - builtin_import = __builtins__['__import__'] # type: ignore - - def import_wrapper(name, *args, **kwargs): - if 'habana_frameworks' in name: - sys.modules[name] = MagicMock() - return builtin_import(name, *args, **kwargs) - - __builtins__['__import__'] = import_wrapper - - # In case you 
want to mock a function to actually do something - import habana_frameworks.torch as htorch - htorch.utils.internal.is_lazy.return_value = False From bb6564a7a895357781558537230402070dbb0527 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 23 Sep 2024 19:34:11 +0300 Subject: [PATCH 238/341] remove redundant import --- vllm/platforms/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index eabcf4cec41bb..d3620259b7d4e 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -44,7 +44,6 @@ is_hpu = False try: - import os from importlib import util is_hpu = util.find_spec('habana_frameworks') is not None From c9683205eda7a9d850e373be9fc495d7bc39e6ba Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 10:32:42 +0200 Subject: [PATCH 239/341] Restore upstream requirements-build.txt (#324) At some point, someone added whitespaces to each entry in requirements-build.txt. Upstream does not contain it. Easy fix. --- requirements-build.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/requirements-build.txt b/requirements-build.txt index 6ec80356fdbca..3f08f5d67b6da 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1,8 +1,8 @@ -# Should be mirrored in pyproject.toml -cmake>=3.26 -ninja -packaging -setuptools>=49.4.0 -torch==2.4.0 -wheel -jinja2 +# Should be mirrored in pyproject.toml +cmake>=3.26 +ninja +packaging +setuptools>=49.4.0 +torch==2.4.0 +wheel +jinja2 From 58d5cde67662fd48bbc8d0f229991f7e79eeffb7 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 10:32:53 +0200 Subject: [PATCH 240/341] Remove reminder_comment.yml workflow (#325) This workflow never worked properly in the fork. This PR removes it. --- .github/workflows/reminder_comment.yml | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 .github/workflows/reminder_comment.yml diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml deleted file mode 100644 index 99827756d2066..0000000000000 --- a/.github/workflows/reminder_comment.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: PR Reminder Comment Bot -on: - pull_request_target: - types: [opened] - -jobs: - pr_reminder: - runs-on: ubuntu-latest - steps: - - name: Remind to run full CI on PR - uses: actions/github-script@v6 - with: - script: | - github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. 
\n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀' - }) - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From cf4c3e5d117a44472137fc08f04b90b64eba7bab Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 10:33:03 +0200 Subject: [PATCH 241/341] Don't throw "Failed to import from vllm._C" warning on HPU (#326) --- vllm/_custom_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 678700055c992..a41b0e40b11d6 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -11,7 +11,7 @@ logger = init_logger(__name__) -if not current_platform.is_tpu(): +if not current_platform.is_tpu() and not current_platform.is_hpu(): try: import vllm._C except ImportError as e: From f6ff4a759922b27f9f22527d8a1295c433700dcb Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 11:37:47 +0300 Subject: [PATCH 242/341] restore reminder_comment.yml --- .github/workflows/reminder_comment.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .github/workflows/reminder_comment.yml diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml new file mode 100644 index 0000000000000..99827756d2066 --- /dev/null +++ b/.github/workflows/reminder_comment.yml @@ -0,0 +1,21 @@ +name: PR Reminder Comment Bot +on: + pull_request_target: + types: [opened] + +jobs: + pr_reminder: + runs-on: ubuntu-latest + steps: + - name: Remind to run full CI on PR + uses: actions/github-script@v6 + with: + script: | + github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀' + }) + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From a000e628b1fc8379b072bd141c2a79a7c7c2dac1 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 11:44:11 +0300 Subject: [PATCH 243/341] Revert "[Doc][BugFix] Update setup instructions and reference links (#191)" This reverts commit 8185d760325a7699c5c07f7cd0e28d443a36051b. 
--- .../getting_started/gaudi-installation.rst | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index 4c094eaec842a..c5a90ea41abcd 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -30,7 +30,7 @@ To verify that the Intel Gaudi software was correctly installed, run: $ pip list | grep neural # verify that neural_compressor is installed Refer to `Intel Gaudi Software Stack -Verification `__ +Verification `__ for more details. Run Docker Image @@ -51,6 +51,15 @@ Use the following commands to run a Docker image: Build and Install vLLM --------------------------- +To build and install vLLM from source, run: + +.. code:: console + + $ git clone https://github.com/vllm-project/vllm.git + $ cd vllm + $ python setup.py develop + + Currently, the latest features and performance optimizations are developed in Gaudi's `vLLM-fork `__ and we periodically upstream them to vLLM main repo. To install latest `HabanaAI/vLLM-fork `__, run the following: .. code:: console @@ -58,16 +67,16 @@ Currently, the latest features and performance optimizations are developed in Ga $ git clone https://github.com/HabanaAI/vllm-fork.git $ cd vllm-fork $ git checkout habana_main - $ pip install -e . + $ python setup.py develop Supported Features ================== - `Offline batched - inference `__ + inference `__ - Online inference via `OpenAI-Compatible - Server `__ + Server `__ - HPU autodetection - no need to manually select device within vLLM - Paged KV cache with algorithms enabled for Intel Gaudi accelerators - Custom Intel Gaudi implementations of Paged Attention, KV cache ops, From 41217cfacfd949912fbe0eda6196d3e44236433e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 11:05:25 +0200 Subject: [PATCH 244/341] Fix doc build warnings (#330) This PR fixes all the little warnings gaudi-installation.rst introduces during documentation build ("WARNING: Title underline too short." etc.) --- docs/source/getting_started/gaudi-installation.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index 4c094eaec842a..8c4905e2a488a 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -129,10 +129,10 @@ Gaudi2 devices. Configurations that are not listed may or may not work. with tensor parallelism on 2x HPU, BF16 datatype with random or greedy sampling Performance Tuning -================ +================== Execution modes ------------- +--------------- Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via ``PT_HPU_LAZY_MODE`` environment variable), and ``--enforce-eager`` flag. @@ -161,7 +161,7 @@ Currently in vLLM for HPU we support four execution modes, depending on selected Bucketing mechanism ------------- +------------------- Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. `Intel Gaudi Graph Compiler `__ is responsible for generating optimized binary code that implements the given model topology on Gaudi. 
In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - ``batch_size`` and ``sequence_length``. @@ -234,7 +234,7 @@ This example uses the same buckets as in *Bucketing mechanism* section. Each out Compiling all the buckets might take some time and can be turned off with ``VLLM_SKIP_WARMUP=true`` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. HPU Graph capture ------------- +----------------- `HPU Graphs `__ are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management. @@ -298,7 +298,7 @@ Each described step is logged by vLLM server, as follows (negative values corres Recommended vLLM Parameters ------------- +--------------------------- - We recommend running inference on Gaudi 2 with ``block_size`` of 128 for BF16 data type. Using default values (16, 32) might lead to @@ -310,7 +310,7 @@ Recommended vLLM Parameters If you encounter out-of-memory issues, see troubleshooting section. Environment variables ------------- +--------------------- **Diagnostic and profiling knobs:** From 4eb9809a4d80aab8168cea2cc9906558abd486c3 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 24 Sep 2024 17:08:40 +0800 Subject: [PATCH 245/341] fix qwen2 model issue (#329) FILL IN THE PR DESCRIPTION HERE typo: `platform` -> `platforms` FIX #xxxx (*link existing issues this PR will resolve*) **BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE** ---
--- vllm/model_executor/models/qwen2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 1e6fb4ad1ca7c..7fd90b2e8b282 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -46,7 +46,7 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.platform import current_platform +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA From 20c87dd0278af941bac33063cdab36fec00548ff Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 12:15:27 +0300 Subject: [PATCH 246/341] update docs --- docs/source/index.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index b4cd28608d3f0..37afbf23b73cc 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -43,8 +43,7 @@ vLLM is flexible and easy to use with: * Tensor parallelism and pipeline parallelism support for distributed inference * Streaming outputs * OpenAI-compatible API server -* Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators. -* (Experimental) Support for Intel® Gaudi® 2 accelerators +* Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and Gaudi® accelerators, GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators. * Prefix caching support * Multi-lora support From 9be37a356ada10dfea80e58613f710b19877c487 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 11:46:00 +0200 Subject: [PATCH 247/341] Remove vllm.utils.is_hpu() (#331) vllm.utils.is_hpu() was redundant for some time now and has always been problematic particularly for torch.compile mode. Now, we're fully switching to current_platform.is_hpu(). --- requirements-hpu.txt | 2 +- vllm/core/block/cpu_gpu_block_allocator.py | 5 +++-- vllm/core/block_manager_v1.py | 5 +++-- vllm/utils.py | 24 ++-------------------- 4 files changed, 9 insertions(+), 27 deletions(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 1ab81898b5f7e..c7376a7c504f9 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -6,4 +6,4 @@ ray == 2.32.0 triton pandas tabulate -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@0e05e25 +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@0a7adab diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index dd99dd94e4ad0..422c1f4bd8f8b 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -4,7 +4,8 @@ DeviceAwareBlockAllocator) from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator -from vllm.utils import Device, is_hpu +from vllm.platforms import current_platform +from vllm.utils import Device class CpuGpuBlockAllocator(DeviceAwareBlockAllocator): @@ -53,7 +54,7 @@ def create( before CPU block IDs. 
""" # For HPU, block id 0 is used only for padding - reserved_blocks = 1 if is_hpu() else 0 + reserved_blocks = 1 if current_platform.is_hpu() else 0 block_ids = list( range(reserved_blocks, num_gpu_blocks + num_cpu_blocks)) num_gpu_blocks -= reserved_blocks diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 8e7335a4016e9..b1160e8d2f163 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -13,8 +13,9 @@ from vllm.core.evictor_v1 import EvictionPolicy, Evictor, make_evictor from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.sequence import Sequence, SequenceGroup, SequenceStatus -from vllm.utils import Device, is_hpu +from vllm.utils import Device logger = init_logger(__name__) @@ -185,7 +186,7 @@ def __init__( # Initialize the free blocks. self.free_blocks: List[PhysicalTokenBlock] = [] # For HPU, block id 0 is used only for padding - reserved_blocks = 1 if is_hpu() else 0 + reserved_blocks = 1 if current_platform.is_hpu() else 0 for i in range(reserved_blocks, num_blocks): block = PhysicalTokenBlock(device=device, block_number=i, diff --git a/vllm/utils.py b/vllm/utils.py index e5cef9b4419c0..ca36ad8cd9592 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -337,11 +337,6 @@ def is_neuron() -> bool: return transformers_neuronx is not None -@lru_cache(maxsize=None) -def is_hpu() -> bool: - return _is_habana_frameworks_installed() or _is_built_for_hpu() - - @lru_cache(maxsize=None) def is_fake_hpu() -> bool: return os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0' @@ -359,21 +354,6 @@ def hpu_backend_string(): return backend_string -@lru_cache(maxsize=None) -def _is_habana_frameworks_installed() -> bool: - from importlib import util - return util.find_spec('habana_frameworks') is not None - - -@lru_cache(maxsize=None) -def _is_built_for_hpu() -> bool: - from importlib.metadata import PackageNotFoundError, version - try: - return "gaudi" in version("vllm") - except PackageNotFoundError: - return False - - @lru_cache(maxsize=None) def is_xpu() -> bool: from importlib.metadata import PackageNotFoundError, version @@ -777,7 +757,7 @@ def print_warning_once(msg: str) -> None: def get_device() -> str: - if is_hpu(): + if current_platform.is_hpu(): return "hpu" return "cuda" @@ -797,7 +777,7 @@ def is_pin_memory_available() -> bool: elif is_neuron(): print_warning_once("Pin memory is not supported on Neuron.") return False - elif is_hpu(): + elif current_platform.is_hpu(): print_warning_once("Pin memory is not supported on HPU.") return False elif is_cpu() or is_openvino(): From 874f3d80ae1ca429258dd94e6c43699640e7ea48 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 12:52:48 +0300 Subject: [PATCH 248/341] remove get_device --- vllm/utils.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 50ce0911707e9..3b4ec8f41ad94 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -739,12 +739,6 @@ def print_warning_once(msg: str) -> None: logger.warning(msg) -def get_device() -> str: - if current_platform.is_hpu(): - return "hpu" - return "cuda" - - @lru_cache(maxsize=None) def is_pin_memory_available() -> bool: From e16918d41e5e82930872dbb263594f1aeb3b73a4 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 12:01:31 +0200 Subject: [PATCH 249/341] Remove logger from layernorm (#332) Upstream does not use logger in layernorm. Neither do we. No idea why it's there. 
--- vllm/model_executor/layers/layernorm.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 8f1c60db3ebd7..257e6b37c7890 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -4,11 +4,8 @@ import torch import torch.nn as nn -from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp -logger = init_logger(__name__) - class RMSNorm(CustomOp): """Root mean square normalization. From 347380f2c4fde4006de6a7f34415f726558164f3 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 14:49:36 +0300 Subject: [PATCH 250/341] Fix INC FP8 inference after rebase --- vllm/model_executor/model_loader/loader.py | 2 +- vllm/platforms/hpu.py | 9 +-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 98c9826a3e3fb..bb4ec190892d6 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -59,7 +59,7 @@ def device_loading_context(module: torch.nn.Module, # Store original device states and move parameters to GPU if they're on CPU for name, p in module.named_parameters(): - if p.device.type == "cpu": + if p.device.type == "cpu" and target_device.type != 'hpu': original_device_states[name] = p.device p.data = p.data.to(target_device) # Parameters already on target device are not touched diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index feddce69ac5b4..170cfff94f90d 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -1,18 +1,11 @@ -from typing import Optional - import torch -from .interface import DeviceCapability, Platform, PlatformEnum +from .interface import Platform, PlatformEnum class HpuPlatform(Platform): _enum = PlatformEnum.HPU - @staticmethod - def get_device_capability( - device_id: int = 0) -> Optional[DeviceCapability]: - raise RuntimeError("HPU does not have device capability.") - @staticmethod def inference_mode(): return torch.no_grad() From 73f4b48905f75ebbed035ab5e7abde2b64e701c3 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 14:22:16 +0200 Subject: [PATCH 251/341] Fix INC FP8 inference after rebase (#333) This PR fixes the "RuntimeError: HPU does not have device capability." error introduced after rebase & fixes loading weights on CPU for quantization. 
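For illustration, a minimal sketch of the weight-loading guard this fix relies on (the helper name and structure below are illustrative, not the actual vLLM `device_loading_context`): CPU-resident parameters are left in place when the target device is HPU, so quantization flows can still reach them.

```python
import torch

def move_cpu_params_sketch(module: torch.nn.Module,
                           target_device: torch.device) -> None:
    # Skip the CPU->device migration when loading for HPU; otherwise
    # move CPU-resident parameters to the target device as usual.
    for _, param in module.named_parameters():
        if param.device.type == "cpu" and target_device.type != "hpu":
            param.data = param.data.to(target_device)
```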
--- vllm/model_executor/model_loader/loader.py | 2 +- vllm/platforms/hpu.py | 9 +-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index b03e6aca48c0e..b3274b6d95115 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -59,7 +59,7 @@ def device_loading_context(module: torch.nn.Module, # Store original device states and move parameters to GPU if they're on CPU for name, p in module.named_parameters(): - if p.device.type == "cpu": + if p.device.type == "cpu" and target_device.type != 'hpu': original_device_states[name] = p.device p.data = p.data.to(target_device) # Parameters already on target device are not touched diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index feddce69ac5b4..170cfff94f90d 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -1,18 +1,11 @@ -from typing import Optional - import torch -from .interface import DeviceCapability, Platform, PlatformEnum +from .interface import Platform, PlatformEnum class HpuPlatform(Platform): _enum = PlatformEnum.HPU - @staticmethod - def get_device_capability( - device_id: int = 0) -> Optional[DeviceCapability]: - raise RuntimeError("HPU does not have device capability.") - @staticmethod def inference_mode(): return torch.no_grad() From b582d7790b79f6442401bdd7286b41f308b9a4c0 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 18:31:35 +0300 Subject: [PATCH 252/341] Make weights_load_device not change EngineArgs.create_load_config() --- vllm/config.py | 3 ++- vllm/engine/arg_utils.py | 13 +++++++------ vllm/model_executor/model_loader/loader.py | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index e5662e901e8d9..0a5098d997936 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -751,7 +751,8 @@ class LoadConfig: ignore_patterns: The list of patterns to ignore when loading the model. Default to "original/**/*" to avoid repeated loading of llama's checkpoints. - device: Device on which weights are loaded. 
+ device: Device to which model weights will be loaded, default to + device_config.device """ load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 5511f56c65519..14bee2dd128d5 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -269,8 +269,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument("--weights-load-device", type=str, default=EngineArgs.weights_load_device, - choices=["cuda", "neuron", "hpu", "cpu"], - help='Device on which weights are loaded.') + choices=DEVICE_OPTIONS, + help=('Device to which model weights ' + 'will be loaded.')) parser.add_argument( '--config-format', default=EngineArgs.config_format, @@ -848,11 +849,11 @@ def create_model_config(self) -> ModelConfig: mm_processor_kwargs=self.mm_processor_kwargs, ) - def create_load_config(self, load_device) -> LoadConfig: + def create_load_config(self) -> LoadConfig: return LoadConfig( load_format=self.load_format, download_dir=self.download_dir, - device=load_device, + device=self.load_device, model_loader_extra_config=self.model_loader_extra_config, ignore_patterns=self.ignore_patterns, ) @@ -1037,9 +1038,9 @@ def create_engine_config(self) -> EngineConfig: self.model_loader_extra_config[ "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path - load_device = device_config.device if self.weights_load_device is \ + self.load_device = device_config.device if self.weights_load_device is \ None else self.weights_load_device - load_config = self.create_load_config(load_device) + load_config = self.create_load_config() prompt_adapter_config = PromptAdapterConfig( max_prompt_adapters=self.max_prompt_adapters, diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index ba950bc5fdc25..d7f24f3977a48 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -399,7 +399,7 @@ def load_model(self, *, model_config: ModelConfig, model = _initialize_model(model_config, self.load_config, lora_config, cache_config, scheduler_config) - logger.info("Loading weights on %s ...", self.load_config.device) + logger.info("Loading weights on %s...", self.load_config.device) model.load_weights(self._get_all_weights(model_config, model)) for _, module in model.named_modules(): From b90adacf9dfac62c187d40fd1eae18e7887add72 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 18:49:00 +0300 Subject: [PATCH 253/341] More robust load device autodetection --- vllm/engine/arg_utils.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 14bee2dd128d5..93648bc7853e1 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -849,11 +849,13 @@ def create_model_config(self) -> ModelConfig: mm_processor_kwargs=self.mm_processor_kwargs, ) - def create_load_config(self) -> LoadConfig: + def create_load_config(self, load_device=None) -> LoadConfig: + if load_device is None: + load_device = DeviceConfig(device=self.device).device return LoadConfig( load_format=self.load_format, download_dir=self.download_dir, - device=self.load_device, + device=load_device, model_loader_extra_config=self.model_loader_extra_config, ignore_patterns=self.ignore_patterns, ) @@ -1038,9 +1040,9 @@ def create_engine_config(self) -> EngineConfig: self.model_loader_extra_config[ "qlora_adapter_name_or_path"] = 
self.qlora_adapter_name_or_path - self.load_device = device_config.device if self.weights_load_device is \ + load_device = device_config.device if self.weights_load_device is \ None else self.weights_load_device - load_config = self.create_load_config() + load_config = self.create_load_config(load_device) prompt_adapter_config = PromptAdapterConfig( max_prompt_adapters=self.max_prompt_adapters, From d853eebb046483393f7f7337254a946c910d8e9d Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 19:06:18 +0300 Subject: [PATCH 254/341] WA for none load device --- vllm/model_executor/model_loader/loader.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index d7f24f3977a48..ffb9308680837 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -395,7 +395,9 @@ def load_model(self, *, model_config: ModelConfig, cache_config: CacheConfig) -> nn.Module: target_device = torch.device(device_config.device) with set_default_torch_dtype(model_config.dtype): - with torch.device(self.load_config.device): + load_device = torch.device(self.load_config.device) if \ + self.load_config.device is not None else target_device + with load_device: model = _initialize_model(model_config, self.load_config, lora_config, cache_config, scheduler_config) From 9111a8059b699344313f21a4314562d9405ec991 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 18:07:01 +0200 Subject: [PATCH 255/341] Make weights_load_device not change EngineArgs.create_load_config() (#336) Some backends rely on calling EngineArgs.create_load_config() directly, for which we've altered the API. We don't need to alter it to enable weight load device functionality. This PR fixes it. --- vllm/config.py | 3 ++- vllm/engine/arg_utils.py | 9 ++++++--- vllm/model_executor/model_loader/loader.py | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index b8ec23e030ac9..011563038e6bb 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -751,7 +751,8 @@ class LoadConfig: ignore_patterns: The list of patterns to ignore when loading the model. Default to "original/**/*" to avoid repeated loading of llama's checkpoints. - device: Device on which weights are loaded. 
+ device: Device to which model weights will be loaded, default to + device_config.device """ load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ffe12d4cd5fb6..84529b267ce0b 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -268,8 +268,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument("--weights-load-device", type=str, default=EngineArgs.weights_load_device, - choices=["cuda", "neuron", "hpu", "cpu"], - help='Device on which weights are loaded.') + choices=DEVICE_OPTIONS, + help=('Device to which model weights ' + 'will be loaded.')) parser.add_argument( '--config-format', default=EngineArgs.config_format, @@ -843,7 +844,9 @@ def create_model_config(self) -> ModelConfig: mm_processor_kwargs=self.mm_processor_kwargs, ) - def create_load_config(self, load_device) -> LoadConfig: + def create_load_config(self, load_device=None) -> LoadConfig: + if load_device is None: + load_device = DeviceConfig(device=self.device).device return LoadConfig( load_format=self.load_format, download_dir=self.download_dir, diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index b3274b6d95115..fcff39f790564 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -363,7 +363,7 @@ def load_model(self, *, model_config: ModelConfig, model = _initialize_model(model_config, self.load_config, lora_config, cache_config, scheduler_config) - logger.info("Loading weights on %s ...", self.load_config.device) + logger.info("Loading weights on %s...", self.load_config.device) model.load_weights( self._get_weights_iterator(model_config.model, model_config.revision, From db8dbce31c460cf93e3c0a268fb5628a37fa56ec Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 19:09:48 +0300 Subject: [PATCH 256/341] device type --- vllm/engine/arg_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 93648bc7853e1..28f4c75bc5181 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -851,7 +851,8 @@ def create_model_config(self) -> ModelConfig: def create_load_config(self, load_device=None) -> LoadConfig: if load_device is None: - load_device = DeviceConfig(device=self.device).device + dummy_device_config = DeviceConfig(device=self.device) + load_device = dummy_device_config.device_type return LoadConfig( load_format=self.load_format, download_dir=self.download_dir, @@ -1040,7 +1041,7 @@ def create_engine_config(self) -> EngineConfig: self.model_loader_extra_config[ "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path - load_device = device_config.device if self.weights_load_device is \ + load_device = device_config.device_type if self.weights_load_device is \ None else self.weights_load_device load_config = self.create_load_config(load_device) From c337e9347a7183eee392cf884f4bca1032f75657 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 21:42:00 +0300 Subject: [PATCH 257/341] Revert "fix guided_decode HPU failing issue" This reverts commit 8046d81cf279828be7b4d9a0b2a242e592748302. 
--- .../guided_decoding/outlines_logits_processors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index 092c143bd59b0..c28bd71c9f682 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -80,7 +80,7 @@ def __call__(self, input_ids: List[int], -math.inf, device=scores.device) mask[allowed_tokens] = 0 - scores = scores.add(mask) + scores.add_(mask) return scores From e8e369f3b82956e724c74e7227032c0a90908743 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 21:47:35 +0300 Subject: [PATCH 258/341] load device fix --- vllm/engine/arg_utils.py | 4 ++-- vllm/model_executor/model_loader/loader.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 28f4c75bc5181..ae20fb9f27c21 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -852,7 +852,7 @@ def create_model_config(self) -> ModelConfig: def create_load_config(self, load_device=None) -> LoadConfig: if load_device is None: dummy_device_config = DeviceConfig(device=self.device) - load_device = dummy_device_config.device_type + load_device = dummy_device_config.device return LoadConfig( load_format=self.load_format, download_dir=self.download_dir, @@ -1041,7 +1041,7 @@ def create_engine_config(self) -> EngineConfig: self.model_loader_extra_config[ "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path - load_device = device_config.device_type if self.weights_load_device is \ + load_device = device_config.device if self.weights_load_device is \ None else self.weights_load_device load_config = self.create_load_config(load_device) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index ffb9308680837..72ad688d07905 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -395,13 +395,13 @@ def load_model(self, *, model_config: ModelConfig, cache_config: CacheConfig) -> nn.Module: target_device = torch.device(device_config.device) with set_default_torch_dtype(model_config.dtype): - load_device = torch.device(self.load_config.device) if \ + load_device : torch.device = self.load_config.device if \ self.load_config.device is not None else target_device with load_device: model = _initialize_model(model_config, self.load_config, lora_config, cache_config, scheduler_config) - logger.info("Loading weights on %s...", self.load_config.device) + logger.info("Loading weights on %s...", target_device) model.load_weights(self._get_all_weights(model_config, model)) for _, module in model.named_modules(): From 8c6dcae75c06144081c98cb07668be87d763eb47 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 25 Sep 2024 14:13:15 +0200 Subject: [PATCH 259/341] Refine INC shutdown code (#335) This PR removes debug printouts in INC shutdown method and covers the case where application exits before model is initialized properly. 
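A condensed sketch of the shutdown pattern adopted here (attribute names mirror the patch, but the class below is an illustration, not the real model runner): INC calibration is finalized only if the model was fully initialized, and the call is made idempotent.

```python
from contextlib import suppress

class ModelRunnerSketch:
    def __init__(self):
        self.model = None
        self.inc_initialized_successfully = False

    def shutdown_inc(self):
        can_finalize_inc = False
        with suppress(AttributeError):
            can_finalize_inc = (self.model is not None
                                and self.inc_initialized_successfully
                                and not getattr(self, "_is_inc_finalized",
                                                False))
        if can_finalize_inc:
            # vLLM calls neural_compressor's finalize_calibration() here;
            # the flag keeps repeated shutdown calls from re-running it.
            self._is_inc_finalized = True
```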
--- vllm/executor/habana_executor.py | 3 --- vllm/worker/habana_model_runner.py | 16 +++++++++++----- vllm/worker/habana_worker.py | 3 --- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index 44226fc898218..e4bd54f8849b3 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -195,9 +195,6 @@ def check_health(self) -> None: def shutdown(self) -> None: self.driver_worker.shutdown_inc() - def __del__(self): - self.shutdown() - class HabanaExecutorAsync(HabanaExecutor, ExecutorAsyncBase): diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index c99500ef1296b..6940e7637dbb7 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -550,6 +550,7 @@ def __init__( # Lazy initialization self.lora_manager: LRUCacheWorkerLoRAManager = None self.model: torch.nn.Module = None + self.inc_initialized_successfully = False # Profiler stats self.profiler_counter_helper = HabanaProfilerCounterHelper() @@ -632,6 +633,7 @@ def load_model(self) -> None: self.model = convert(self.model, config) htcore.hpu_initialize(self.model, mark_only_scales_as_const=True) + self.inc_initialized_successfully = True logger.info("Preparing model with INC took %s", m_inc.get_summary_string()) elif not is_fake_hpu(): @@ -1938,14 +1940,18 @@ def execute_model( return [output] def shutdown_inc(self): - print('inc shutdown') - if (model_config := getattr(self, "model_config", None)) and \ - getattr(model_config, "quantization", None) == 'inc': - print('inc shutdown start') + can_finalize_inc = False + from contextlib import suppress + with suppress(AttributeError): + can_finalize_inc = (self.model_config.quantization == 'inc') and \ + (self.model.model is not None) and \ + self.inc_initialized_successfully and \ + not getattr(self, "_is_inc_finalized", False) + if can_finalize_inc: from neural_compressor.torch.quantization import ( finalize_calibration) finalize_calibration(self.model.model) - print('inc shutdown') + self._is_inc_finalized = True def __del__(self): self.shutdown_inc() diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 8cdbba02fbb33..2e4dfeac42c3e 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -320,9 +320,6 @@ def list_prompt_adapters(self) -> Set[int]: def shutdown_inc(self): self.model_runner.shutdown_inc() - def __del__(self): - self.shutdown_inc() - @property def max_model_len(self) -> int: return self.model_config.max_model_len From cef2f54b1d369195b5485161f9af941caa11d734 Mon Sep 17 00:00:00 2001 From: Zehao Huang Date: Wed, 25 Sep 2024 20:28:48 +0800 Subject: [PATCH 260/341] Setting enough cache_size_limit for torch.compile warmup (#238) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the issue that warmup sometimes doesn't work because the default cache_size_limit is only 8 . 
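As a standalone illustration of the idea (the bucket counts below are placeholders, not values taken from this patch), the Dynamo cache limits can be raised ahead of warmup so that each bucket shape keeps its compiled graph:

```python
import torch
import torch._dynamo

# Hypothetical bucket counts; in vLLM these come from the generated
# prompt and decode bucket lists.
num_prompt_buckets = 24
num_decode_buckets = 48

cache_size_limit = num_prompt_buckets + num_decode_buckets + 1
torch._dynamo.config.cache_size_limit = max(
    cache_size_limit, torch._dynamo.config.cache_size_limit)
# Keep the original 8x ratio between the per-code and accumulated limits.
torch._dynamo.config.accumulated_cache_size_limit = max(
    cache_size_limit * 8,
    torch._dynamo.config.accumulated_cache_size_limit)
```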
--------- Signed-off-by: zehao-intel Co-authored-by: Andrzej Kotłowski --- vllm/worker/habana_model_runner.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 6940e7637dbb7..394bb5318d10e 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1553,6 +1553,17 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: len(self.decode_buckets), list(sorted(self.decode_buckets))) + if not htorch.utils.internal.is_lazy() and not self.enforce_eager: + cache_size_limit = len(self.prompt_buckets) + len( + self.decode_buckets) + 1 + torch._dynamo.config.cache_size_limit = max( + cache_size_limit, torch._dynamo.config.cache_size_limit) + # Multiply by 8 to follow the original default ratio between + # the cache_size_limit and accumulated_cache_size_limit + torch._dynamo.config.accumulated_cache_size_limit = max( + cache_size_limit * 8, + torch._dynamo.config.accumulated_cache_size_limit) + start_mem = HabanaMemoryProfiler.current_device_memory_usage() start_time = time.perf_counter() From 45ee5863019e955305c3ea545f9a0ccec3ff8bee Mon Sep 17 00:00:00 2001 From: Iryna Boiko Date: Wed, 25 Sep 2024 15:28:28 +0200 Subject: [PATCH 261/341] Change default values for decode bucket flags (#316) Change default values for decode bucket flags --- README_GAUDI.md | 12 ++++++------ docs/source/getting_started/gaudi-installation.rst | 12 ++++++------ vllm/worker/habana_model_runner.py | 11 +++++------ 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index 0ef30d5f96e64..04e2ff22f96e5 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -321,7 +321,7 @@ for graph capture (later referred to as \"usable graph memory\"), and the remaining 90% will be utilized for KV cache. Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default -(`VLLM_GRAPH_PROMPT_RATIO=0.5`), both stages have equal memory +(`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages have equal memory constraints. Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable @@ -388,7 +388,7 @@ INFO 08-02 17:37:54 habana_worker.py:190] Initializing cache engine took 23.73 G INFO 08-02 17:37:54 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB ... INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB -INFO 08-02 17:38:22 habana_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.5) +INFO 08-02 17:38:22 habana_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 4.755 GiB for prompt and 11.095 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB ... 
INFO 08-02 17:38:26 habana_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB @@ -448,7 +448,7 @@ Environment variables - `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for HPUGraph capture, `0.1` by default - `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory - dedicated for prompt graphs, `0.5` by default + dedicated for prompt graphs, `0.3` by default - `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt graph capture, `min_tokens` or `max_bs`, `min_tokens` by default - `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode @@ -472,15 +472,15 @@ Environment variables `max_model_len` - Decode: - - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `min(max_num_seqs, 32)` + - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` - block size min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): - `128` + `block_size` - block size step - (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `128` + (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` - block size max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)` diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index 8c4905e2a488a..db1d8666e4800 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -245,7 +245,7 @@ Only after that, ``gpu_memory_utilization`` flag is utilized - at its default va Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. Environment variable ``VLLM_GRAPH_RESERVED_MEM`` defines the ratio of memory reserved for HPU Graphs capture. With its default value (``VLLM_GRAPH_RESERVED_MEM=0.1``), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache. -Environment variable ``VLLM_GRAPH_PROMPT_RATIO`` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (``VLLM_GRAPH_PROMPT_RATIO=0.5``), both stages have equal memory constraints. +Environment variable ``VLLM_GRAPH_PROMPT_RATIO`` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (``VLLM_GRAPH_PROMPT_RATIO=0.3``), both stages have equal memory constraints. Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. ``VLLM_GRAPH_PROMPT_RATIO=0.2`` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. .. note:: @@ -280,7 +280,7 @@ Each described step is logged by vLLM server, as follows (negative values corres INFO 08-02 17:37:54 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB ... 
INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB - INFO 08-02 17:38:22 habana_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.5) + INFO 08-02 17:38:22 habana_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB ... INFO 08-02 17:38:26 habana_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB @@ -324,7 +324,7 @@ Environment variables - ``VLLM_SKIP_WARMUP``: if ``true``, warmup will be skipped, ``false`` by default - ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.1`` by default -- ``VLLM_GRAPH_PROMPT_RATIO``: percentage of reserved graph memory dedicated for prompt graphs, ``0.5`` by default +- ``VLLM_GRAPH_PROMPT_RATIO``: percentage of reserved graph memory dedicated for prompt graphs, ``0.3`` by default - ``VLLM_GRAPH_PROMPT_STRATEGY``: strategy determining order of prompt graph capture, ``min_tokens`` or ``max_bs``, ``min_tokens`` by default - ``VLLM_GRAPH_DECODE_STRATEGY``: strategy determining order of decode graph capture, ``min_tokens`` or ``max_bs``, ``max_bs`` by default - ``VLLM_{phase}_{dim}_BUCKET_{param}`` - collection of 12 environment variables configuring ranges of bucketing mechanism @@ -343,11 +343,11 @@ Environment variables - sequence length max (``VLLM_PROMPT_SEQ_BUCKET_MAX``): ``max_model_len`` - Decode: - - batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``min(max_num_seqs, 32)`` + - batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``1`` - batch size step (``VLLM_DECODE_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)`` - batch size max (``VLLM_DECODE_BS_BUCKET_MAX``): ``max_num_seqs`` - - sequence length min (``VLLM_DECODE_BLOCK_BUCKET_MIN``): ``128`` - - sequence length step (``VLLM_DECODE_BLOCK_BUCKET_STEP``): ``128`` + - sequence length min (``VLLM_DECODE_BLOCK_BUCKET_MIN``): ``block_size`` + - sequence length step (``VLLM_DECODE_BLOCK_BUCKET_STEP``): ``block_size`` - sequence length max (``VLLM_DECODE_BLOCK_BUCKET_MAX``): ``max(128, (max_num_seqs*max_model_len)/block_size)`` diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 394bb5318d10e..e80df4e7c8c16 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -670,7 +670,6 @@ def _setup_buckets(self) -> None: if self.lora_config and \ max_bucket_cfg > self.max_num_batched_tokens // self.block_size: max_bucket_cfg = self.max_num_batched_tokens // self.block_size - blocks_step = 128 #FIXME: The default values should be max_model_len max_prompt_seq = 1024 max_decode_seq = 2048 @@ -682,7 +681,7 @@ def _setup_buckets(self) -> None: max=align_bs(max_bucket_cfg)) self.decode_bs_bucket_cfg = read_bucket_settings('decode', 'bs', - min=align_bs(32), + min=1, step=align_bs(32), max=self.max_num_seqs) self.prompt_seq_bucket_cfg = read_bucket_settings('prompt', @@ -693,9 +692,9 @@ def _setup_buckets(self) -> None: self.decode_block_bucket_cfg = read_bucket_settings( 'decode', 'block', - min=blocks_step, - step=blocks_step, - max=max(blocks_step, + min=self.block_size, + step=self.block_size, + max=max(self.block_size, self.max_num_seqs * max_decode_seq // self.block_size)) 
self.graphed_buckets: Set[Any] = set() @@ -1594,7 +1593,7 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: graph_free_mem = align_workers(graph_free_mem, torch.distributed.ReduceOp.MIN) prompt_graph_mem_ratio = float( - os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.5')) + os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.3')) prompt_available_memory = (prompt_graph_mem_ratio * graph_free_mem) decode_available_memory = (graph_free_mem - From 29fb5edd1df36aa4fa0ff95c7b2cbb711b8cb035 Mon Sep 17 00:00:00 2001 From: Yan Tomsinsky <73292515+Yantom1@users.noreply.github.com> Date: Wed, 25 Sep 2024 19:19:40 +0300 Subject: [PATCH 262/341] Support loading checkpoints quantized using Autofp8 (#286) Support loading https://huggingface.co/collections/neuralmagic/fp8-llms-for-vllm-666742ed2b78b7ac8df13127 Skip cuda checks Use scaled_fp8_quant instead of _scaled_mm Fix weights and weight_scale for guudi2 flot8_e4m3fn range. --------- Co-authored-by: Nir David Co-authored-by: Konrad Zawora --- requirements-hpu.txt | 3 +- .../layers/fused_moe/fused_moe.py | 4 ++ .../compressed_tensors/compressed_tensors.py | 9 +++-- .../schemes/compressed_tensors_w8a8_fp8.py | 4 +- .../model_executor/layers/quantization/fp8.py | 24 +++++++---- .../layers/quantization/utils/w8a8_utils.py | 40 +++++++++++++++---- vllm/worker/habana_model_runner.py | 3 +- 7 files changed, 64 insertions(+), 23 deletions(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index c7376a7c504f9..1af5460128fbb 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -6,4 +6,5 @@ ray == 2.32.0 triton pandas tabulate -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@0a7adab + +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@0a7adab \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 3e01112eaa14d..cf17f1e240e47 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -13,6 +13,10 @@ from vllm.logger import init_logger from vllm.platforms import current_platform +if current_platform.is_hpu(): + from vllm_hpu_extension.ops import scaled_fp8_quant + ops.scaled_fp8_quant = scaled_fp8_quant + logger = init_logger(__name__) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index e536fae45c845..252ad864ced3e 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -243,8 +243,10 @@ def _get_scheme_from_parts( # TODO @dsikka: clean-up conditions if is_activation_quantization_format(self.quant_format): if self._is_fp8_w8a8(weight_quant, input_quant): - is_fp8_w8a8_supported = self._check_scheme_supported( - CompressedTensorsW8A8Fp8.get_min_capability(), error=False) + is_fp8_w8a8_supported = current_platform.is_hpu() or \ + self._check_scheme_supported( + CompressedTensorsW8A8Fp8.get_min_capability(), + error=False) if is_fp8_w8a8_supported: return CompressedTensorsW8A8Fp8( strategy=weight_quant.strategy, @@ -314,7 +316,8 @@ def get_scheme( # Raise error if device does not support the scheme # (e.g. 
fp8 needs ada lovelace) - self._check_scheme_supported(scheme.get_min_capability()) + if not current_platform.is_hpu(): + self._check_scheme_supported(scheme.get_min_capability()) return scheme diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 5931ec36c97d5..29f3228c0dc5d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -13,6 +13,7 @@ from vllm.model_executor.parameter import (ChannelQuantScaleParameter, ModelWeightParameter, PerTensorScaleParameter) +from vllm.platforms import current_platform from vllm.utils import is_hip __all__ = ["CompressedTensorsW8A8Fp8"] @@ -23,7 +24,8 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): def __init__(self, strategy: str, is_static_input_scheme: bool): self.strategy = strategy self.is_static_input_scheme = is_static_input_scheme - self.cutlass_fp8_supported = cutlass_fp8_supported() + self.cutlass_fp8_supported = not current_platform.is_hpu() and \ + cutlass_fp8_supported() @classmethod def get_min_capability(cls) -> int: diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index b5feb55db0e74..88915942220ca 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -28,6 +28,10 @@ from vllm.platforms import current_platform from vllm.utils import is_hip, print_warning_once +if current_platform.is_hpu(): + from vllm_hpu_extension.ops import scaled_fp8_quant + ops.scaled_fp8_quant = scaled_fp8_quant + ACTIVATION_SCHEMES = ["static", "dynamic"] logger = init_logger(__name__) @@ -116,14 +120,18 @@ class Fp8LinearMethod(LinearMethodBase): def __init__(self, quant_config: Fp8Config): self.quant_config = quant_config - self.cutlass_fp8_supported = cutlass_fp8_supported() - - # For GPUs that lack FP8 hardware support, we can leverage the Marlin - # kernel for fast weight-only FP8 quantization - self.use_marlin = (not current_platform.has_device_capability(89) - or envs.VLLM_TEST_FORCE_FP8_MARLIN) - # Disable marlin for rocm - if is_hip(): + if current_platform.is_cuda_alike(): + self.cutlass_fp8_supported = cutlass_fp8_supported() + + # For GPUs that lack FP8 hardware support, we can leverage the + # Marlin kernel for fast weight-only FP8 quantization + self.use_marlin = (not current_platform.has_device_capability(89) + or envs.VLLM_TEST_FORCE_FP8_MARLIN) + # Disable marlin for rocm + if is_hip(): + self.use_marlin = False + else: + self.cutlass_fp8_supported = False self.use_marlin = False def create_weights( diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index fb263d121fe55..048962721e26b 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -10,6 +10,11 @@ # from pytorch 2.5. 
Allocating a dummy tensor to pass as input_scale TORCH_DEVICE_IDENTITY = torch.ones(1).cuda() if is_hip() else None +if current_platform.is_hpu(): + import habana_frameworks.torch.utils.experimental as htexp + from vllm_hpu_extension.ops import scaled_fp8_quant + ops.scaled_fp8_quant = scaled_fp8_quant + def cutlass_fp8_supported() -> bool: # cutlass is not supported on Rocm @@ -25,7 +30,15 @@ def cutlass_fp8_supported() -> bool: def per_tensor_dequantize( tensor: torch.Tensor, inv_scale: Union[float, torch.Tensor]) -> torch.Tensor: - fake_qweight = tensor.to(torch.float16) + dtype = torch.float16 + device = tensor.device + if current_platform.is_hpu(): + dtype = torch.bfloat16 + if htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi2: + #dequant on cpu to avoid nan on gaudi2 + tensor = tensor.to('cpu') + + fake_qweight = tensor.to(dtype).to(device) dq_weight = fake_qweight * inv_scale return dq_weight @@ -58,7 +71,10 @@ def requantize_with_max_scale( logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]: # Max scale to be used for requanitzation. max_w_scale = weight_scale.max() - + if current_platform.is_hpu() and htexp._get_device_type( + ) == htexp.synDeviceType.synDeviceGaudi2: + max_w_scale = max_w_scale * (torch.finfo(torch.float8_e4m3fn).max / + torch.finfo(torch.float8_e4m3fnuz).max) # QKV / MLP is fused in the on disk checkpoint if any of the # weight scales are still set to the default since we initialize # N weight scales for N shards but we only load 1 weight scale @@ -129,12 +145,20 @@ def apply_fp8_linear( if per_tensor_weights and per_tensor_activations: # Fused GEMM_DQ - output = torch._scaled_mm(qinput, - weight, - out_dtype=input.dtype, - scale_a=x_scale, - scale_b=weight_scale, - bias=bias) + if current_platform.is_hpu(): + #hpu does not support torch._scaled_mm (SW-197036) + output = torch.ops.hpu.fp8_gemm_v2(qinput, False, weight, + False, None, input.dtype, + x_scale, weight_scale, None, + False) + else: + output = torch._scaled_mm(qinput, + weight, + out_dtype=input.dtype, + scale_a=x_scale, + scale_b=weight_scale, + bias=bias) + # A fix for discrepancy in scaled_mm which returns tuple # for torch < 2.5 and a single value in torch >= 2.5 if type(output) is tuple and len(output) == 2: diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index e80df4e7c8c16..c43acdf04923b 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -587,8 +587,7 @@ def _set_gc_threshold(self) -> None: def load_model(self) -> None: import habana_frameworks.torch.core as htcore - if self.model_config.quantization == 'inc': - htcore.hpu_set_env() + htcore.hpu_set_env() with HabanaMemoryProfiler() as m: with HabanaMemoryProfiler() as m_getmodel: self.model = get_model(model_config=self.model_config, From 4c8a6c6092532d8df3f45831d2bfa2715a06507f Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Thu, 26 Sep 2024 20:26:16 +0800 Subject: [PATCH 263/341] Fix torch.compile issue of dispatch key set mismatch (#299) ### Issue: torch.compile recompiles after warmup because `tensor 'L['input_ids']' dispatch key set mismatch. expected DispatchKeySet(HPU, BackendSelect), actual DispatchKeySet(HPU, BackendSelect, ADInplaceOrView). 
` ### Detail: Run script with `TORCH_LOGS="guards"` and get different dispatch key set info: - warmup: ``` TENSOR_MATCH: check_tensor(L['input_ids'], Tensor, DispatchKeySet(HPU, BackendSelect), torch.int64, device=0, requires_grad=False, size=[2, 1], stride=[1, 1]) # masked_input = input_ # ome/zyuwen/workspace/vllm/habana_main_g3_v2/vllm/model_executor/layers/vocab_parallel_embedding.py:358 in forward ``` - after warmup: ``` TENSOR_MATCH: check_tensor(L['input_ids'], Tensor, DispatchKeySet(HPU, BackendSelect, ADInplaceOrView), torch.int64, device=0, requires_grad=False, size=[2, 1], stride=[1, 1]) # masked_input = input_ # ome/zyuwen/workspace/vllm/habana_main_g3_v2/vllm/model_executor/layers/vocab_parallel_embedding.py:358 in forward ``` ### Solution: The difference in dispatch key set is caused by the 'torch.inference_mode()' decoration, and here is a simple example: ```python import torch import habana_frameworks.torch as htorch @torch.inference_mode() def func(): x = torch.rand(3, 3).to("hpu") print(torch._C._dispatch_key_set(x)) func() # output: DispatchKeySet(HPU, AutocastHPU) ``` ```python import torch import habana_frameworks.torch as htorch def func(): x = torch.rand(3, 3).to("hpu") print(torch._C._dispatch_key_set(x)) func() # output: DispatchKeySet(HPU, ADInplaceOrView, AutogradHPU, AutocastHPU) ``` In vllm-fork, the warmup phase is decorated with `torch.inference_mode()` in [habana_model_runner.py#L1487-L1488](https://github.com/HabanaAI/vllm-fork/blob/b62fba85ac03326e9f466d8d37e91ae1b14a6511/vllm/worker/habana_model_runner.py#L1487-L1488), but the after-warmup phase is not. So in this PR I add the decorator to `prepare_input_tensors` function to keep the dispatch key set the same. ---
Signed-off-by: yuwenzho --- vllm/worker/habana_model_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index c43acdf04923b..f3bda39ec4822 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1790,6 +1790,7 @@ def make_model_input_from_broadcasted_tensor_dict( attn_backend=self.attn_backend, )) + @torch.inference_mode() def prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], From 1c6bada23884043cdd2a5715bce405bf2bb000f0 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 26 Sep 2024 14:53:29 +0200 Subject: [PATCH 264/341] Chunk prefill cache writes, remove div_i32 from insert_or_update_cache (#289) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-implements following PRs for current habana_main: https://github.com/HabanaAI/vllm-fork/pull/102 (Removing div_i32 operations from each layer) https://github.com/HabanaAI/vllm-fork/pull/115 (removing scatter for reshape&cache in case of prompt) Accuracy (GSM8K on Llama3.1-8B-Instruct): | Tasks |Version| Filter |n-shot| Metric | |Value | |Stderr| |---------------|------:|----------------|-----:|-----------|---|-----:|---|-----:| |gsm8k_cot_llama| 3|flexible-extract| 8|exact_match|↑ |0.8415|± |0.0101| | | |strict-match | 8|exact_match|↑ |0.8400|± |0.0101| I've benchmarked this change on Llama3.1-8B-Instruct and on average, +2.50% throughput gain (+558.14 tok/s, ~21594 tok/s -> ~22152 tok/s) can be observed across all prefill buckets on G2, with up to +4.40% (+956.79 tok/s, ~25031 -> ~25988 tok/s) throughput increase in compute-bound scenarios. --- requirements-hpu.txt | 3 +-- vllm/attention/backends/habana_attn.py | 17 +++++++++-------- vllm/attention/ops/habana_paged_attn.py | 2 ++ vllm/worker/habana_model_runner.py | 22 +++++++++++++++++++++- 4 files changed, 33 insertions(+), 11 deletions(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 1af5460128fbb..33619dc4883d5 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -6,5 +6,4 @@ ray == 2.32.0 triton pandas tabulate - -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@0a7adab \ No newline at end of file +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@940fdb7 diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 59a99b89c293f..dad33fefc51f3 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -8,7 +8,6 @@ import torch import vllm_hpu_extension.ops as ops -from vllm_hpu_extension import cache_ops from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, @@ -166,6 +165,11 @@ def forward( query = query.view(-1, self.num_heads, self.head_size) key = key.view(-1, self.num_kv_heads, self.head_size) value = value.view(-1, self.num_kv_heads, self.head_size) + block_indices = attn_metadata.block_indices + block_offsets = attn_metadata.block_offsets + if attn_metadata.is_prompt: + key = key.unflatten(0, (block_indices.size(0), -1)) + value = value.unflatten(0, (block_indices.size(0), -1)) if kv_cache is not None: key_cache, value_cache = HabanaPagedAttention.split_kv_cache( kv_cache, self.num_kv_heads, self.head_size) @@ -173,13 +177,10 @@ def forward( # Reshape the input keys and values and store them in the cache. 
# If kv_cache is not provided, the new key and value tensors are # not cached. This happens during the initial memory profiling run. - num_kv_cache_passes, num_slots_available, indices, offsets = \ - cache_ops.prepare_to_cache(key_cache, - attn_metadata.slot_mapping) - key_cache = self.k_cache(key, key_cache, num_kv_cache_passes, - num_slots_available, indices, offsets) - value_cache = self.v_cache(value, value_cache, num_kv_cache_passes, - num_slots_available, indices, offsets) + key_cache = self.k_cache(key, key_cache, block_indices, + block_offsets) + value_cache = self.v_cache(value, value_cache, block_indices, + block_offsets) if attn_metadata.is_prompt: # Prompt run. diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py index 49a3e3f774d58..7f080e0727457 100644 --- a/vllm/attention/ops/habana_paged_attn.py +++ b/vllm/attention/ops/habana_paged_attn.py @@ -18,6 +18,8 @@ class HabanaPagedAttentionMetadata: block_list: Optional[torch.Tensor] block_mapping: Optional[torch.Tensor] block_usage: Optional[torch.Tensor] + block_indices: Optional[torch.Tensor] + block_offsets: Optional[torch.Tensor] class HabanaPagedAttention: diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index f3bda39ec4822..d3d2973688843 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -245,6 +245,17 @@ def pad_list(list, k, v): return list + [v] * padding +def precompute_indices_and_offsets(block_size, slot_mapping, is_prompt): + slot_mapping = slot_mapping.flatten() + indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + if is_prompt: + indices = indices.unflatten(0, (-1, block_size))[:, 0] + offsets = None + else: + offsets = torch.fmod(slot_mapping, block_size) + return indices, offsets + + class HpuModelAdapter(): def __init__(self, model, block_size, dtype, enforce_eager): @@ -890,11 +901,15 @@ def _prepare_prompt( dtype=torch.long, device=self.device) + block_indices, block_offsets = precompute_indices_and_offsets( + self.block_size, slot_mapping, True) attn_metadata = self.attn_backend.make_metadata( is_prompt=True, block_list=None, block_mapping=None, block_usage=None, + block_indices=block_indices, + block_offsets=block_offsets, attn_bias=None, seq_lens_tensor=seq_lens_tensor, num_prefills=real_num_seqs, @@ -1044,11 +1059,15 @@ def _prepare_decode( dtype=torch.long, device=self.device) + block_indices, block_offsets = precompute_indices_and_offsets( + self.block_size, slot_mapping, False) attn_metadata = self.attn_backend.make_metadata( is_prompt=False, block_list=block_list, block_mapping=block_mapping, block_usage=block_usage, + block_indices=block_indices, + block_offsets=block_offsets, attn_bias=None, seq_lens_tensor=None, num_prefills=0, @@ -1266,7 +1285,8 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: # input_hash("abc") != input_hash("cba") attention_metadata = subtuple(metadata, 'TrimmedAttentionMetadata', [ 'attn_bias', 'seq_lens_tensor', 'block_list', 'block_mapping', - 'block_usage', 'slot_mapping', 'is_prompt' + 'block_usage', 'slot_mapping', 'is_prompt', 'block_indices', + 'block_offsets' ]) return attention_metadata From 5ffcfa3e377c83331f0f062ef90a2ab2f6b40da4 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 26 Sep 2024 17:39:49 +0200 Subject: [PATCH 265/341] Update cpu-test.yml --- .github/workflows/cpu-test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cpu-test.yml b/.github/workflows/cpu-test.yml index 
89a702f9751d9..60af77749bb1f 100644 --- a/.github/workflows/cpu-test.yml +++ b/.github/workflows/cpu-test.yml @@ -27,6 +27,7 @@ jobs: run: | python -m pip install --upgrade pip pip install torch --extra-index-url https://download.pytorch.org/whl/cpu + pip install -r requirements-build.txt pip install -r requirements-hpu.txt VLLM_TARGET_DEVICE=hpu python setup.py develop - name: cpu-test From c3577af3b52bd93b69dcc224f77179133bcdfc49 Mon Sep 17 00:00:00 2001 From: Vivek Goel Date: Fri, 27 Sep 2024 12:28:36 +0530 Subject: [PATCH 266/341] Fix runtime errors reported when using long input sequence lengths with LoRA (#339) This PR has following fixes, - Increase size of indices tensors used to maintain multi-lora state information from max_num_batched_tokens to 3*max_num_batched_tokens. This increase is done to provide buffer for padding done in batch & sequence dimensions. - Move logic to remove padding from lora_logits from execute_model() back to Class LogitsProcessorWithLoRA, this is done to fix race condition caused by updating multi-lora state information directly. FIX https://github.com/HabanaAI/vllm-fork/issues/237 --- vllm/lora/layers.py | 2 ++ vllm/lora/models.py | 2 +- vllm/worker/habana_model_runner.py | 20 ++++++-------------- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index b3758ad883d56..06160367054e4 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1203,6 +1203,8 @@ def _get_logits( ).index_select(0, indices_padded).nan_to_num_(nan=float("-inf"), posinf=float("inf"), neginf=float("-inf"))) + if current_platform.is_hpu(): + lora_logits = lora_logits[:logits.shape[0], :] logits[:, self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + lora_logits.shape[1], ] = lora_logits diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 546a4c402aedc..582170a2df627 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -432,7 +432,7 @@ def __init__( self.long_lora_context: Optional[LongContextLoRAContext] = None if current_platform.is_hpu(): self.punica_wrapper = GaudiPunicaWrapper( - max_num_batched_tokens, + 3 * max_num_batched_tokens, max_batches=self.max_num_seqs, device="hpu") else: diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index d3d2973688843..bfbe4085ddd3f 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1203,9 +1203,9 @@ def prepare_input_tensors( if self.lora_config: lora_mapping = LoRAMapping( - lora_index_mapping, - lora_prompt_mapping, - ) + **dict(index_mapping=lora_index_mapping, + prompt_mapping=lora_prompt_mapping, + is_prefill=(num_prefills > 0))) else: lora_mapping = None @@ -1370,9 +1370,9 @@ def warmup_scenario(self, times = 3 if use_graphs or is_pt_profiler_run else 1 if self.lora_config and not is_lora_profile_run: lora_mapping = LoRAMapping( - [0] * batch_size * seq_len, - [0] * batch_size * seq_len, - ) + **dict(index_mapping=[0] * batch_size * seq_len, + prompt_mapping=[0] * batch_size * seq_len, + is_prefill=is_prompt)) self.set_active_loras(set(), lora_mapping) if is_prompt: seqs = [ @@ -1915,14 +1915,6 @@ def execute_model( ) if self.lora_config: - from vllm.lora.layers import VocabParallelEmbeddingWithLoRA - modules = unwrap_model(self.model.model) - for module in modules: - if isinstance(module, VocabParallelEmbeddingWithLoRA): - for i in range(0, len(module.punica_wrapper.indices_len)): - module.punica_wrapper.indices_len[ - i] = sampling_metadata.selected_token_indices.numel( - ) 
lora_logits_mask: torch.Tensor = model_input.lora_logits_mask LoraMask.setLoraMask( lora_logits_mask.index_select( From ed85058387bdab264de44bee40f1f75ea847db72 Mon Sep 17 00:00:00 2001 From: Yu-Zhou Date: Fri, 27 Sep 2024 21:47:23 +0800 Subject: [PATCH 267/341] Enable Async output process for HPU (#342) This PR refers to [#7049](https://github.com/vllm-project/vllm/pull/7049) to implement the Asynchronous Output Processor on HPU. It is enabled by default; to disable it, pass the `--disable_async_output_proc` flag. In my local test on the latest habana_main branch (commit 29fb5edd1df36aa4fa0ff95c7b2cbb711b8cb035), throughput improves from 3847 TPS to 4011 TPS.
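As a usage note (illustrative only, not part of this patch): the sketch below shows how the feature can be toggled from the offline `LLM` entrypoint, assuming `disable_async_output_proc` is also forwarded as an engine argument mirroring the server flag named above.

```python
# Hedged sketch: toggling the async output processor from the offline API.
# Assumes `disable_async_output_proc` is accepted as an engine argument,
# mirroring the `--disable_async_output_proc` server flag referenced above.
from vllm import LLM, SamplingParams

# Default after this change: async output processing stays enabled on HPU.
llm = LLM(model="meta-llama/Llama-2-7b-hf")

# To opt out explicitly, e.g. while debugging output processing:
# llm = LLM(model="meta-llama/Llama-2-7b-hf", disable_async_output_proc=True)

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=16))
print(outputs[0].outputs[0].text)
```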
--- vllm/config.py | 5 +++-- vllm/worker/habana_model_runner.py | 4 ++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index eef1c2bfb9df9..e732c84c54520 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -372,9 +372,10 @@ def verify_async_output_proc(self, parallel_config, speculative_config, self.use_async_output_proc = False return - if device_config.device_type not in ("cuda", "tpu"): + if device_config.device_type not in ("cuda", "tpu", "hpu"): logger.warning( - "Async output processing is only supported for CUDA or TPU. " + "Async output processing is only supported for CUDA, TPU " + "and HPU. " "Disabling it for other platforms.") self.use_async_output_proc = False return diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index bfbe4085ddd3f..f3f679dbd1878 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -428,6 +428,7 @@ class ModelInputForHPU(ModelRunnerInputBase): virtual_engine: int = 0 lora_mask: Optional[torch.Tensor] = None lora_logits_mask: Optional[torch.Tensor] = None + async_callback: Optional[Callable] = None def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { @@ -1934,6 +1935,9 @@ def execute_model( if not self.is_driver_worker: return [] + if model_input.async_callback is not None: + model_input.async_callback() + # Sample the next token. with self.profiler.record_event( 'internal', ('sample_' From b611e209eff27383f3b25ec15f667c23008c837d Mon Sep 17 00:00:00 2001 From: Iryna Boiko Date: Mon, 30 Sep 2024 07:10:32 +0200 Subject: [PATCH 268/341] Port last_bucket change from v1.18.0 (#347) Port last_bucket change from v1.18.0 --- vllm/worker/habana_model_runner.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index f3f679dbd1878..79133aaf8f0f2 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -178,8 +178,7 @@ def generate_decode_buckets(bs_bucket_config, blocks_bucket_config, bs_buckets = warmup_range(bs_bucket_config) block_buckets = warmup_range(blocks_bucket_config) bmin, bstep, bmax = blocks_bucket_config - last_bucket = max_blocks if (max_blocks // bstep - == 0) else (max_blocks // bstep + 1) * bstep + last_bucket = round_up(max_blocks, bstep) for bs in bs_buckets: for blocks in block_buckets: if blocks < bs: From 3010f8cbd93d6696560f55d18ef6b074ad2535f4 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 30 Sep 2024 11:41:21 +0200 Subject: [PATCH 269/341] Add setuptools_scm to requirements-hpu.txt (#349) This removes the crash during installation for dependency that's inside requirements-build.txt --- requirements-hpu.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 33619dc4883d5..62ff11eba81e2 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -6,4 +6,6 @@ ray == 2.32.0 triton pandas tabulate +setuptools>=61 +setuptools-scm>=8 vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@940fdb7 From 44d8173bb1d94bd20dbfbb75e52483c2296ed28e Mon Sep 17 00:00:00 2001 From: Ruheena Suhani Shaik Date: Thu, 19 Sep 2024 08:57:53 +0300 Subject: [PATCH 270/341] test_lora_manager fix --- tests/lora/test_lora_manager_hpu.py | 553 ++++++++++++++++++++++++++++ 1 file changed, 553 insertions(+) create mode 100644 tests/lora/test_lora_manager_hpu.py diff --git a/tests/lora/test_lora_manager_hpu.py 
b/tests/lora/test_lora_manager_hpu.py new file mode 100644 index 0000000000000..ef3bf5272d709 --- /dev/null +++ b/tests/lora/test_lora_manager_hpu.py @@ -0,0 +1,553 @@ +import os +from typing import Dict, List + +import pytest +import torch +from safetensors.torch import load_file +from torch import nn + +from vllm.config import LoRAConfig +from vllm.lora.layers import (ColumnParallelLinearWithLoRA, + MergedColumnParallelLinearWithLoRA, + RowParallelLinearWithLoRA) +from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights +from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager, + LRUCacheLoRAModelManager) +from vllm.lora.request import LoRARequest +from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager, + WorkerLoRAManager) +from vllm.model_executor.layers.linear import RowParallelLinear + +EMBEDDING_MODULES = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", +} + +EMBEDDING_PADDING_MODULES = ["lm_head"] + + +def test_from_lora_tensors(sql_lora_files): + tensors = load_file( + os.path.join(sql_lora_files, "adapter_model.safetensors")) + new_embeddings = load_file( + os.path.join(sql_lora_files, "new_embeddings.safetensors")) + lora_model = LoRAModel.from_lora_tensors( + 1, + 8, + 16, + tensors, + "hpu", + embeddings=new_embeddings, + embedding_modules=EMBEDDING_MODULES, + embedding_padding_modules=EMBEDDING_PADDING_MODULES) + for module_name, lora in lora_model.loras.items(): + assert lora.module_name == module_name + assert lora.rank == 8 + assert lora.lora_alpha == 16 + assert lora.lora_a is not None + assert lora.lora_b is not None + assert (lora.lora_a.shape[1] == lora.lora_b.shape[0] + ), f"{lora.lora_a.shape=}, {lora.lora_b.shape=}" + assert lora.lora_a.shape[1] == 8 + embeddings_module = next( + (k for k in EMBEDDING_MODULES if k in module_name), None) + if embeddings_module: + assert torch.equal( + lora.embeddings_tensor, + new_embeddings[EMBEDDING_MODULES[embeddings_module]].to( + device=lora.embeddings_tensor.device)) + else: + assert lora.embeddings_tensor is None + + +def create_lora(lora_id: int, model: nn.Module, + sub_modules: List[str]) -> LoRAModel: + loras: Dict[str, LoRALayerWeights] = {} + for name in sub_modules: + w = model.get_submodule(name).weight + loras[name] = LoRALayerWeights( + name, + 8, + 16, + torch.rand([w.shape[1], 8], device="hpu"), + torch.rand([8, w.shape[0]], device="hpu"), + ) + return LoRAModel(lora_id, 8, loras) + + +def create_packed_lora( + lora_id: int, + model: nn.Module, + module_name, + replaced_module_names, + empty_replaced_module_name=None, +) -> LoRAModel: + w = model.get_submodule(module_name).weight + loras: Dict[str, LoRALayerWeights] = {} + for replaced_module_name in replaced_module_names: + if replaced_module_name == empty_replaced_module_name: + continue + loras[replaced_module_name] = LoRALayerWeights( + replaced_module_name, + 8, + 16, + torch.rand([w.shape[1], 8], device="hpu"), + torch.rand([8, w.shape[0] // len(replaced_module_names)], + device="hpu"), + ) + return LoRAModel(lora_id, 8, loras) + + +def test_replace_submodules(dist_init, dummy_model): + model = dummy_model + model.supported_lora_modules = ["dense1", "layer1.dense2"] + model.packed_modules_mapping = {} + manager = LoRAModelManager( + model, 1, 1, 1, + LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8)) + model = manager.model + + assert isinstance(model.get_submodule("dense1"), + ColumnParallelLinearWithLoRA) + assert isinstance(model.get_submodule("layer1.dense1"), + 
ColumnParallelLinearWithLoRA) + assert isinstance(model.get_submodule("dense2"), RowParallelLinear) + assert isinstance(model.get_submodule("layer1.dense2"), + RowParallelLinearWithLoRA) + + +def test_lora_model_manager(dist_init, dummy_model): + model = dummy_model + model.supported_lora_modules = ["dense1", "dense2", "lm_head"] + model.packed_modules_mapping = {} + model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) + model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) + model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) + manager = LoRAModelManager( + model, 2, 2, 2, + LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2)) + assert all(x is None for x in manager.lora_index_to_id) + assert manager.add_adapter(model_lora1) + assert manager.activate_adapter(1) + assert manager.lora_index_to_id[0] == 1 + assert not manager.add_adapter(model_lora1) + assert not manager.activate_adapter(1) + assert manager.add_adapter(model_lora2) + assert manager.activate_adapter(2) + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + assert not manager.add_adapter(model_lora2) + assert not manager.activate_adapter(2) + assert manager.add_adapter(model_lora3) + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + with pytest.raises(ValueError): + assert manager.activate_adapter(3) + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + assert manager.remove_adapter(model_lora2.id) + assert manager.lora_index_to_id[1] is None + assert not manager.remove_adapter(model_lora2.id) + assert manager.remove_adapter(model_lora1.id) + assert not manager.remove_adapter(model_lora1.id) + assert manager.add_adapter(model_lora1) + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] is None + assert manager.add_adapter(model_lora2) + assert manager.activate_adapter(3) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] is None + assert manager.activate_adapter(2) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 2 + + +def test_lora_lru_cache_model_manager(dist_init, dummy_model): + model = dummy_model + model.supported_lora_modules = ["dense1", "dense2", "lm_head"] + model.packed_modules_mapping = {} + model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) + model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) + model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) + manager = LRUCacheLoRAModelManager( + model, 2, 2, 2, + LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2)) + assert all(x is None for x in manager.lora_index_to_id) + assert manager.add_adapter(model_lora1) + assert manager.activate_adapter(1) + assert manager.lora_index_to_id[0] == 1 + assert not manager.add_adapter(model_lora1) + assert not manager.activate_adapter(1) + assert manager.add_adapter(model_lora2) + assert manager.activate_adapter(2) + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + assert not manager.add_adapter(model_lora2) + assert not manager.activate_adapter(2) + assert manager.add_adapter(model_lora3) + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + assert manager.activate_adapter(3) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 2 + assert manager.remove_adapter(model_lora2.id) + assert manager.lora_index_to_id[1] is None + assert not 
manager.remove_adapter(model_lora2.id) + assert manager.remove_adapter(model_lora1.id) + assert not manager.remove_adapter(model_lora1.id) + assert manager.add_adapter(model_lora1) + assert manager.activate_adapter(1) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 1 + assert manager.add_adapter(model_lora2) + assert manager.deactivate_adapter(3) + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] == 1 + assert manager.activate_adapter(2) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 1 + assert manager.activate_adapter(3) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 3 + assert manager.pin_adapter(2) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 3 + assert manager.activate_adapter(1) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 1 + assert manager.deactivate_adapter(2) + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] == 1 + assert manager.activate_adapter(3) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 1 + assert manager.pin_adapter(3) + assert manager.pin_adapter(1) + with pytest.raises(RuntimeError): + assert manager.pin_adapter(2) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 1 + with pytest.raises(RuntimeError): + assert manager.activate_adapter(2) + + assert manager.deactivate_adapter(3) + assert manager.pin_adapter(2) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 1 + assert manager.remove_adapter(3) + with pytest.raises(ValueError): + assert manager.pin_adapter(3) + + +def test_lru_lora_model_manager(dist_init, dummy_model): + # This tests just the LRU cache functionality, everything else is + # tested in test_lora_model_manager + model = dummy_model + model.supported_lora_modules = ["dense1", "dense2", "lm_head"] + model.packed_modules_mapping = {} + model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) + model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) + model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) + model_lora4 = create_lora(4, model, ["dense1", "dense2", "lm_head"]) + manager = LRUCacheLoRAModelManager( + model, 2, 2, 2, + LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2)) + + assert all(x is None for x in manager.lora_index_to_id) + + # Add up to capacity + assert manager.add_adapter(model_lora1) + assert manager.add_adapter(model_lora2) + assert manager.activate_adapter(1) + assert manager.activate_adapter(2) + + assert set(manager.list_adapters()) == {1, 2} + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + + # Add over capacity + assert manager.add_adapter(model_lora3) + assert manager.add_adapter(model_lora4) + assert manager.activate_adapter(3) + assert manager.activate_adapter(4) + + assert set(manager.list_adapters()) == {3, 4} + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 4 + + # Add 3 again to move it to the top and then add 2 + # should return false since it's in already + assert not manager.add_adapter(model_lora3) + assert not manager.activate_adapter(3) + assert manager.add_adapter(model_lora2) + assert manager.activate_adapter(2) + + assert set(manager.list_adapters()) == {3, 2} + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 2 + + # Remove 
manually + assert manager.remove_adapter(3) + assert not manager.remove_adapter(3) + + assert set(manager.list_adapters()) == {2} + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] == 2 + + assert manager.add_adapter(model_lora3) + assert manager.activate_adapter(3) + assert manager.add_adapter(model_lora4) + assert manager.activate_adapter(4) + + assert set(manager.list_adapters()) == {3, 4} + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 4 + + assert manager.remove_oldest_adapter() + assert set(manager.list_adapters()) == {4} + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] == 4 + + assert manager.remove_oldest_adapter() + assert set(manager.list_adapters()) == set() + assert all(x is None for x in manager.lora_index_to_id) + + assert not manager.remove_oldest_adapter() + assert set(manager.list_adapters()) == set() + assert all(x is None for x in manager.lora_index_to_id) + + # pinning + assert manager.add_adapter(model_lora3) + assert manager.activate_adapter(3) + assert manager.add_adapter(model_lora4) + assert manager.activate_adapter(4) + assert set(manager.list_adapters()) == {3, 4} + with pytest.raises(ValueError): + assert manager.pin_adapter(1) + assert manager.pin_adapter(3) + # Remove manually + assert manager.remove_adapter(3) + assert not manager.remove_adapter(3) + + assert set(manager.list_adapters()) == {4} + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] == 4 + + assert manager.add_adapter(model_lora1) + assert manager.pin_adapter(1) + assert manager.add_adapter(model_lora2) + assert manager.activate_adapter(2) + + assert set(manager.list_adapters()) == {1, 2} + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + + assert manager.remove_oldest_adapter() + assert set(manager.list_adapters()) == {1} + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] is None + + with pytest.raises(RuntimeError): + assert manager.remove_oldest_adapter() + + assert set(manager.list_adapters()) == {1} + + +def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, + sql_lora_files): + lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) + worker_adapter_manager = LRUCacheWorkerLoRAManager( + 4, 2, llama_2_7b_model_extra_embeddings.model.unpadded_vocab_size - + lora_config.lora_extra_vocab_size, lora_config, torch.device("hpu"), + EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) + worker_adapter_manager.create_lora_manager( + llama_2_7b_model_extra_embeddings.model) + + mapping = LoRAMapping([], []) + worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("2", 2, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 2} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 + + worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("3", 3, sql_lora_files), + LoRARequest("4", 4, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 2, 3, 4} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 3 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4 + + 
worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("2", 2, sql_lora_files), + LoRARequest("5", 5, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4 + + worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("1", 1, sql_lora_files), + LoRARequest("1", 1, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4 + + worker_adapter_manager.set_active_adapters([ + LoRARequest("6", 6, sql_lora_files), + LoRARequest("7", 7, sql_lora_files), + LoRARequest("8", 8, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 6, 7, 8} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 7 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 8 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 6 + + # Over capacity + with pytest.raises(RuntimeError): + worker_adapter_manager.set_active_adapters([ + LoRARequest("10", 10, sql_lora_files), + LoRARequest("11", 11, sql_lora_files), + LoRARequest("12", 12, sql_lora_files), + LoRARequest("13", 13, sql_lora_files), + LoRARequest("14", 14, sql_lora_files) + ], mapping) + + +def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, + sql_lora_files): + # Should remove every LoRA not specified in the request. 
+ lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) + worker_adapter_manager = WorkerLoRAManager( + 4, 2, llama_2_7b_model_extra_embeddings.model.unpadded_vocab_size - + lora_config.lora_extra_vocab_size, lora_config, torch.device("hpu"), + EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) + worker_adapter_manager.create_lora_manager( + llama_2_7b_model_extra_embeddings.model) + + mapping = LoRAMapping([], []) + worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("2", 2, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 2} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 + + worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("3", 3, sql_lora_files), + LoRARequest("4", 4, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 3, 4} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 3 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 4 + + worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("2", 2, sql_lora_files), + LoRARequest("5", 5, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 2, 5} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5 + + worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("1", 1, sql_lora_files), + LoRARequest("1", 1, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] is None + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] is None + + worker_adapter_manager.set_active_adapters([ + LoRARequest("6", 6, sql_lora_files), + LoRARequest("7", 7, sql_lora_files), + LoRARequest("8", 8, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {6, 7, 8} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 8 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 6 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 7 + + # Over capacity + with pytest.raises(RuntimeError): + worker_adapter_manager.set_active_adapters([ + LoRARequest("10", 10, sql_lora_files), + LoRARequest("11", 11, sql_lora_files), + LoRARequest("12", 12, sql_lora_files), + LoRARequest("13", 13, sql_lora_files), + LoRARequest("14", 14, sql_lora_files) + ], mapping) + + +def test_packed_loras(dist_init, dummy_model_gate_up): + model = dummy_model_gate_up + model.supported_lora_modules = ["gate_up_proj"] + model.packed_modules_mapping = { + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + model_lora = create_packed_lora( + 1, + model, + module_name="gate_up_proj", + replaced_module_names=["gate_proj", "up_proj"]) + model_lora1 = create_packed_lora( + 2, + model, + module_name="gate_up_proj", + replaced_module_names=["gate_proj", "up_proj"], + empty_replaced_module_name="gate_proj", + ) + + manager = LoRAModelManager( + model, 2, 2, 2, + 
LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2)) + model = manager.model + + assert isinstance(model.get_submodule("gate_up_proj"), + MergedColumnParallelLinearWithLoRA) + assert manager.add_adapter(model_lora) + assert manager.add_adapter(model_lora1) + + packed_lora = model_lora.get_lora("gate_up_proj") + assert packed_lora and isinstance(packed_lora, PackedLoRALayerWeights) + + assert torch.allclose(packed_lora.lora_a[0], + model_lora.get_lora("gate_proj").lora_a) + assert torch.allclose(packed_lora.lora_b[0], + model_lora.get_lora("gate_proj").lora_b) + assert torch.allclose(packed_lora.lora_a[1], + model_lora.get_lora("up_proj").lora_a) + assert torch.allclose(packed_lora.lora_b[1], + model_lora.get_lora("up_proj").lora_b) + + packed_lora1 = model_lora1.get_lora("gate_up_proj") + assert packed_lora1 and isinstance(packed_lora1, PackedLoRALayerWeights) + + assert packed_lora1.lora_a[0] is None + assert packed_lora1.lora_b[0] is None + assert torch.allclose(packed_lora1.lora_a[1], + model_lora1.get_lora("up_proj").lora_a) + assert torch.allclose(packed_lora1.lora_b[1], + model_lora1.get_lora("up_proj").lora_b) From 188bd3adaa27a35cf05608e4383037d0ad2cb7e2 Mon Sep 17 00:00:00 2001 From: Ruheena Suhani Shaik Date: Mon, 23 Sep 2024 10:13:59 +0300 Subject: [PATCH 271/341] Added both hpu and gpu specific changes confest --- tests/lora/conftest.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index d3ebd15510284..099158798aa56 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -84,12 +84,16 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool): @pytest.fixture def dist_init(): temp_file = tempfile.mkstemp()[1] + if is_hpu(): + backend_type = "hccl" + else: + backend_type = "nccl" init_distributed_environment( world_size=1, rank=0, distributed_init_method=f"file://{temp_file}", local_rank=0, - backend="nccl", + backend=backend_type, ) initialize_model_parallel(1, 1) yield @@ -259,8 +263,13 @@ def get_model_patched(*, model_config, device_config, **kwargs): device_config=device_config, **kwargs) - with patch("vllm.worker.model_runner.get_model", get_model_patched): - engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) + if is_hpu(): + with patch("vllm.worker.habana_model_runner.get_model", get_model_patched): + engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) + else: + with patch("vllm.worker.model_runner.get_model", get_model_patched): + engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) + yield engine.llm_engine del engine cleanup() From f59495ad30dd838c5b2dbd83154e4182f7f1df16 Mon Sep 17 00:00:00 2001 From: Ruheena Suhani Shaik Date: Mon, 30 Sep 2024 14:32:35 +0300 Subject: [PATCH 272/341] Added the changes to conftest to fix test_lora_manager --- tests/lora/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 099158798aa56..0b7e381075637 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -262,7 +262,6 @@ def get_model_patched(*, model_config, device_config, **kwargs): return get_model_old(model_config=model_config, device_config=device_config, **kwargs) - if is_hpu(): with patch("vllm.worker.habana_model_runner.get_model", get_model_patched): engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) From b0a9d02ea329e8135aa4ace1681f3751ed99e227 Mon Sep 17 00:00:00 2001 From: Ruheena Suhani Shaik Date: Mon, 30 Sep 2024 14:27:12 +0300 Subject: 
[PATCH 273/341] Applied the format changes in conftest --- tests/lora/conftest.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 0b7e381075637..77fb0897f6113 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -84,10 +84,7 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool): @pytest.fixture def dist_init(): temp_file = tempfile.mkstemp()[1] - if is_hpu(): - backend_type = "hccl" - else: - backend_type = "nccl" + backend_type = "hccl" if is_hpu() else "nccl" init_distributed_environment( world_size=1, rank=0, @@ -262,8 +259,10 @@ def get_model_patched(*, model_config, device_config, **kwargs): return get_model_old(model_config=model_config, device_config=device_config, **kwargs) + if is_hpu(): - with patch("vllm.worker.habana_model_runner.get_model", get_model_patched): + with patch("vllm.worker.habana_model_runner.get_model", + get_model_patched): engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) else: with patch("vllm.worker.model_runner.get_model", get_model_patched): From 70f544c93baacf781fd37bc71c76d71d9fc3b2c8 Mon Sep 17 00:00:00 2001 From: Ruheena Suhani Shaik Date: Tue, 1 Oct 2024 08:36:03 +0300 Subject: [PATCH 274/341] Resolved format issues in conftest --- tests/lora/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 77fb0897f6113..35224d508fab3 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -267,7 +267,7 @@ def get_model_patched(*, model_config, device_config, **kwargs): else: with patch("vllm.worker.model_runner.get_model", get_model_patched): engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) - + yield engine.llm_engine del engine cleanup() From ec34f88ecb68af760aa1cd74f95a7b714e3c8039 Mon Sep 17 00:00:00 2001 From: Ruheena Suhani Shaik Date: Tue, 1 Oct 2024 10:14:54 +0300 Subject: [PATCH 275/341] Added changes of HPU flags --- tests/lora/conftest.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 35224d508fab3..1c30f4147e8b5 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -24,6 +24,7 @@ from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader import get_model +from vllm.platforms import current_platform class ContextIDInfo(TypedDict): @@ -48,18 +49,13 @@ class ContextInfo(TypedDict): }] -def is_hpu(): - from importlib import util - return util.find_spec('habana_frameworks') is not None - - def cleanup(): destroy_model_parallel() destroy_distributed_environment() with contextlib.suppress(AssertionError): torch.distributed.destroy_process_group() gc.collect() - if not is_hpu(): + if not current_platform.is_hpu(): torch.cuda.empty_cache() ray.shutdown() @@ -84,7 +80,7 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool): @pytest.fixture def dist_init(): temp_file = tempfile.mkstemp()[1] - backend_type = "hccl" if is_hpu() else "nccl" + backend_type = "hccl" if current_platform.is_hpu() else "nccl" init_distributed_environment( world_size=1, rank=0, @@ -260,7 +256,7 @@ def get_model_patched(*, model_config, device_config, **kwargs): device_config=device_config, **kwargs) - if is_hpu(): + if current_platform.is_hpu(): with patch("vllm.worker.habana_model_runner.get_model", get_model_patched): engine = 
vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) From da03d8b8fa14fbc1cb276d19849a6c40b86a8b0e Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar <49579433+hlahkar@users.noreply.github.com> Date: Thu, 3 Oct 2024 16:09:10 +0530 Subject: [PATCH 276/341] Lora Mask based on lora index (#348) Changes the filling of lora mask from lora_id to lora_index. This is needed to ensure that the mask does not fail in case lora id is greater than max_loras --- vllm/worker/habana_model_runner.py | 211 +++++++++++++++-------------- 1 file changed, 113 insertions(+), 98 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 79133aaf8f0f2..2d72be5690664 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -350,8 +350,7 @@ class PreparePromptMetadata(NamedTuple): lora_requests: Set[LoRARequest] multi_modal_kwargs: Optional[Dict[str, BatchedTensorInputs]] slot_mapping: List[List[int]] - lora_mask: Optional[torch.Tensor] - lora_logits_mask: Optional[torch.Tensor] + lora_ids: List[int] @classmethod def empty(cls): @@ -365,8 +364,7 @@ def empty(cls): lora_requests=set(), multi_modal_kwargs=None, slot_mapping=[], - lora_mask=None, - lora_logits_mask=None) + lora_ids=[]) class PrepareDecodeMetadata(NamedTuple): @@ -377,8 +375,7 @@ class PrepareDecodeMetadata(NamedTuple): lora_prompt_mapping: List[List[int]] lora_requests: Set[LoRARequest] slot_mapping: List[List[int]] - lora_mask: Optional[torch.Tensor] - lora_logits_mask: Optional[torch.Tensor] + lora_ids: List[int] @classmethod def empty(cls): @@ -389,8 +386,7 @@ def empty(cls): lora_prompt_mapping=[], lora_requests=set(), slot_mapping=[], - lora_mask=None, - lora_logits_mask=None) + lora_ids=[]) # How batches are constructed. @@ -425,8 +421,7 @@ class ModelInputForHPU(ModelRunnerInputBase): real_batch_size: Optional[int] = None batch_size_padded: Optional[int] = None virtual_engine: int = 0 - lora_mask: Optional[torch.Tensor] = None - lora_logits_mask: Optional[torch.Tensor] = None + lora_ids: Optional[List[int]] = None async_callback: Optional[Callable] = None def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: @@ -439,8 +434,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "real_batch_size": self.real_batch_size, "batch_size_padded": self.batch_size_padded, "virtual_engine": self.virtual_engine, - "lora_mask": self.lora_mask, - "lora_logits_mask": self.lora_logits_mask, + "lora_ids": self.lora_ids, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) return tensor_dict @@ -474,8 +468,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "lora_requests": self.lora_requests, "lora_mapping": self.lora_mapping, "multi_modal_kwargs": self.multi_modal_kwargs, - "lora_mask": self.lora_mask, - "lora_logits_mask": self.lora_logits_mask, + "lora_ids": self.lora_ids, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) _add_sampling_metadata_broadcastable_dict(tensor_dict, @@ -836,38 +829,14 @@ def _prepare_prompt( find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), self.block_size) - lora_mask: torch.Tensor = None - lora_logits_mask: torch.Tensor = None - counter = 0 - if self.lora_config: - lora_mask = torch.zeros( - len(seq_group_metadata_list) * max_prompt_len, - (self.lora_config.max_loras) * self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) - lora_logits_mask = torch.zeros(len(seq_group_metadata_list), - (self.lora_config.max_loras) * - self.lora_config.max_lora_rank, - 
dtype=self.lora_config.lora_dtype) - - ones = torch.ones(max_prompt_len, - self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) - logit_ones = torch.ones(1, - self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) + lora_ids: List[int] = [] for seq_group_metadata, context_len in zip(seq_group_metadata_list, context_lens): lora_id = seq_group_metadata.lora_int_id + lora_ids.append(lora_id) if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) - start_row = counter * max_prompt_len - end_row = start_row + max_prompt_len - start_col = (lora_id - 1) * self.lora_config.max_lora_rank - end_col = start_col + self.lora_config.max_lora_rank - lora_mask[start_row:end_row, start_col:end_col] = ones - lora_logits_mask[counter, start_col:end_col] = logit_ones - counter = counter + 1 lora_index_mapping += [lora_id] * (max_prompt_len - context_len) lora_prompt_mapping.extend( @@ -875,10 +844,6 @@ def _prepare_prompt( (max_prompt_len - context_len if seq_group_metadata.sampling_params.prompt_logprobs else 1)) - if lora_mask is not None: - lora_mask = lora_mask.to('hpu') - lora_logits_mask = lora_logits_mask.to('hpu') - input_tokens = make_tensor_with_pad(input_tokens, max_len=max_prompt_len, pad=0, @@ -919,20 +884,17 @@ def _prepare_prompt( ) multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) - return PreparePromptMetadata( - input_tokens=input_tokens, - input_positions=input_positions, - attn_metadata=attn_metadata, - seq_lens=seq_lens, - query_lens=query_lens, - lora_index_mapping=lora_index_mapping, - lora_prompt_mapping=lora_prompt_mapping, - lora_requests=lora_requests, - multi_modal_kwargs=multi_modal_kwargs, - slot_mapping=slot_mapping, - lora_mask=lora_mask, - lora_logits_mask=lora_logits_mask, - ) + return PreparePromptMetadata(input_tokens=input_tokens, + input_positions=input_positions, + attn_metadata=attn_metadata, + seq_lens=seq_lens, + query_lens=query_lens, + lora_index_mapping=lora_index_mapping, + lora_prompt_mapping=lora_prompt_mapping, + lora_requests=lora_requests, + multi_modal_kwargs=multi_modal_kwargs, + slot_mapping=slot_mapping, + lora_ids=lora_ids) def _prepare_decode( self, @@ -949,18 +911,7 @@ def _prepare_decode( if len(seq_group_metadata_list) == 0: return PrepareDecodeMetadata.empty() - lora_mask: torch.Tensor = None - lora_logits_mask: torch.Tensor = None - counter = 0 - - if self.lora_config: - lora_mask = torch.zeros(len(seq_group_metadata_list), - (self.lora_config.max_loras) * - self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) - ones = torch.ones(1, - self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) + lora_ids: List[int] = [] dummy_slots = itertools.cycle( range(_PAD_SLOT_ID, _PAD_SLOT_ID + self.block_size)) @@ -971,13 +922,10 @@ def _prepare_decode( seq_ids = list(seq_group_metadata.seq_data.keys()) lora_id = seq_group_metadata.lora_int_id + lora_ids.append(lora_id) if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) - start_pos = (lora_id - 1) * self.lora_config.max_lora_rank - end_pos = start_pos + self.lora_config.max_lora_rank - lora_mask[counter, start_pos:end_pos] = ones - counter = counter + 1 for seq_id in seq_ids: seq_data = seq_group_metadata.seq_data[seq_id] @@ -1012,9 +960,6 @@ def _prepare_decode( block_table = block_table[-sliding_window_blocks:] block_tables.append(block_table) - if lora_mask is not None: - lora_mask = lora_mask.to('hpu') - lora_logits_mask = lora_mask input_tokens = torch.tensor(input_tokens, dtype=torch.long, 
device=self.device) @@ -1075,17 +1020,14 @@ def _prepare_decode( num_decode_tokens=num_decode_tokens, slot_mapping=slot_mapping, ) - return PrepareDecodeMetadata( - input_tokens=input_tokens, - input_positions=input_positions, - attn_metadata=attn_metadata, - lora_index_mapping=lora_index_mapping, - lora_prompt_mapping=lora_prompt_mapping, - lora_requests=lora_requests, - slot_mapping=slot_mapping, - lora_mask=lora_mask, - lora_logits_mask=lora_logits_mask, - ) + return PrepareDecodeMetadata(input_tokens=input_tokens, + input_positions=input_positions, + attn_metadata=attn_metadata, + lora_index_mapping=lora_index_mapping, + lora_prompt_mapping=lora_prompt_mapping, + lora_requests=lora_requests, + slot_mapping=slot_mapping, + lora_ids=lora_ids) def prepare_input_tensors( self, @@ -1142,8 +1084,7 @@ def prepare_input_tensors( lora_requests, multi_modal_kwargs, slot_mapping, - lora_mask, - lora_logits_mask, + lora_ids, ) = self._prepare_prompt(prefill_reqs) ( decode_input_tokens, @@ -1153,8 +1094,7 @@ def prepare_input_tensors( decode_lora_prompt_mapping, decode_lora_requests, decode_slot_mapping, - decode_lora_mask, - decode_lora_logits_mask, + decode_lora_ids, ) = self._prepare_decode(decode_reqs) sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, seq_lens, query_lens, @@ -1181,8 +1121,7 @@ def prepare_input_tensors( lora_index_mapping = decode_lora_index_mapping lora_prompt_mapping = decode_lora_prompt_mapping lora_requests = decode_lora_requests - lora_mask = decode_lora_mask - lora_logits_mask = decode_lora_logits_mask + lora_ids = decode_lora_ids # FIXME: We need to adjust selected_token_indices to accommodate # for padding @@ -1252,8 +1191,7 @@ def prepare_input_tensors( multi_modal_kwargs=multi_modal_kwargs, real_batch_size=real_batch_size, batch_size_padded=batch_size_padded, - lora_mask=lora_mask, - lora_logits_mask=lora_logits_mask), \ + lora_ids=lora_ids), \ sampling_metadata def _seq_len(self, attn_metadata): @@ -1853,6 +1791,76 @@ def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode): logger.warning("Configuration: (%s, %s, %s) was not warmed-up!", phase, batch_size, seq_len) + def create_lora_mask(self, input_tokens: torch.Tensor, lora_ids: List[int], + is_prompt: bool): + ''' + This is a helper function to create the mask for lora computations. + Lora Mask is needed to ensure we match the correct lora weights for the + for the request. 
+ For Prompt phase we have + lora_mask with shape (batch_size * seq_len, max_loras * max_rank) + lora_logits_mask with shape (batch_size, max_loras * max_rank) + For Decode phase we have both + lora_mask and lora_logits_mask with shape + (batch_size, max_loras * max_rank) + ''' + lora_mask: torch.Tensor = None + lora_logits_mask: torch.Tensor = None + lora_index = 0 + + if self.lora_config: + if is_prompt: + lora_mask = torch.zeros( + input_tokens.shape[0] * input_tokens.shape[1], + (self.lora_config.max_loras) *\ + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + lora_logits_mask = torch.zeros( + input_tokens.shape[0], (self.lora_config.max_loras) * + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + + ones = torch.ones(input_tokens.shape[1], + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + logit_ones = torch.ones(1, + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + + for i in range(len(lora_ids)): + if lora_ids[i] == 0: + continue + lora_index = self.lora_manager._adapter_manager.\ + lora_index_to_id.index(lora_ids[i]) + start_row = i * input_tokens.shape[1] + end_row = start_row + input_tokens.shape[1] + start_col = lora_index * self.lora_config.max_lora_rank + end_col = start_col + self.lora_config.max_lora_rank + lora_mask[start_row:end_row, start_col:end_col] = ones + lora_logits_mask[i, start_col:end_col] = logit_ones + lora_mask = lora_mask.to('hpu') + lora_logits_mask = lora_logits_mask.to('hpu') + else: + lora_mask = torch.zeros(input_tokens.shape[0], + (self.lora_config.max_loras) * + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + ones = torch.ones(1, + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + for i in range(len(lora_ids)): + if lora_ids[i] == 0: + continue + lora_index = self.lora_manager._adapter_manager.\ + lora_index_to_id.index(lora_ids[i]) + start_pos = lora_index * self.lora_config.max_lora_rank + end_pos = start_pos + self.lora_config.max_lora_rank + lora_mask[i, start_pos:end_pos] = ones + lora_mask = lora_mask.to('hpu') + lora_logits_mask = lora_mask + + return lora_mask, lora_logits_mask + @torch.inference_mode() def execute_model( self, @@ -1887,13 +1895,21 @@ def execute_model( seq_len = self._seq_len(attn_metadata) use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) self._check_config(batch_size, seq_len, is_prompt, warmup_mode) + + lora_mask: torch.Tensor = None + lora_logits_mask: torch.Tensor = None + if self.lora_config: + assert model_input.lora_ids is not None + lora_mask, lora_logits_mask = self.create_lora_mask( + input_tokens, model_input.lora_ids, attn_metadata.is_prompt) + execute_model_kwargs = { "input_ids": input_tokens, "positions": input_positions, "kv_caches": kv_caches, "attn_metadata": self.trim_attn_metadata(attn_metadata), "intermediate_tensors": intermediate_tensors, - "lora_mask": model_input.lora_mask, + "lora_mask": lora_mask, **(model_input.multi_modal_kwargs or {}), } if htorch.utils.internal.is_lazy(): @@ -1915,7 +1931,6 @@ def execute_model( ) if self.lora_config: - lora_logits_mask: torch.Tensor = model_input.lora_logits_mask LoraMask.setLoraMask( lora_logits_mask.index_select( 0, sampling_metadata.selected_token_indices)) From f848d27b24d307e872bfed7572659882b341efaa Mon Sep 17 00:00:00 2001 From: Karol Damaszke Date: Thu, 3 Oct 2024 15:47:47 +0200 Subject: [PATCH 277/341] Add rope_scaling support for LLama3.1 (#356) Add support for rope scaling and FusedRoPE in LLama3.1 --- 
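Reviewer note: the functional change here is the new platform dispatch for the "llama3" rope_scaling type inside get_rope(). Below is a condensed, hedged sketch of that logic; it is a restatement of the rotary_embedding.py hunk further down (the helper name build_llama3_rope is illustrative only), and it assumes the pinned vllm-hpu-extension revision actually exposes HpuLlama3RotaryEmbedding as imported in the hunk.

```python
# Simplified sketch of the dispatch added in this patch for rope_scaling
# type "llama3". On HPU, the extension's HpuLlama3RotaryEmbedding is used,
# with the stock Llama3RotaryEmbedding passed in as RoPEFallback; every
# other platform keeps the existing path. Argument order mirrors the hunk
# (note the HPU constructor takes no dtype argument).
from vllm.platforms import current_platform
from vllm.model_executor.layers.rotary_embedding import Llama3RotaryEmbedding


def build_llama3_rope(head_size, rotary_dim, max_position, base, is_neox_style,
                      dtype, scaling_factor, low_freq_factor, high_freq_factor,
                      original_max_position):
    if current_platform.is_hpu():
        from vllm_hpu_extension.rotary_embed import HpuLlama3RotaryEmbedding
        return HpuLlama3RotaryEmbedding(
            head_size, rotary_dim, max_position, base, is_neox_style,
            scaling_factor, low_freq_factor, high_freq_factor,
            original_max_position, RoPEFallback=Llama3RotaryEmbedding)
    return Llama3RotaryEmbedding(head_size, rotary_dim, max_position, base,
                                 is_neox_style, dtype, scaling_factor,
                                 low_freq_factor, high_freq_factor,
                                 original_max_position)
```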
requirements-hpu.txt | 2 +- .../model_executor/layers/rotary_embedding.py | 26 ++++++++++++++----- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 62ff11eba81e2..602a5060c29aa 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -8,4 +8,4 @@ pandas tabulate setuptools>=61 setuptools-scm>=8 -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@940fdb7 +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@bb56d3b diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 5b746ae928b16..30bcf954c99b5 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -31,7 +31,8 @@ from vllm.platforms import current_platform if current_platform.is_hpu(): - from vllm_hpu_extension.rotary_embed import HpuRotaryEmbedding + from vllm_hpu_extension.rotary_embed import (HpuLlama3RotaryEmbedding, + HpuRotaryEmbedding) def _rotate_neox(x: torch.Tensor) -> torch.Tensor: @@ -943,12 +944,23 @@ def get_rope( high_freq_factor = rope_scaling["high_freq_factor"] original_max_position = rope_scaling[ "original_max_position_embeddings"] - rotary_emb = Llama3RotaryEmbedding(head_size, rotary_dim, - max_position, base, - is_neox_style, dtype, - scaling_factor, low_freq_factor, - high_freq_factor, - original_max_position) + if current_platform.is_hpu(): + rotary_emb = HpuLlama3RotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + scaling_factor, + low_freq_factor, + high_freq_factor, + original_max_position, + RoPEFallback=Llama3RotaryEmbedding) + else: + rotary_emb = Llama3RotaryEmbedding( + head_size, rotary_dim, max_position, base, is_neox_style, + dtype, scaling_factor, low_freq_factor, high_freq_factor, + original_max_position) elif scaling_type == "linear": rotary_emb = LinearScalingRotaryEmbedding(head_size, rotary_dim, max_position, base, From d8ba780f233ebb3a66fad1dbd879d6ac29a116df Mon Sep 17 00:00:00 2001 From: Marcin Swiniarski Date: Fri, 4 Oct 2024 10:33:42 +0200 Subject: [PATCH 278/341] [Core] Support Torch profiler in Habana Worker (#357) This PR allows to profile execution on HPU through flag VLLM_TORCH_PROFILER_DIR. Similar as it is done for GPU. The profiling can be controlled: 1. Asynchronously by posting requests to the server: a) to start collecting profile: ` curl -X POST http://localhost:8080/start_profile ` b) to stop collecting profile: ` curl -X POST http://localhost:8080/stop_profile ` 2. In script, by instructing LLM object to start and stop profiling: ```python from vllm import LLM, SamplingParams llm = LLM(...) 
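# NOTE: on HPU (as on GPU), traces are only collected when the process is
# launched with VLLM_TORCH_PROFILER_DIR pointing at an output directory;
# without it, start_profile()/stop_profile() raise "Profiler is not enabled."
# (see the habana_worker.py hunk below).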
llm.start_profile() llm.stop_profile() ``` --- vllm/engine/async_llm_engine.py | 7 +++++-- vllm/engine/llm_engine.py | 7 +++++-- vllm/engine/multiprocessing/engine.py | 7 +++++-- vllm/executor/habana_executor.py | 6 ++++++ vllm/worker/habana_worker.py | 27 +++++++++++++++++++++++++++ 5 files changed, 48 insertions(+), 6 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 9e6eecf992520..cb489084f48de 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -16,6 +16,7 @@ from vllm.engine.metrics_types import StatLoggerBase from vllm.executor.executor_base import ExecutorAsyncBase from vllm.executor.gpu_executor import GPUExecutorAsync +from vllm.executor.habana_executor import HabanaExecutorAsync from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import PromptType from vllm.logger import init_logger @@ -1204,7 +1205,8 @@ def remove_logger(self, logger_name: str) -> None: async def start_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes - if type(self.engine.model_executor) == GPUExecutorAsync: # noqa: E721 + if type(self.engine.model_executor) == GPUExecutorAsync or \ + type(self.engine.model_executor) == HabanaExecutorAsync: # noqa: E721 self.engine.model_executor.start_profile() else: self.engine.model_executor._run_workers("start_profile") @@ -1212,7 +1214,8 @@ async def start_profile(self) -> None: async def stop_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes - if type(self.engine.model_executor) == GPUExecutorAsync: # noqa: E721 + if type(self.engine.model_executor) == GPUExecutorAsync or \ + type(self.engine.model_executor) == HabanaExecutorAsync: # noqa: E721 self.engine.model_executor.stop_profile() else: self.engine.model_executor._run_workers("stop_profile") diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 5cb0161b73e2c..f41d074ad536c 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -28,6 +28,7 @@ from vllm.entrypoints.openai.logits_processors import get_logits_processors from vllm.executor.executor_base import ExecutorBase from vllm.executor.gpu_executor import GPUExecutor +from vllm.executor.habana_executor import HabanaExecutor from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import (INPUT_REGISTRY, EncoderDecoderLLMInputs, InputRegistry, LLMInputs, PromptType) @@ -1794,7 +1795,8 @@ def check_health(self) -> None: def start_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes (MultiprocessingGPUExecutor) - if type(self.model_executor) == GPUExecutor: # noqa: E721 + if type(self.model_executor) == GPUExecutor or \ + type(self.model_executor) == HabanaExecutor: # noqa: E721 self.model_executor.start_profile() else: self.model_executor._run_workers("start_profile") @@ -1802,7 +1804,8 @@ def start_profile(self) -> None: def stop_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes (MultiprocessingGPUExecutor) - if type(self.model_executor) == GPUExecutor: # noqa: E721 + if type(self.model_executor) == GPUExecutor or \ + type(self.model_executor) == HabanaExecutor: # noqa: E721 self.model_executor.stop_profile() else: self.model_executor._run_workers("stop_profile") diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index eecca82cd2f7d..49500099fbcaf 
100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -23,6 +23,7 @@ # yapf: enable from vllm.envs import VLLM_RPC_TIMEOUT from vllm.executor.gpu_executor import GPUExecutor +from vllm.executor.habana_executor import HabanaExecutor from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.usage.usage_lib import UsageContext @@ -364,13 +365,15 @@ def _alive(self): self._last_alive_time = time.time() def start_profile(self) -> None: - if type(self.engine.model_executor) is GPUExecutor: + if type(self.engine.model_executor) is GPUExecutor or \ + type(self.engine.model_executor) is HabanaExecutor: self.engine.model_executor.start_profile() else: self.engine.model_executor._run_workers("start_profile") def stop_profile(self) -> None: - if type(self.engine.model_executor) is GPUExecutor: + if type(self.engine.model_executor) is GPUExecutor or \ + type(self.engine.model_executor) is HabanaExecutor: self.engine.model_executor.stop_profile() else: self.engine.model_executor._run_workers("stop_profile") diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index e4bd54f8849b3..e6d0fbc0d431d 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -192,6 +192,12 @@ def check_health(self) -> None: # it's running. return + def start_profile(self) -> None: + self.driver_worker.start_profile() + + def stop_profile(self) -> None: + self.driver_worker.stop_profile() + def shutdown(self) -> None: self.driver_worker.shutdown_inc() diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 2e4dfeac42c3e..7fc1e48b8c960 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -11,6 +11,7 @@ import torch.distributed from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes +import vllm.envs as envs from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ObservabilityConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig, @@ -95,6 +96,32 @@ def __init__( self.cache_engine: List[CacheEngine] # Initialize gpu_cache as embedding models don't initialize kv_caches self.hpu_cache: Optional[List[List[torch.tensor]]] = None + # Torch profiler. Enabled and configured through env vars: + # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + if envs.VLLM_TORCH_PROFILER_DIR: + torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + logger.info("Profiling enabled. 
Traces will be saved to: %s", + torch_profiler_trace_dir) + self.profiler = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.HPU, + ], + with_stack=True, + on_trace_ready=torch.profiler.tensorboard_trace_handler( + torch_profiler_trace_dir, use_gzip=True)) + else: + self.profiler = None + + def start_profile(self): + if self.profiler is None: + raise RuntimeError("Profiler is not enabled.") + self.profiler.start() + + def stop_profile(self): + if self.profiler is None: + raise RuntimeError("Profiler is not enabled.") + self.profiler.stop() def _set_env_vars(self): local_rank = self.local_rank From 250487b567a889c8936acb119131b84fb242e423 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 18:35:25 +0300 Subject: [PATCH 279/341] [Refactor] Rename components *Habana* -> *HPU* --- vllm/engine/async_llm_engine.py | 16 ++++++++-------- vllm/engine/llm_engine.py | 14 +++++++------- vllm/engine/multiprocessing/engine.py | 6 +++--- .../{habana_executor.py => hpu_executor.py} | 8 ++++---- ...ay_habana_executor.py => ray_hpu_executor.py} | 8 ++++---- ...abana_model_runner.py => hpu_model_runner.py} | 8 ++++---- vllm/worker/{habana_worker.py => hpu_worker.py} | 6 +++--- vllm/worker/kzawora.code-workspace | 11 +++++++++++ 8 files changed, 44 insertions(+), 33 deletions(-) rename vllm/executor/{habana_executor.py => hpu_executor.py} (97%) rename vllm/executor/{ray_habana_executor.py => ray_hpu_executor.py} (99%) rename vllm/worker/{habana_model_runner.py => hpu_model_runner.py} (99%) rename vllm/worker/{habana_worker.py => hpu_worker.py} (99%) create mode 100644 vllm/worker/kzawora.code-workspace diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index cb489084f48de..a2a940148b87e 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -16,7 +16,7 @@ from vllm.engine.metrics_types import StatLoggerBase from vllm.executor.executor_base import ExecutorAsyncBase from vllm.executor.gpu_executor import GPUExecutorAsync -from vllm.executor.habana_executor import HabanaExecutorAsync +from vllm.executor.hpu_executor import HPUExecutorAsync from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import PromptType from vllm.logger import init_logger @@ -620,12 +620,12 @@ def _get_executor_cls( elif engine_config.device_config.device_type == "hpu": if distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) - from vllm.executor.ray_habana_executor import ( - RayHabanaExecutorAsync) - executor_class = RayHabanaExecutorAsync + from vllm.executor.ray_hpu_executor import ( + RayHPUExecutorAsync) + executor_class = RayHPUExecutorAsync else: - from vllm.executor.habana_executor import HabanaExecutorAsync - executor_class = HabanaExecutorAsync + from vllm.executor.hpu_executor import HPUExecutorAsync + executor_class = HPUExecutorAsync elif engine_config.device_config.device_type == "openvino": assert distributed_executor_backend is None, ( "Distributed execution is not supported with " @@ -1206,7 +1206,7 @@ async def start_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes if type(self.engine.model_executor) == GPUExecutorAsync or \ - type(self.engine.model_executor) == HabanaExecutorAsync: # noqa: E721 + type(self.engine.model_executor) == HPUExecutorAsync: # noqa: E721 self.engine.model_executor.start_profile() else: self.engine.model_executor._run_workers("start_profile") 
@@ -1215,7 +1215,7 @@ async def stop_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes if type(self.engine.model_executor) == GPUExecutorAsync or \ - type(self.engine.model_executor) == HabanaExecutorAsync: # noqa: E721 + type(self.engine.model_executor) == HPUExecutorAsync: # noqa: E721 self.engine.model_executor.stop_profile() else: self.engine.model_executor._run_workers("stop_profile") diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f41d074ad536c..3635443421e88 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -28,7 +28,7 @@ from vllm.entrypoints.openai.logits_processors import get_logits_processors from vllm.executor.executor_base import ExecutorBase from vllm.executor.gpu_executor import GPUExecutor -from vllm.executor.habana_executor import HabanaExecutor +from vllm.executor.hpu_executor import HPUExecutor from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import (INPUT_REGISTRY, EncoderDecoderLLMInputs, InputRegistry, LLMInputs, PromptType) @@ -533,11 +533,11 @@ def _get_executor_cls(cls, elif engine_config.device_config.device_type == "hpu": if distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) - from vllm.executor.ray_habana_executor import RayHabanaExecutor - executor_class = RayHabanaExecutor + from vllm.executor.ray_hpu_executor import RayHPUExecutor + executor_class = RayHPUExecutor else: - from vllm.executor.habana_executor import HabanaExecutor - executor_class = HabanaExecutor + from vllm.executor.hpu_executor import HPUExecutor + executor_class = HPUExecutor elif engine_config.device_config.device_type == "openvino": from vllm.executor.openvino_executor import OpenVINOExecutor executor_class = OpenVINOExecutor @@ -1796,7 +1796,7 @@ def start_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes (MultiprocessingGPUExecutor) if type(self.model_executor) == GPUExecutor or \ - type(self.model_executor) == HabanaExecutor: # noqa: E721 + type(self.model_executor) == HPUExecutor: # noqa: E721 self.model_executor.start_profile() else: self.model_executor._run_workers("start_profile") @@ -1805,7 +1805,7 @@ def stop_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes (MultiprocessingGPUExecutor) if type(self.model_executor) == GPUExecutor or \ - type(self.model_executor) == HabanaExecutor: # noqa: E721 + type(self.model_executor) == HPUExecutor: # noqa: E721 self.model_executor.stop_profile() else: self.model_executor._run_workers("stop_profile") diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 49500099fbcaf..3501f12c065cf 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -23,7 +23,7 @@ # yapf: enable from vllm.envs import VLLM_RPC_TIMEOUT from vllm.executor.gpu_executor import GPUExecutor -from vllm.executor.habana_executor import HabanaExecutor +from vllm.executor.hpu_executor import HPUExecutor from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.usage.usage_lib import UsageContext @@ -366,14 +366,14 @@ def _alive(self): def start_profile(self) -> None: if type(self.engine.model_executor) is GPUExecutor or \ - type(self.engine.model_executor) is HabanaExecutor: + type(self.engine.model_executor) is HPUExecutor: self.engine.model_executor.start_profile() else: 
self.engine.model_executor._run_workers("start_profile") def stop_profile(self) -> None: if type(self.engine.model_executor) is GPUExecutor or \ - type(self.engine.model_executor) is HabanaExecutor: + type(self.engine.model_executor) is HPUExecutor: self.engine.model_executor.stop_profile() else: self.engine.model_executor._run_workers("stop_profile") diff --git a/vllm/executor/habana_executor.py b/vllm/executor/hpu_executor.py similarity index 97% rename from vllm/executor/habana_executor.py rename to vllm/executor/hpu_executor.py index e6d0fbc0d431d..cc5609ebe5c8e 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/hpu_executor.py @@ -21,7 +21,7 @@ logger = init_logger(__name__) -class HabanaExecutor(ExecutorBase): +class HPUExecutor(ExecutorBase): uses_ray: bool = False @@ -57,8 +57,8 @@ def _create_worker(self, rank: int = 0, distributed_init_method: Optional[str] = None): wrapper = WorkerWrapperBase( - worker_module_name="vllm.worker.habana_worker", - worker_class_name="HabanaWorker", + worker_module_name="vllm.worker.hpu_worker", + worker_class_name="HPUWorker", ) wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank, distributed_init_method)) @@ -202,7 +202,7 @@ def shutdown(self) -> None: self.driver_worker.shutdown_inc() -class HabanaExecutorAsync(HabanaExecutor, ExecutorAsyncBase): +class HPUExecutorAsync(HPUExecutor, ExecutorAsyncBase): async def execute_model_async( self, diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_hpu_executor.py similarity index 99% rename from vllm/executor/ray_habana_executor.py rename to vllm/executor/ray_hpu_executor.py index 645bceb1af446..343fa43b0eda1 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_hpu_executor.py @@ -29,7 +29,7 @@ logger = init_logger(__name__) -class RayHabanaExecutor(DistributedGPUExecutor): +class RayHPUExecutor(DistributedGPUExecutor): uses_ray: bool = True @@ -90,8 +90,8 @@ def _get_worker_module_and_class( raise NotImplementedError( "Speculative decoding is not implemented for HPU") else: - worker_module_name = "vllm.worker.habana_worker" - worker_class_name = "HabanaWorker" + worker_module_name = "vllm.worker.hpu_worker" + worker_class_name = "HPUWorker" return (worker_module_name, worker_class_name, worker_class_fn) def _get_worker_wrapper_args(self) -> Dict[str, Any]: @@ -479,7 +479,7 @@ def __del__(self): self.shutdown() -class RayHabanaExecutorAsync(RayHabanaExecutor, DistributedGPUExecutorAsync): +class RayHPUExecutorAsync(RayHPUExecutor, DistributedGPUExecutorAsync): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/hpu_model_runner.py similarity index 99% rename from vllm/worker/habana_model_runner.py rename to vllm/worker/hpu_model_runner.py index 2d72be5690664..2ee3832e6e076 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -489,7 +489,7 @@ def from_broadcasted_tensor_dict( return cls(**tensor_dict) -class HabanaModelRunnerBase(ModelRunnerBase[TModelInputForHPU]): +class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]): """ Helper class for shared methods between GPU model runners. """ @@ -1730,8 +1730,8 @@ def unwrap_model(model): return modules -class HabanaModelRunner( - HabanaModelRunnerBase[ModelInputForHPUWithSamplingMetadata]): +class HPUModelRunner( + HPUModelRunnerBase[ModelInputForHPUWithSamplingMetadata]): """ GPU model runner with sampling step. 
""" @@ -1872,7 +1872,7 @@ def execute_model( ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: if num_steps > 1: raise ValueError( - "num_steps > 1 is not supported in HabanaModelRunner") + "num_steps > 1 is not supported in HPUModelRunner") if self.lora_config: assert model_input.lora_requests is not None diff --git a/vllm/worker/habana_worker.py b/vllm/worker/hpu_worker.py similarity index 99% rename from vllm/worker/habana_worker.py rename to vllm/worker/hpu_worker.py index 7fc1e48b8c960..59a5adf65ebc1 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/hpu_worker.py @@ -25,14 +25,14 @@ from vllm.sequence import ExecuteModelRequest from vllm.utils import hpu_backend_string, hpu_device_string, is_fake_hpu from vllm.worker.cache_engine import CacheEngine -from vllm.worker.habana_model_runner import HabanaModelRunner +from vllm.worker.hpu_model_runner import HPUModelRunner from vllm.worker.model_runner_base import ModelRunnerBase from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerInput logger = init_logger(__name__) -class HabanaWorker(LocalOrDistributedWorkerBase): +class HPUWorker(LocalOrDistributedWorkerBase): """A worker class that executes (a partition of) the model on a HPU. Each worker is associated with a single HPU. The worker is responsible for @@ -79,7 +79,7 @@ def __init__( from vllm.utils import init_cached_hf_modules init_cached_hf_modules() - self.model_runner: HabanaModelRunner = HabanaModelRunner( + self.model_runner: HPUModelRunner = HPUModelRunner( model_config, parallel_config, scheduler_config, diff --git a/vllm/worker/kzawora.code-workspace b/vllm/worker/kzawora.code-workspace new file mode 100644 index 0000000000000..d5ced898f0957 --- /dev/null +++ b/vllm/worker/kzawora.code-workspace @@ -0,0 +1,11 @@ +{ + "folders": [ + { + "path": "../../.." + }, + { + "path": "../.." + } + ], + "settings": {} +} \ No newline at end of file From eb095b3f4f98d4a64657da5bb4e4d3c825527d33 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 18:38:31 +0300 Subject: [PATCH 280/341] oopsie --- vllm/worker/kzawora.code-workspace | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 vllm/worker/kzawora.code-workspace diff --git a/vllm/worker/kzawora.code-workspace b/vllm/worker/kzawora.code-workspace deleted file mode 100644 index d5ced898f0957..0000000000000 --- a/vllm/worker/kzawora.code-workspace +++ /dev/null @@ -1,11 +0,0 @@ -{ - "folders": [ - { - "path": "../../.." - }, - { - "path": "../.." 
- } - ], - "settings": {} -} \ No newline at end of file From 65fa6f6bfa733c3cb64e090d9624e9afa335b1cf Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 18:42:14 +0300 Subject: [PATCH 281/341] format.sh --- vllm/engine/async_llm_engine.py | 3 +-- vllm/worker/hpu_model_runner.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index a2a940148b87e..3ba73b68580fb 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -620,8 +620,7 @@ def _get_executor_cls( elif engine_config.device_config.device_type == "hpu": if distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) - from vllm.executor.ray_hpu_executor import ( - RayHPUExecutorAsync) + from vllm.executor.ray_hpu_executor import RayHPUExecutorAsync executor_class = RayHPUExecutorAsync else: from vllm.executor.hpu_executor import HPUExecutorAsync diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 2ee3832e6e076..b1b62e6bde7f6 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1730,8 +1730,7 @@ def unwrap_model(model): return modules -class HPUModelRunner( - HPUModelRunnerBase[ModelInputForHPUWithSamplingMetadata]): +class HPUModelRunner(HPUModelRunnerBase[ModelInputForHPUWithSamplingMetadata]): """ GPU model runner with sampling step. """ From 05763607d8da304a12e0f218d97ae26d2b169e36 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 18:44:52 +0300 Subject: [PATCH 282/341] make yapf happy --- vllm/engine/async_llm_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 3ba73b68580fb..6f3b73dbeee20 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -620,7 +620,7 @@ def _get_executor_cls( elif engine_config.device_config.device_type == "hpu": if distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) - from vllm.executor.ray_hpu_executor import RayHPUExecutorAsync + from vllm.executor.ray_hpu_executor import RayHPUExecutorAsync executor_class = RayHPUExecutorAsync else: from vllm.executor.hpu_executor import HPUExecutorAsync From b4e26d3af5293c38cea95233defc7c834fc2b3fd Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 19:16:59 +0300 Subject: [PATCH 283/341] fix sampler metadata generation --- vllm/model_executor/sampling_metadata.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index ee02368bec8a8..84f35f75a0c32 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -284,7 +284,8 @@ def _prepare_seq_groups( else: # Decode prompt_logprob_len = 0 - query_len = query_lens[i] if query_lens is not None else 1 + query_len = query_lens[i] if query_lens is not None and len( + query_lens) > 0 else 1 sample_len = len(seq_ids) * query_len if do_sample else 0 if sampling_params.seed is not None and generators is not None: From cfe231d905fe9e3ecf779eaf62e5d177900a0e6e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 18:35:34 +0200 Subject: [PATCH 284/341] [Refactor] Rename components *Habana* -> *HPU* (#359) Refactoring Gaudi-specific components to use `hpu` name instead of `habana` (e.g. 
`habana_model_runner.py` -> `hpu_model_runner.py`, `habana_executor.py` -> `hpu_executor.py`, etc.), as suggested in the upstream PR. --- README_GAUDI.md | 78 +++++++++---------- .../getting_started/gaudi-installation.rst | 78 +++++++++---------- vllm/engine/async_llm_engine.py | 15 ++-- vllm/engine/llm_engine.py | 14 ++-- vllm/engine/multiprocessing/engine.py | 6 +- .../{habana_executor.py => hpu_executor.py} | 8 +- ...habana_executor.py => ray_hpu_executor.py} | 8 +- ...na_model_runner.py => hpu_model_runner.py} | 7 +- .../{habana_worker.py => hpu_worker.py} | 6 +- 9 files changed, 109 insertions(+), 111 deletions(-) rename vllm/executor/{habana_executor.py => hpu_executor.py} (97%) rename vllm/executor/{ray_habana_executor.py => ray_hpu_executor.py} (99%) rename vllm/worker/{habana_model_runner.py => hpu_model_runner.py} (99%) rename vllm/worker/{habana_worker.py => hpu_worker.py} (99%) diff --git a/README_GAUDI.md b/README_GAUDI.md index 04e2ff22f96e5..6ba3bb50d4a04 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -195,10 +195,10 @@ batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: ``` {.} -INFO 08-01 21:37:59 habana_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] -INFO 08-01 21:37:59 habana_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] -INFO 08-01 21:37:59 habana_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] -INFO 08-01 21:37:59 habana_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] +INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] +INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] +INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] ``` `min` determines the lowest value of the bucket. 
`step` determines the @@ -267,17 +267,17 @@ graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup: ``` {.} -INFO 08-01 22:26:47 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB -INFO 08-01 22:26:47 habana_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB -INFO 08-01 22:26:48 habana_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB +INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB +INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB +INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB ... -INFO 08-01 22:26:59 habana_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB -INFO 08-01 22:27:00 habana_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB -INFO 08-01 22:27:00 habana_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB -INFO 08-01 22:27:01 habana_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB +INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB +INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB +INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB +INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB ... -INFO 08-01 22:27:16 habana_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB -INFO 08-01 22:27:16 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB +INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB +INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB ``` This example uses the same buckets as in *Bucketing mechanism* section. 
@@ -374,35 +374,35 @@ Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): ``` {.} -INFO 08-02 17:37:44 habana_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] -INFO 08-02 17:37:44 habana_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] -INFO 08-02 17:37:44 habana_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] -INFO 08-02 17:37:44 habana_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] -INFO 08-02 17:37:52 habana_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) -INFO 08-02 17:37:52 habana_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) -INFO 08-02 17:37:52 habana_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) -INFO 08-02 17:37:54 habana_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) -INFO 08-02 17:37:54 habana_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache -INFO 08-02 17:37:54 habana_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 -INFO 08-02 17:37:54 habana_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) -INFO 08-02 17:37:54 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB +INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] +INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] +INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] +INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), 
(2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) +INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache +INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 +INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) +INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB ... -INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB -INFO 08-02 17:38:22 habana_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 4.755 GiB for prompt and 11.095 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) -INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB +INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB +INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 4.755 GiB for prompt and 11.095 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) +INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB ... -INFO 08-02 17:38:26 habana_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB -INFO 08-02 17:38:27 habana_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB +INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB +INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB ... 
-INFO 08-02 17:38:41 habana_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB -INFO 08-02 17:38:41 habana_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB -INFO 08-02 17:38:42 habana_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB -INFO 08-02 17:38:42 habana_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB -INFO 08-02 17:38:43 habana_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB -INFO 08-02 17:38:43 habana_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] -INFO 08-02 17:38:43 habana_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] -INFO 08-02 17:38:43 habana_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory -INFO 08-02 17:38:43 habana_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) +INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB +INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB +INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB +INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB +INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB +INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] +INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory +INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB 
used) ``` Recommended vLLM Parameters diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index db1d8666e4800..5915de92802d9 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -173,10 +173,10 @@ Bucketing ranges are determined with 3 parameters - ``min``, ``step`` and ``max` .. code-block:: - INFO 08-01 21:37:59 habana_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] - INFO 08-01 21:37:59 habana_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] - INFO 08-01 21:37:59 habana_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] - INFO 08-01 21:37:59 habana_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] + INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] + INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] + INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] ``min`` determines the lowest value of the bucket. ``step`` determines the interval between buckets, and ``max`` determines the upper bound of the bucket. Furthermore, interval between ``min`` and ``step`` has special handling - ``min`` gets multiplied by consecutive powers of two, until ``step`` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes. @@ -216,17 +216,17 @@ Warmup is an optional, but highly recommended step occurring before vLLM server .. 
code-block:: - INFO 08-01 22:26:47 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB - INFO 08-01 22:26:47 habana_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB - INFO 08-01 22:26:48 habana_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB + INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB + INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB + INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB ... - INFO 08-01 22:26:59 habana_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB - INFO 08-01 22:27:00 habana_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB - INFO 08-01 22:27:00 habana_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB - INFO 08-01 22:27:01 habana_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB + INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB + INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB + INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB + INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB ... - INFO 08-01 22:27:16 habana_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB - INFO 08-01 22:27:16 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB + INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB + INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. @@ -266,35 +266,35 @@ Each described step is logged by vLLM server, as follows (negative values corres .. 
code-block:: - INFO 08-02 17:37:44 habana_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] - INFO 08-02 17:37:44 habana_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] - INFO 08-02 17:37:44 habana_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] - INFO 08-02 17:37:44 habana_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] - INFO 08-02 17:37:52 habana_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) - INFO 08-02 17:37:52 habana_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) - INFO 08-02 17:37:52 habana_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) - INFO 08-02 17:37:54 habana_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) - INFO 08-02 17:37:54 habana_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache - INFO 08-02 17:37:54 habana_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 - INFO 08-02 17:37:54 habana_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) - INFO 08-02 17:37:54 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB + INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] + INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] + INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] + INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 
384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) + INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache + INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 + INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) + INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB ... - INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB - INFO 08-02 17:38:22 habana_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) - INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB + INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB + INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) + INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB ... - INFO 08-02 17:38:26 habana_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB - INFO 08-02 17:38:27 habana_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB + INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB + INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB ... 
- INFO 08-02 17:38:41 habana_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB - INFO 08-02 17:38:41 habana_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB - INFO 08-02 17:38:42 habana_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB - INFO 08-02 17:38:42 habana_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB - INFO 08-02 17:38:43 habana_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB - INFO 08-02 17:38:43 habana_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] - INFO 08-02 17:38:43 habana_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] - INFO 08-02 17:38:43 habana_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory - INFO 08-02 17:38:43 habana_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) + INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB + INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB + INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB + INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB + INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB + INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] + INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory + INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory 
(475.4 GiB/1007 GiB used) Recommended vLLM Parameters diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index cb489084f48de..6f3b73dbeee20 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -16,7 +16,7 @@ from vllm.engine.metrics_types import StatLoggerBase from vllm.executor.executor_base import ExecutorAsyncBase from vllm.executor.gpu_executor import GPUExecutorAsync -from vllm.executor.habana_executor import HabanaExecutorAsync +from vllm.executor.hpu_executor import HPUExecutorAsync from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import PromptType from vllm.logger import init_logger @@ -620,12 +620,11 @@ def _get_executor_cls( elif engine_config.device_config.device_type == "hpu": if distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) - from vllm.executor.ray_habana_executor import ( - RayHabanaExecutorAsync) - executor_class = RayHabanaExecutorAsync + from vllm.executor.ray_hpu_executor import RayHPUExecutorAsync + executor_class = RayHPUExecutorAsync else: - from vllm.executor.habana_executor import HabanaExecutorAsync - executor_class = HabanaExecutorAsync + from vllm.executor.hpu_executor import HPUExecutorAsync + executor_class = HPUExecutorAsync elif engine_config.device_config.device_type == "openvino": assert distributed_executor_backend is None, ( "Distributed execution is not supported with " @@ -1206,7 +1205,7 @@ async def start_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes if type(self.engine.model_executor) == GPUExecutorAsync or \ - type(self.engine.model_executor) == HabanaExecutorAsync: # noqa: E721 + type(self.engine.model_executor) == HPUExecutorAsync: # noqa: E721 self.engine.model_executor.start_profile() else: self.engine.model_executor._run_workers("start_profile") @@ -1215,7 +1214,7 @@ async def stop_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes if type(self.engine.model_executor) == GPUExecutorAsync or \ - type(self.engine.model_executor) == HabanaExecutorAsync: # noqa: E721 + type(self.engine.model_executor) == HPUExecutorAsync: # noqa: E721 self.engine.model_executor.stop_profile() else: self.engine.model_executor._run_workers("stop_profile") diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f41d074ad536c..3635443421e88 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -28,7 +28,7 @@ from vllm.entrypoints.openai.logits_processors import get_logits_processors from vllm.executor.executor_base import ExecutorBase from vllm.executor.gpu_executor import GPUExecutor -from vllm.executor.habana_executor import HabanaExecutor +from vllm.executor.hpu_executor import HPUExecutor from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import (INPUT_REGISTRY, EncoderDecoderLLMInputs, InputRegistry, LLMInputs, PromptType) @@ -533,11 +533,11 @@ def _get_executor_cls(cls, elif engine_config.device_config.device_type == "hpu": if distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) - from vllm.executor.ray_habana_executor import RayHabanaExecutor - executor_class = RayHabanaExecutor + from vllm.executor.ray_hpu_executor import RayHPUExecutor + executor_class = RayHPUExecutor else: - from vllm.executor.habana_executor import HabanaExecutor - executor_class = HabanaExecutor + from vllm.executor.hpu_executor import HPUExecutor 
+ executor_class = HPUExecutor elif engine_config.device_config.device_type == "openvino": from vllm.executor.openvino_executor import OpenVINOExecutor executor_class = OpenVINOExecutor @@ -1796,7 +1796,7 @@ def start_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes (MultiprocessingGPUExecutor) if type(self.model_executor) == GPUExecutor or \ - type(self.model_executor) == HabanaExecutor: # noqa: E721 + type(self.model_executor) == HPUExecutor: # noqa: E721 self.model_executor.start_profile() else: self.model_executor._run_workers("start_profile") @@ -1805,7 +1805,7 @@ def stop_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes (MultiprocessingGPUExecutor) if type(self.model_executor) == GPUExecutor or \ - type(self.model_executor) == HabanaExecutor: # noqa: E721 + type(self.model_executor) == HPUExecutor: # noqa: E721 self.model_executor.stop_profile() else: self.model_executor._run_workers("stop_profile") diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 49500099fbcaf..3501f12c065cf 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -23,7 +23,7 @@ # yapf: enable from vllm.envs import VLLM_RPC_TIMEOUT from vllm.executor.gpu_executor import GPUExecutor -from vllm.executor.habana_executor import HabanaExecutor +from vllm.executor.hpu_executor import HPUExecutor from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.usage.usage_lib import UsageContext @@ -366,14 +366,14 @@ def _alive(self): def start_profile(self) -> None: if type(self.engine.model_executor) is GPUExecutor or \ - type(self.engine.model_executor) is HabanaExecutor: + type(self.engine.model_executor) is HPUExecutor: self.engine.model_executor.start_profile() else: self.engine.model_executor._run_workers("start_profile") def stop_profile(self) -> None: if type(self.engine.model_executor) is GPUExecutor or \ - type(self.engine.model_executor) is HabanaExecutor: + type(self.engine.model_executor) is HPUExecutor: self.engine.model_executor.stop_profile() else: self.engine.model_executor._run_workers("stop_profile") diff --git a/vllm/executor/habana_executor.py b/vllm/executor/hpu_executor.py similarity index 97% rename from vllm/executor/habana_executor.py rename to vllm/executor/hpu_executor.py index e6d0fbc0d431d..cc5609ebe5c8e 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/hpu_executor.py @@ -21,7 +21,7 @@ logger = init_logger(__name__) -class HabanaExecutor(ExecutorBase): +class HPUExecutor(ExecutorBase): uses_ray: bool = False @@ -57,8 +57,8 @@ def _create_worker(self, rank: int = 0, distributed_init_method: Optional[str] = None): wrapper = WorkerWrapperBase( - worker_module_name="vllm.worker.habana_worker", - worker_class_name="HabanaWorker", + worker_module_name="vllm.worker.hpu_worker", + worker_class_name="HPUWorker", ) wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank, distributed_init_method)) @@ -202,7 +202,7 @@ def shutdown(self) -> None: self.driver_worker.shutdown_inc() -class HabanaExecutorAsync(HabanaExecutor, ExecutorAsyncBase): +class HPUExecutorAsync(HPUExecutor, ExecutorAsyncBase): async def execute_model_async( self, diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_hpu_executor.py similarity index 99% rename from vllm/executor/ray_habana_executor.py rename to vllm/executor/ray_hpu_executor.py index 
645bceb1af446..343fa43b0eda1 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_hpu_executor.py @@ -29,7 +29,7 @@ logger = init_logger(__name__) -class RayHabanaExecutor(DistributedGPUExecutor): +class RayHPUExecutor(DistributedGPUExecutor): uses_ray: bool = True @@ -90,8 +90,8 @@ def _get_worker_module_and_class( raise NotImplementedError( "Speculative decoding is not implemented for HPU") else: - worker_module_name = "vllm.worker.habana_worker" - worker_class_name = "HabanaWorker" + worker_module_name = "vllm.worker.hpu_worker" + worker_class_name = "HPUWorker" return (worker_module_name, worker_class_name, worker_class_fn) def _get_worker_wrapper_args(self) -> Dict[str, Any]: @@ -479,7 +479,7 @@ def __del__(self): self.shutdown() -class RayHabanaExecutorAsync(RayHabanaExecutor, DistributedGPUExecutorAsync): +class RayHPUExecutorAsync(RayHPUExecutor, DistributedGPUExecutorAsync): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/hpu_model_runner.py similarity index 99% rename from vllm/worker/habana_model_runner.py rename to vllm/worker/hpu_model_runner.py index 2d72be5690664..b1b62e6bde7f6 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -489,7 +489,7 @@ def from_broadcasted_tensor_dict( return cls(**tensor_dict) -class HabanaModelRunnerBase(ModelRunnerBase[TModelInputForHPU]): +class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]): """ Helper class for shared methods between GPU model runners. """ @@ -1730,8 +1730,7 @@ def unwrap_model(model): return modules -class HabanaModelRunner( - HabanaModelRunnerBase[ModelInputForHPUWithSamplingMetadata]): +class HPUModelRunner(HPUModelRunnerBase[ModelInputForHPUWithSamplingMetadata]): """ GPU model runner with sampling step. """ @@ -1872,7 +1871,7 @@ def execute_model( ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: if num_steps > 1: raise ValueError( - "num_steps > 1 is not supported in HabanaModelRunner") + "num_steps > 1 is not supported in HPUModelRunner") if self.lora_config: assert model_input.lora_requests is not None diff --git a/vllm/worker/habana_worker.py b/vllm/worker/hpu_worker.py similarity index 99% rename from vllm/worker/habana_worker.py rename to vllm/worker/hpu_worker.py index 7fc1e48b8c960..59a5adf65ebc1 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/hpu_worker.py @@ -25,14 +25,14 @@ from vllm.sequence import ExecuteModelRequest from vllm.utils import hpu_backend_string, hpu_device_string, is_fake_hpu from vllm.worker.cache_engine import CacheEngine -from vllm.worker.habana_model_runner import HabanaModelRunner +from vllm.worker.hpu_model_runner import HPUModelRunner from vllm.worker.model_runner_base import ModelRunnerBase from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerInput logger = init_logger(__name__) -class HabanaWorker(LocalOrDistributedWorkerBase): +class HPUWorker(LocalOrDistributedWorkerBase): """A worker class that executes (a partition of) the model on a HPU. Each worker is associated with a single HPU. 
The worker is responsible for @@ -79,7 +79,7 @@ def __init__( from vllm.utils import init_cached_hf_modules init_cached_hf_modules() - self.model_runner: HabanaModelRunner = HabanaModelRunner( + self.model_runner: HPUModelRunner = HPUModelRunner( model_config, parallel_config, scheduler_config, From 76cbbb5deeff3e3d760aff2487f284234c4fd5bb Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 19:45:47 +0300 Subject: [PATCH 285/341] Use BF16 on HPU by default --- vllm/config.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/config.py b/vllm/config.py index 786ed1586a3ea..b3329f1c449ff 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1635,6 +1635,13 @@ def _get_and_verify_dtype( torch_dtype = torch.float16 else: torch_dtype = config_dtype + + if current_platform.is_hpu() and config_dtype == torch.float16: + logger.info( + "For HPU, we cast models to bfloat16 instead of" + "using float16 by default. Please specify `dtype` if you " + "want to use float16.") + torch_dtype = torch.bfloat16 else: if dtype not in _STR_DTYPE_TO_TORCH_DTYPE: raise ValueError(f"Unknown dtype: {dtype}") From d7d609fb7b99dcab54c851c7f61430345d441a1b Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 20:01:57 +0300 Subject: [PATCH 286/341] Revert "Support loading checkpoints quantized using Autofp8 (#286)" This reverts commit 29fb5edd1df36aa4fa0ff95c7b2cbb711b8cb035. --- .../layers/fused_moe/fused_moe.py | 4 -- .../compressed_tensors/compressed_tensors.py | 9 ++--- .../schemes/compressed_tensors_w8a8_fp8.py | 4 +- .../model_executor/layers/quantization/fp8.py | 24 ++++------- .../layers/quantization/utils/w8a8_utils.py | 40 ++++--------------- vllm/worker/hpu_model_runner.py | 3 +- 6 files changed, 22 insertions(+), 62 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 52f748675f752..b1d3bc0a5f054 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -13,10 +13,6 @@ from vllm.logger import init_logger from vllm.platforms import current_platform -if current_platform.is_hpu(): - from vllm_hpu_extension.ops import scaled_fp8_quant - ops.scaled_fp8_quant = scaled_fp8_quant - logger = init_logger(__name__) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 670a27d5076bc..abb18d31b5a82 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -245,10 +245,8 @@ def _get_scheme_from_parts( # TODO @dsikka: clean-up conditions if is_activation_quantization_format(self.quant_format): if self._is_fp8_w8a8(weight_quant, input_quant): - is_fp8_w8a8_supported = current_platform.is_hpu() or \ - self._check_scheme_supported( - CompressedTensorsW8A8Fp8.get_min_capability(), - error=False) + is_fp8_w8a8_supported = self._check_scheme_supported( + CompressedTensorsW8A8Fp8.get_min_capability(), error=False) if is_fp8_w8a8_supported: return CompressedTensorsW8A8Fp8( strategy=weight_quant.strategy, @@ -320,8 +318,7 @@ def get_scheme( # Raise error if device does not support the scheme # (e.g. 
fp8 needs ada lovelace) - if not current_platform.is_hpu(): - self._check_scheme_supported(scheme.get_min_capability()) + self._check_scheme_supported(scheme.get_min_capability()) return scheme diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 29f3228c0dc5d..5931ec36c97d5 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -13,7 +13,6 @@ from vllm.model_executor.parameter import (ChannelQuantScaleParameter, ModelWeightParameter, PerTensorScaleParameter) -from vllm.platforms import current_platform from vllm.utils import is_hip __all__ = ["CompressedTensorsW8A8Fp8"] @@ -24,8 +23,7 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): def __init__(self, strategy: str, is_static_input_scheme: bool): self.strategy = strategy self.is_static_input_scheme = is_static_input_scheme - self.cutlass_fp8_supported = not current_platform.is_hpu() and \ - cutlass_fp8_supported() + self.cutlass_fp8_supported = cutlass_fp8_supported() @classmethod def get_min_capability(cls) -> int: diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 88915942220ca..b5feb55db0e74 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -28,10 +28,6 @@ from vllm.platforms import current_platform from vllm.utils import is_hip, print_warning_once -if current_platform.is_hpu(): - from vllm_hpu_extension.ops import scaled_fp8_quant - ops.scaled_fp8_quant = scaled_fp8_quant - ACTIVATION_SCHEMES = ["static", "dynamic"] logger = init_logger(__name__) @@ -120,18 +116,14 @@ class Fp8LinearMethod(LinearMethodBase): def __init__(self, quant_config: Fp8Config): self.quant_config = quant_config - if current_platform.is_cuda_alike(): - self.cutlass_fp8_supported = cutlass_fp8_supported() - - # For GPUs that lack FP8 hardware support, we can leverage the - # Marlin kernel for fast weight-only FP8 quantization - self.use_marlin = (not current_platform.has_device_capability(89) - or envs.VLLM_TEST_FORCE_FP8_MARLIN) - # Disable marlin for rocm - if is_hip(): - self.use_marlin = False - else: - self.cutlass_fp8_supported = False + self.cutlass_fp8_supported = cutlass_fp8_supported() + + # For GPUs that lack FP8 hardware support, we can leverage the Marlin + # kernel for fast weight-only FP8 quantization + self.use_marlin = (not current_platform.has_device_capability(89) + or envs.VLLM_TEST_FORCE_FP8_MARLIN) + # Disable marlin for rocm + if is_hip(): self.use_marlin = False def create_weights( diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index bf1aa6fbd5dca..411af922149fd 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -10,11 +10,6 @@ # from pytorch 2.5. 
Allocating a dummy tensor to pass as input_scale TORCH_DEVICE_IDENTITY = torch.ones(1).cuda() if is_hip() else None -if current_platform.is_hpu(): - import habana_frameworks.torch.utils.experimental as htexp - from vllm_hpu_extension.ops import scaled_fp8_quant - ops.scaled_fp8_quant = scaled_fp8_quant - def cutlass_fp8_supported() -> bool: # cutlass is not supported on Rocm @@ -30,15 +25,7 @@ def cutlass_fp8_supported() -> bool: def per_tensor_dequantize( tensor: torch.Tensor, inv_scale: Union[float, torch.Tensor]) -> torch.Tensor: - dtype = torch.float16 - device = tensor.device - if current_platform.is_hpu(): - dtype = torch.bfloat16 - if htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi2: - #dequant on cpu to avoid nan on gaudi2 - tensor = tensor.to('cpu') - - fake_qweight = tensor.to(dtype).to(device) + fake_qweight = tensor.to(torch.float16) dq_weight = fake_qweight * inv_scale return dq_weight @@ -71,10 +58,7 @@ def requantize_with_max_scale( logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]: # Max scale to be used for requanitzation. max_w_scale = weight_scale.max() - if current_platform.is_hpu() and htexp._get_device_type( - ) == htexp.synDeviceType.synDeviceGaudi2: - max_w_scale = max_w_scale * (torch.finfo(torch.float8_e4m3fn).max / - torch.finfo(torch.float8_e4m3fnuz).max) + # QKV / MLP is fused in the on disk checkpoint if any of the # weight scales are still set to the default since we initialize # N weight scales for N shards but we only load 1 weight scale @@ -145,20 +129,12 @@ def apply_fp8_linear( if per_tensor_weights and per_tensor_activations: # Fused GEMM_DQ - if current_platform.is_hpu(): - #hpu does not support torch._scaled_mm (SW-197036) - output = torch.ops.hpu.fp8_gemm_v2(qinput, False, weight, - False, None, input.dtype, - x_scale, weight_scale, None, - False) - else: - output = torch._scaled_mm(qinput, - weight, - out_dtype=input.dtype, - scale_a=x_scale, - scale_b=weight_scale, - bias=bias) - + output = torch._scaled_mm(qinput, + weight, + out_dtype=input.dtype, + scale_a=x_scale, + scale_b=weight_scale, + bias=bias) # A fix for discrepancy in scaled_mm which returns tuple # for torch < 2.5 and a single value in torch >= 2.5 if type(output) is tuple and len(output) == 2: diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 47bcda1f753af..72052b0adeb31 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -587,7 +587,8 @@ def _set_gc_threshold(self) -> None: def load_model(self) -> None: import habana_frameworks.torch.core as htcore - htcore.hpu_set_env() + if self.model_config.quantization == 'inc': + htcore.hpu_set_env() with HabanaMemoryProfiler() as m: with HabanaMemoryProfiler() as m_getmodel: self.model = get_model(model_config=self.model_config, From c07cbc653c03ad75cd0b10eb1c66e01df97acaba Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 20:03:30 +0300 Subject: [PATCH 287/341] remove lora test --- tests/lora/test_lora_manager_hpu.py | 553 ---------------------------- 1 file changed, 553 deletions(-) delete mode 100644 tests/lora/test_lora_manager_hpu.py diff --git a/tests/lora/test_lora_manager_hpu.py b/tests/lora/test_lora_manager_hpu.py deleted file mode 100644 index ef3bf5272d709..0000000000000 --- a/tests/lora/test_lora_manager_hpu.py +++ /dev/null @@ -1,553 +0,0 @@ -import os -from typing import Dict, List - -import pytest -import torch -from safetensors.torch import load_file -from torch import nn - -from vllm.config import LoRAConfig 
-from vllm.lora.layers import (ColumnParallelLinearWithLoRA, - MergedColumnParallelLinearWithLoRA, - RowParallelLinearWithLoRA) -from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights -from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager, - LRUCacheLoRAModelManager) -from vllm.lora.request import LoRARequest -from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager, - WorkerLoRAManager) -from vllm.model_executor.layers.linear import RowParallelLinear - -EMBEDDING_MODULES = { - "embed_tokens": "input_embeddings", - "lm_head": "output_embeddings", -} - -EMBEDDING_PADDING_MODULES = ["lm_head"] - - -def test_from_lora_tensors(sql_lora_files): - tensors = load_file( - os.path.join(sql_lora_files, "adapter_model.safetensors")) - new_embeddings = load_file( - os.path.join(sql_lora_files, "new_embeddings.safetensors")) - lora_model = LoRAModel.from_lora_tensors( - 1, - 8, - 16, - tensors, - "hpu", - embeddings=new_embeddings, - embedding_modules=EMBEDDING_MODULES, - embedding_padding_modules=EMBEDDING_PADDING_MODULES) - for module_name, lora in lora_model.loras.items(): - assert lora.module_name == module_name - assert lora.rank == 8 - assert lora.lora_alpha == 16 - assert lora.lora_a is not None - assert lora.lora_b is not None - assert (lora.lora_a.shape[1] == lora.lora_b.shape[0] - ), f"{lora.lora_a.shape=}, {lora.lora_b.shape=}" - assert lora.lora_a.shape[1] == 8 - embeddings_module = next( - (k for k in EMBEDDING_MODULES if k in module_name), None) - if embeddings_module: - assert torch.equal( - lora.embeddings_tensor, - new_embeddings[EMBEDDING_MODULES[embeddings_module]].to( - device=lora.embeddings_tensor.device)) - else: - assert lora.embeddings_tensor is None - - -def create_lora(lora_id: int, model: nn.Module, - sub_modules: List[str]) -> LoRAModel: - loras: Dict[str, LoRALayerWeights] = {} - for name in sub_modules: - w = model.get_submodule(name).weight - loras[name] = LoRALayerWeights( - name, - 8, - 16, - torch.rand([w.shape[1], 8], device="hpu"), - torch.rand([8, w.shape[0]], device="hpu"), - ) - return LoRAModel(lora_id, 8, loras) - - -def create_packed_lora( - lora_id: int, - model: nn.Module, - module_name, - replaced_module_names, - empty_replaced_module_name=None, -) -> LoRAModel: - w = model.get_submodule(module_name).weight - loras: Dict[str, LoRALayerWeights] = {} - for replaced_module_name in replaced_module_names: - if replaced_module_name == empty_replaced_module_name: - continue - loras[replaced_module_name] = LoRALayerWeights( - replaced_module_name, - 8, - 16, - torch.rand([w.shape[1], 8], device="hpu"), - torch.rand([8, w.shape[0] // len(replaced_module_names)], - device="hpu"), - ) - return LoRAModel(lora_id, 8, loras) - - -def test_replace_submodules(dist_init, dummy_model): - model = dummy_model - model.supported_lora_modules = ["dense1", "layer1.dense2"] - model.packed_modules_mapping = {} - manager = LoRAModelManager( - model, 1, 1, 1, - LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8)) - model = manager.model - - assert isinstance(model.get_submodule("dense1"), - ColumnParallelLinearWithLoRA) - assert isinstance(model.get_submodule("layer1.dense1"), - ColumnParallelLinearWithLoRA) - assert isinstance(model.get_submodule("dense2"), RowParallelLinear) - assert isinstance(model.get_submodule("layer1.dense2"), - RowParallelLinearWithLoRA) - - -def test_lora_model_manager(dist_init, dummy_model): - model = dummy_model - model.supported_lora_modules = ["dense1", "dense2", "lm_head"] - model.packed_modules_mapping = {} - 
model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) - model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) - model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) - manager = LoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2)) - assert all(x is None for x in manager.lora_index_to_id) - assert manager.add_adapter(model_lora1) - assert manager.activate_adapter(1) - assert manager.lora_index_to_id[0] == 1 - assert not manager.add_adapter(model_lora1) - assert not manager.activate_adapter(1) - assert manager.add_adapter(model_lora2) - assert manager.activate_adapter(2) - assert manager.lora_index_to_id[0] == 1 - assert manager.lora_index_to_id[1] == 2 - assert not manager.add_adapter(model_lora2) - assert not manager.activate_adapter(2) - assert manager.add_adapter(model_lora3) - assert manager.lora_index_to_id[0] == 1 - assert manager.lora_index_to_id[1] == 2 - with pytest.raises(ValueError): - assert manager.activate_adapter(3) - assert manager.lora_index_to_id[0] == 1 - assert manager.lora_index_to_id[1] == 2 - assert manager.remove_adapter(model_lora2.id) - assert manager.lora_index_to_id[1] is None - assert not manager.remove_adapter(model_lora2.id) - assert manager.remove_adapter(model_lora1.id) - assert not manager.remove_adapter(model_lora1.id) - assert manager.add_adapter(model_lora1) - assert manager.lora_index_to_id[0] is None - assert manager.lora_index_to_id[1] is None - assert manager.add_adapter(model_lora2) - assert manager.activate_adapter(3) - assert manager.lora_index_to_id[0] == 3 - assert manager.lora_index_to_id[1] is None - assert manager.activate_adapter(2) - assert manager.lora_index_to_id[0] == 3 - assert manager.lora_index_to_id[1] == 2 - - -def test_lora_lru_cache_model_manager(dist_init, dummy_model): - model = dummy_model - model.supported_lora_modules = ["dense1", "dense2", "lm_head"] - model.packed_modules_mapping = {} - model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) - model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) - model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) - manager = LRUCacheLoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2)) - assert all(x is None for x in manager.lora_index_to_id) - assert manager.add_adapter(model_lora1) - assert manager.activate_adapter(1) - assert manager.lora_index_to_id[0] == 1 - assert not manager.add_adapter(model_lora1) - assert not manager.activate_adapter(1) - assert manager.add_adapter(model_lora2) - assert manager.activate_adapter(2) - assert manager.lora_index_to_id[0] == 1 - assert manager.lora_index_to_id[1] == 2 - assert not manager.add_adapter(model_lora2) - assert not manager.activate_adapter(2) - assert manager.add_adapter(model_lora3) - assert manager.lora_index_to_id[0] == 1 - assert manager.lora_index_to_id[1] == 2 - assert manager.activate_adapter(3) - assert manager.lora_index_to_id[0] == 3 - assert manager.lora_index_to_id[1] == 2 - assert manager.remove_adapter(model_lora2.id) - assert manager.lora_index_to_id[1] is None - assert not manager.remove_adapter(model_lora2.id) - assert manager.remove_adapter(model_lora1.id) - assert not manager.remove_adapter(model_lora1.id) - assert manager.add_adapter(model_lora1) - assert manager.activate_adapter(1) - assert manager.lora_index_to_id[0] == 3 - assert manager.lora_index_to_id[1] == 1 - assert manager.add_adapter(model_lora2) - assert 
manager.deactivate_adapter(3) - assert manager.lora_index_to_id[0] is None - assert manager.lora_index_to_id[1] == 1 - assert manager.activate_adapter(2) - assert manager.lora_index_to_id[0] == 2 - assert manager.lora_index_to_id[1] == 1 - assert manager.activate_adapter(3) - assert manager.lora_index_to_id[0] == 2 - assert manager.lora_index_to_id[1] == 3 - assert manager.pin_adapter(2) - assert manager.lora_index_to_id[0] == 2 - assert manager.lora_index_to_id[1] == 3 - assert manager.activate_adapter(1) - assert manager.lora_index_to_id[0] == 2 - assert manager.lora_index_to_id[1] == 1 - assert manager.deactivate_adapter(2) - assert manager.lora_index_to_id[0] is None - assert manager.lora_index_to_id[1] == 1 - assert manager.activate_adapter(3) - assert manager.lora_index_to_id[0] == 3 - assert manager.lora_index_to_id[1] == 1 - assert manager.pin_adapter(3) - assert manager.pin_adapter(1) - with pytest.raises(RuntimeError): - assert manager.pin_adapter(2) - assert manager.lora_index_to_id[0] == 3 - assert manager.lora_index_to_id[1] == 1 - with pytest.raises(RuntimeError): - assert manager.activate_adapter(2) - - assert manager.deactivate_adapter(3) - assert manager.pin_adapter(2) - assert manager.lora_index_to_id[0] == 2 - assert manager.lora_index_to_id[1] == 1 - assert manager.remove_adapter(3) - with pytest.raises(ValueError): - assert manager.pin_adapter(3) - - -def test_lru_lora_model_manager(dist_init, dummy_model): - # This tests just the LRU cache functionality, everything else is - # tested in test_lora_model_manager - model = dummy_model - model.supported_lora_modules = ["dense1", "dense2", "lm_head"] - model.packed_modules_mapping = {} - model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) - model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) - model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) - model_lora4 = create_lora(4, model, ["dense1", "dense2", "lm_head"]) - manager = LRUCacheLoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2)) - - assert all(x is None for x in manager.lora_index_to_id) - - # Add up to capacity - assert manager.add_adapter(model_lora1) - assert manager.add_adapter(model_lora2) - assert manager.activate_adapter(1) - assert manager.activate_adapter(2) - - assert set(manager.list_adapters()) == {1, 2} - assert manager.lora_index_to_id[0] == 1 - assert manager.lora_index_to_id[1] == 2 - - # Add over capacity - assert manager.add_adapter(model_lora3) - assert manager.add_adapter(model_lora4) - assert manager.activate_adapter(3) - assert manager.activate_adapter(4) - - assert set(manager.list_adapters()) == {3, 4} - assert manager.lora_index_to_id[0] == 3 - assert manager.lora_index_to_id[1] == 4 - - # Add 3 again to move it to the top and then add 2 - # should return false since it's in already - assert not manager.add_adapter(model_lora3) - assert not manager.activate_adapter(3) - assert manager.add_adapter(model_lora2) - assert manager.activate_adapter(2) - - assert set(manager.list_adapters()) == {3, 2} - assert manager.lora_index_to_id[0] == 3 - assert manager.lora_index_to_id[1] == 2 - - # Remove manually - assert manager.remove_adapter(3) - assert not manager.remove_adapter(3) - - assert set(manager.list_adapters()) == {2} - assert manager.lora_index_to_id[0] is None - assert manager.lora_index_to_id[1] == 2 - - assert manager.add_adapter(model_lora3) - assert manager.activate_adapter(3) - assert manager.add_adapter(model_lora4) - assert 
manager.activate_adapter(4) - - assert set(manager.list_adapters()) == {3, 4} - assert manager.lora_index_to_id[0] == 3 - assert manager.lora_index_to_id[1] == 4 - - assert manager.remove_oldest_adapter() - assert set(manager.list_adapters()) == {4} - assert manager.lora_index_to_id[0] is None - assert manager.lora_index_to_id[1] == 4 - - assert manager.remove_oldest_adapter() - assert set(manager.list_adapters()) == set() - assert all(x is None for x in manager.lora_index_to_id) - - assert not manager.remove_oldest_adapter() - assert set(manager.list_adapters()) == set() - assert all(x is None for x in manager.lora_index_to_id) - - # pinning - assert manager.add_adapter(model_lora3) - assert manager.activate_adapter(3) - assert manager.add_adapter(model_lora4) - assert manager.activate_adapter(4) - assert set(manager.list_adapters()) == {3, 4} - with pytest.raises(ValueError): - assert manager.pin_adapter(1) - assert manager.pin_adapter(3) - # Remove manually - assert manager.remove_adapter(3) - assert not manager.remove_adapter(3) - - assert set(manager.list_adapters()) == {4} - assert manager.lora_index_to_id[0] is None - assert manager.lora_index_to_id[1] == 4 - - assert manager.add_adapter(model_lora1) - assert manager.pin_adapter(1) - assert manager.add_adapter(model_lora2) - assert manager.activate_adapter(2) - - assert set(manager.list_adapters()) == {1, 2} - assert manager.lora_index_to_id[0] == 1 - assert manager.lora_index_to_id[1] == 2 - - assert manager.remove_oldest_adapter() - assert set(manager.list_adapters()) == {1} - assert manager.lora_index_to_id[0] == 1 - assert manager.lora_index_to_id[1] is None - - with pytest.raises(RuntimeError): - assert manager.remove_oldest_adapter() - - assert set(manager.list_adapters()) == {1} - - -def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, - sql_lora_files): - lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) - worker_adapter_manager = LRUCacheWorkerLoRAManager( - 4, 2, llama_2_7b_model_extra_embeddings.model.unpadded_vocab_size - - lora_config.lora_extra_vocab_size, lora_config, torch.device("hpu"), - EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) - worker_adapter_manager.create_lora_manager( - llama_2_7b_model_extra_embeddings.model) - - mapping = LoRAMapping([], []) - worker_adapter_manager.set_active_adapters([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("2", 2, sql_lora_files) - ], mapping) - assert worker_adapter_manager.list_adapters() == {1, 2} - assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 - assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 - - worker_adapter_manager.set_active_adapters([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("3", 3, sql_lora_files), - LoRARequest("4", 4, sql_lora_files) - ], mapping) - assert worker_adapter_manager.list_adapters() == {1, 2, 3, 4} - assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 - assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 - assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 3 - assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4 - - worker_adapter_manager.set_active_adapters([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("2", 2, sql_lora_files), - LoRARequest("5", 5, sql_lora_files) - ], mapping) - assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5} - assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 - assert 
worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 - assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5 - assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4 - - worker_adapter_manager.set_active_adapters([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("1", 1, sql_lora_files), - LoRARequest("1", 1, sql_lora_files) - ], mapping) - assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5} - assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 - assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 - assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5 - assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4 - - worker_adapter_manager.set_active_adapters([ - LoRARequest("6", 6, sql_lora_files), - LoRARequest("7", 7, sql_lora_files), - LoRARequest("8", 8, sql_lora_files) - ], mapping) - assert worker_adapter_manager.list_adapters() == {1, 6, 7, 8} - assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 - assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 7 - assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 8 - assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 6 - - # Over capacity - with pytest.raises(RuntimeError): - worker_adapter_manager.set_active_adapters([ - LoRARequest("10", 10, sql_lora_files), - LoRARequest("11", 11, sql_lora_files), - LoRARequest("12", 12, sql_lora_files), - LoRARequest("13", 13, sql_lora_files), - LoRARequest("14", 14, sql_lora_files) - ], mapping) - - -def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, - sql_lora_files): - # Should remove every LoRA not specified in the request. - lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) - worker_adapter_manager = WorkerLoRAManager( - 4, 2, llama_2_7b_model_extra_embeddings.model.unpadded_vocab_size - - lora_config.lora_extra_vocab_size, lora_config, torch.device("hpu"), - EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) - worker_adapter_manager.create_lora_manager( - llama_2_7b_model_extra_embeddings.model) - - mapping = LoRAMapping([], []) - worker_adapter_manager.set_active_adapters([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("2", 2, sql_lora_files) - ], mapping) - assert worker_adapter_manager.list_adapters() == {1, 2} - assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 - assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 - - worker_adapter_manager.set_active_adapters([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("3", 3, sql_lora_files), - LoRARequest("4", 4, sql_lora_files) - ], mapping) - assert worker_adapter_manager.list_adapters() == {1, 3, 4} - assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 - assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 3 - assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 4 - - worker_adapter_manager.set_active_adapters([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("2", 2, sql_lora_files), - LoRARequest("5", 5, sql_lora_files) - ], mapping) - assert worker_adapter_manager.list_adapters() == {1, 2, 5} - assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 - assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 - assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5 - - worker_adapter_manager.set_active_adapters([ - LoRARequest("1", 1, sql_lora_files), - 
LoRARequest("1", 1, sql_lora_files), - LoRARequest("1", 1, sql_lora_files) - ], mapping) - assert worker_adapter_manager.list_adapters() == {1} - assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 - assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] is None - assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] is None - - worker_adapter_manager.set_active_adapters([ - LoRARequest("6", 6, sql_lora_files), - LoRARequest("7", 7, sql_lora_files), - LoRARequest("8", 8, sql_lora_files) - ], mapping) - assert worker_adapter_manager.list_adapters() == {6, 7, 8} - assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 8 - assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 6 - assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 7 - - # Over capacity - with pytest.raises(RuntimeError): - worker_adapter_manager.set_active_adapters([ - LoRARequest("10", 10, sql_lora_files), - LoRARequest("11", 11, sql_lora_files), - LoRARequest("12", 12, sql_lora_files), - LoRARequest("13", 13, sql_lora_files), - LoRARequest("14", 14, sql_lora_files) - ], mapping) - - -def test_packed_loras(dist_init, dummy_model_gate_up): - model = dummy_model_gate_up - model.supported_lora_modules = ["gate_up_proj"] - model.packed_modules_mapping = { - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - model_lora = create_packed_lora( - 1, - model, - module_name="gate_up_proj", - replaced_module_names=["gate_proj", "up_proj"]) - model_lora1 = create_packed_lora( - 2, - model, - module_name="gate_up_proj", - replaced_module_names=["gate_proj", "up_proj"], - empty_replaced_module_name="gate_proj", - ) - - manager = LoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2)) - model = manager.model - - assert isinstance(model.get_submodule("gate_up_proj"), - MergedColumnParallelLinearWithLoRA) - assert manager.add_adapter(model_lora) - assert manager.add_adapter(model_lora1) - - packed_lora = model_lora.get_lora("gate_up_proj") - assert packed_lora and isinstance(packed_lora, PackedLoRALayerWeights) - - assert torch.allclose(packed_lora.lora_a[0], - model_lora.get_lora("gate_proj").lora_a) - assert torch.allclose(packed_lora.lora_b[0], - model_lora.get_lora("gate_proj").lora_b) - assert torch.allclose(packed_lora.lora_a[1], - model_lora.get_lora("up_proj").lora_a) - assert torch.allclose(packed_lora.lora_b[1], - model_lora.get_lora("up_proj").lora_b) - - packed_lora1 = model_lora1.get_lora("gate_up_proj") - assert packed_lora1 and isinstance(packed_lora1, PackedLoRALayerWeights) - - assert packed_lora1.lora_a[0] is None - assert packed_lora1.lora_b[0] is None - assert torch.allclose(packed_lora1.lora_a[1], - model_lora1.get_lora("up_proj").lora_a) - assert torch.allclose(packed_lora1.lora_b[1], - model_lora1.get_lora("up_proj").lora_b) From d90bbce3d3c40059bfe9b7950c431be57f0ebd80 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 20:07:21 +0300 Subject: [PATCH 288/341] revert FP8 changes --- vllm/model_executor/layers/linear.py | 10 +- .../layers/quantization/__init__.py | 2 - .../model_executor/layers/quantization/inc.py | 119 ------------------ vllm/model_executor/model_loader/utils.py | 2 +- 4 files changed, 3 insertions(+), 130 deletions(-) delete mode 100644 vllm/model_executor/layers/quantization/inc.py diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 44d9a643613bc..568892778abe2 100644 --- 
a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -285,7 +285,6 @@ def __init__(self, quant_config, prefix) self.gather_output = gather_output - self.collective_func = tensor_model_parallel_all_gather # Divide the weight matrix along the last dimension. tp_size = get_tensor_model_parallel_world_size() @@ -368,7 +367,7 @@ def forward(self, input_): output_parallel = self.quant_method.apply(self, input_, bias) if self.gather_output: # All-gather across the partitions. - output = self.collective_func(output_parallel) + output = tensor_model_parallel_all_gather(output_parallel) else: output = output_parallel output_bias = self.bias if self.skip_bias_add else None @@ -974,7 +973,6 @@ def __init__(self, self.input_is_parallel = input_is_parallel self.reduce_results = reduce_results - self.collective_func = tensor_model_parallel_all_reduce # Divide the weight matrix along the last dimension. self.tp_rank = get_tensor_model_parallel_rank() @@ -1053,7 +1051,7 @@ def weight_loader_v2(self, param: BasevLLMParameter, param.load_row_parallel_weight(loaded_weight=loaded_weight) - def resolve_input(self, input_): + def forward(self, input_): if self.input_is_parallel: input_parallel = input_ else: @@ -1061,10 +1059,6 @@ def resolve_input(self, input_): splitted_input = split_tensor_along_last_dim( input_, num_partitions=self.tp_size) input_parallel = splitted_input[tp_rank].contiguous() - return input_parallel - - def forward(self, input_): - input_parallel = self.resolve_input(input_) # Matrix multiply. assert self.quant_method is not None diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 8f57f17470009..3c38f0a006070 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -21,7 +21,6 @@ GPTQMarlinConfig) from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( GPTQMarlin24Config) -from vllm.model_executor.layers.quantization.inc import INCConfig from vllm.model_executor.layers.quantization.marlin import MarlinConfig from vllm.model_executor.layers.quantization.modelopt import ModelOptFp8Config from vllm.model_executor.layers.quantization.neuron_quant import ( @@ -47,7 +46,6 @@ "gptq": GPTQConfig, "compressed-tensors": CompressedTensorsConfig, "bitsandbytes": BitsAndBytesConfig, - "inc": INCConfig, "qqq": QQQConfig, "experts_int8": ExpertsInt8Config, "neuron_quant": NeuronQuantConfig, diff --git a/vllm/model_executor/layers/quantization/inc.py b/vllm/model_executor/layers/quantization/inc.py deleted file mode 100644 index ec0141b61f58f..0000000000000 --- a/vllm/model_executor/layers/quantization/inc.py +++ /dev/null @@ -1,119 +0,0 @@ -from typing import Any, Dict, List, Optional - -import torch -import torch.nn.functional as F -from torch.nn.parameter import Parameter - -from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, UnquantizedFusedMoEMethod) -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) -from vllm.model_executor.utils import set_weight_attrs - -ACTIVATION_SCHEMES = ["static", "dynamic"] - -logger = init_logger(__name__) - - -class INCConfig(QuantizationConfig): - """Config class for FP8.""" - - def __init__( - self, - is_checkpoint_fp8_serialized: bool = False, - activation_scheme: str = "dynamic", - ) -> None: - 
self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized - if is_checkpoint_fp8_serialized: - logger.warning("Detected fp8 checkpoint. Please note that the " - "format is experimental and subject to change.") - if activation_scheme not in ACTIVATION_SCHEMES: - raise ValueError( - f"Unsupported activation scheme {activation_scheme}") - self.activation_scheme = activation_scheme - - @classmethod - def get_name(cls) -> str: - return "inc" - - @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: - return [torch.bfloat16] - - @classmethod - def from_config(cls, config: Dict[str, Any]) -> "INCConfig": - quant_method = cls.get_from_keys(config, ["quant_method"]) - is_checkpoint_fp8_serialized = ("fp8" in quant_method) - activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) - return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized, - activation_scheme=activation_scheme) - - def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["INCLinearMethod"]: - if isinstance(layer, LinearBase): - return INCLinearMethod(self) - elif isinstance(layer, FusedMoE): - return UnquantizedFusedMoEMethod() - return None - - def get_scaled_act_names(self) -> List[str]: - return [] - - @classmethod - def get_min_capability(cls) -> int: - # The AWQ kernel only supports Turing or newer GPUs. - return 75 - - @staticmethod - def get_config_filenames() -> List[str]: - return [] - - -class INCLinearMethod(LinearMethodBase): - """Linear method for FP8. - Supports loading FP8 checkpoints with static weight scale and - dynamic/static activation scale. - Also supports loading quantized FP16/BF16 model checkpoints with dynamic - activation scaling. The weight scaling factor will be initialized after - the model weights are loaded. - Limitations: - 1. Only support per-tensor quantization due to torch._scaled_mm support. - 2. Only support float8_e4m3fn data type due to the limitation of - torch._scaled_mm (https://github.com/pytorch/pytorch/blob/2e48b39603411a41c5025efbe52f89560b827825/aten/src/ATen/native/cuda/Blas.cpp#L854-L856) - - Args: - quant_config: The quantization config. 
- """ - - def __init__(self, - quant_config: INCConfig, - separate_bias_add: bool = False): - self.separate_bias_add = separate_bias_add - self.quant_config = quant_config - - def create_weights(self, layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: List[int], input_size: int, - output_size: int, params_dtype: torch.dtype, - **extra_weight_attrs): - output_size_per_partition = sum(output_partition_sizes) - weight = Parameter(torch.empty(output_size_per_partition, - input_size_per_partition, - dtype=params_dtype), - requires_grad=False) - set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) - layer.register_parameter("weight", weight) - set_weight_attrs(weight, extra_weight_attrs) - - def apply(self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - weight = layer.weight - if self.separate_bias_add: - if bias is not None: - return F.linear(x, weight) + bias - return F.linear(x, weight) - return F.linear(x, weight, bias) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 6cb4e30e92511..2bfe6ea09bd62 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -23,7 +23,7 @@ def get_model_architecture( architectures = getattr(model_config.hf_config, "architectures", []) # Special handling for quantized Mixtral. # FIXME(woosuk): This is a temporary hack. - mixtral_supported = ["fp8", "compressed-tensors", "gptq_marlin", "inc"] + mixtral_supported = ["fp8", "compressed-tensors", "gptq_marlin"] if (model_config.quantization is not None and model_config.quantization not in mixtral_supported From 84dc6c502dd6181300ce6bd3c4635488e6224031 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 20:10:26 +0300 Subject: [PATCH 289/341] remove leftover fp8 code --- vllm/engine/arg_utils.py | 5 ++--- vllm/utils.py | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c9adf85189be7..848c1fece6e9d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -299,12 +299,11 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument( '--kv-cache-dtype', type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3', 'fp8_inc'], + choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], default=EngineArgs.kv_cache_dtype, help='Data type for kv cache storage. If "auto", will use model ' 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3). 
' - 'Intel Gaudi (HPU) supports fp8 (using fp8_inc).') + 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') parser.add_argument( '--quantization-param-path', type=nullable_str, diff --git a/vllm/utils.py b/vllm/utils.py index 525fcc0ea719c..0ec2263514771 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -132,7 +132,6 @@ "fp8": torch.uint8, "fp8_e4m3": torch.uint8, "fp8_e5m2": torch.uint8, - "fp8_inc": torch.float8_e4m3fn, } TORCH_DTYPE_TO_NUMPY_DTYPE = { From f7288de98904db9797e53704ad33eea76b0ebdc0 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 20:14:08 +0300 Subject: [PATCH 290/341] remove weights_load_device stuff --- vllm/engine/arg_utils.py | 13 +------------ vllm/engine/llm_engine.py | 3 +-- vllm/model_executor/model_loader/loader.py | 8 +++----- 3 files changed, 5 insertions(+), 19 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 848c1fece6e9d..e3b9ce25da35a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -91,7 +91,6 @@ class EngineArgs: trust_remote_code: bool = False download_dir: Optional[str] = None load_format: str = 'auto' - weights_load_device: Optional[str] = None config_format: str = 'auto' dtype: str = 'auto' kv_cache_dtype: str = 'auto' @@ -268,12 +267,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'section for more information.\n' '* "bitsandbytes" will load the weights using bitsandbytes ' 'quantization.\n') - parser.add_argument("--weights-load-device", - type=str, - default=EngineArgs.weights_load_device, - choices=DEVICE_OPTIONS, - help=('Device to which model weights ' - 'will be loaded.')) parser.add_argument( '--config-format', default=EngineArgs.config_format, @@ -870,14 +863,10 @@ def create_model_config(self) -> ModelConfig: mm_processor_kwargs=self.mm_processor_kwargs, ) - def create_load_config(self, load_device=None) -> LoadConfig: - if load_device is None: - dummy_device_config = DeviceConfig(device=self.device) - load_device = dummy_device_config.device + def create_load_config(self) -> LoadConfig: return LoadConfig( load_format=self.load_format, download_dir=self.download_dir, - device=load_device, model_loader_extra_config=self.model_loader_extra_config, ignore_patterns=self.ignore_patterns, ) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 89a36adcf5c1a..1fd61825993bb 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -245,7 +245,7 @@ def __init__( "download_dir=%r, load_format=%s, tensor_parallel_size=%d, " "pipeline_parallel_size=%d, " "disable_custom_all_reduce=%s, quantization=%s, " - "weights_load_device=%s, enforce_eager=%s, kv_cache_dtype=%s, " + "enforce_eager=%s, kv_cache_dtype=%s, " "quantization_param_path=%s, device_config=%s, " "decoding_config=%r, observability_config=%r, " "seed=%d, served_model_name=%s, use_v2_block_manager=%s, " @@ -273,7 +273,6 @@ def __init__( parallel_config.pipeline_parallel_size, parallel_config.disable_custom_all_reduce, model_config.quantization, - load_config.device, model_config.enforce_eager, cache_config.cache_dtype, model_config.quantization_param_path, diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 0169490d3408a..8d4163ec88490 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -60,7 +60,7 @@ def device_loading_context(module: torch.nn.Module, # Store original device states and move parameters to GPU if they're on CPU for name, p in 
module.named_parameters(): - if p.device.type == "cpu" and target_device.type != 'hpu': + if p.device.type == "cpu": original_device_states[name] = p.device p.data = p.data.to(target_device) # Parameters already on target device are not touched @@ -394,13 +394,11 @@ def load_model(self, *, model_config: ModelConfig, cache_config: CacheConfig) -> nn.Module: target_device = torch.device(device_config.device) with set_default_torch_dtype(model_config.dtype): - load_device : torch.device = self.load_config.device if \ - self.load_config.device is not None else target_device - with load_device: + with target_device: model = _initialize_model(model_config, self.load_config, lora_config, cache_config, scheduler_config) - logger.info("Loading weights on %s...", target_device) + model.load_weights(self._get_all_weights(model_config, model)) for _, module in model.named_modules(): From 6899c3f1cbc17df234cc224726e0f3667c908d10 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 20:15:33 +0300 Subject: [PATCH 291/341] remove weights_load_device --- vllm/engine/arg_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e3b9ce25da35a..cd86f23a4f471 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1053,9 +1053,7 @@ def create_engine_config(self) -> EngineConfig: self.model_loader_extra_config[ "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path - load_device = device_config.device if self.weights_load_device is \ - None else self.weights_load_device - load_config = self.create_load_config(load_device) + load_config = self.create_load_config() prompt_adapter_config = PromptAdapterConfig( max_prompt_adapters=self.max_prompt_adapters, From e5d640eef048af37adbb32b65b5c7fee8b0609fa Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 20:17:26 +0300 Subject: [PATCH 292/341] fp8 leftovers --- vllm/config.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 786ed1586a3ea..ba85c6910cab5 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -621,13 +621,12 @@ def _verify_args(self) -> None: def _verify_cache_dtype(self) -> None: if self.cache_dtype == "auto": pass - elif self.cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2", "fp8_inc"): + elif self.cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2"): logger.info( "Using fp8 data type to store kv cache. It reduces the GPU " "memory footprint and boosts the performance. " "Meanwhile, it may cause accuracy drop without a proper " - "scaling factor. " - "Intel Gaudi (HPU) supports fp8 (using fp8_inc).") + "scaling factor") else: raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") @@ -746,13 +745,10 @@ class LoadConfig: ignore_patterns: The list of patterns to ignore when loading the model. Default to "original/**/*" to avoid repeated loading of llama's checkpoints. 
- device: Device to which model weights will be loaded, default to - device_config.device """ load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO download_dir: Optional[str] = None - device: Optional[str] = None model_loader_extra_config: Optional[Union[str, dict]] = field( default_factory=dict) ignore_patterns: Optional[Union[List[str], str]] = None From 25388e213c0e29ee3d5633090112c2a2b4c28cb3 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 19:28:28 +0200 Subject: [PATCH 293/341] Update vllm/model_executor/layers/logits_processor.py Co-authored-by: Woosuk Kwon --- vllm/model_executor/layers/logits_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index bee3d38565f4c..4f3544e36a283 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -109,7 +109,7 @@ def _prune_hidden_states( hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> torch.Tensor: - # NOTE(kzawora): This is needed for Gaudi - in some scenarios (warmup, + # NOTE(kzawora): The if guard is needed for Gaudi - in some scenarios (warmup, # profile_run) we might not have selected_token_indices, so we skip pruning. if sampling_metadata.selected_token_indices is not None: return hidden_states.index_select( From b4f7ffa4d5024547f5be282d68ab8a415c99cf0a Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 20:37:25 +0300 Subject: [PATCH 294/341] Rename HabanaAttention -> HPUAttention --- vllm/attention/backends/habana_attn.py | 264 ------------------------ vllm/attention/ops/habana_paged_attn.py | 102 --------- vllm/attention/selector.py | 13 +- 3 files changed, 6 insertions(+), 373 deletions(-) delete mode 100644 vllm/attention/backends/habana_attn.py delete mode 100644 vllm/attention/ops/habana_paged_attn.py diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py deleted file mode 100644 index dad33fefc51f3..0000000000000 --- a/vllm/attention/backends/habana_attn.py +++ /dev/null @@ -1,264 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company -############################################################################### - -import os -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type - -import torch -import vllm_hpu_extension.ops as ops -from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache - -from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionMetadata, AttentionType) -from vllm.attention.backends.utils import CommonAttentionState -from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention, - HabanaPagedAttentionMetadata) -from vllm.logger import init_logger - -logger = init_logger(__name__) - - -class HabanaAttentionBackend(AttentionBackend): - - @staticmethod - def get_impl_cls() -> Type["HabanaAttentionImpl"]: - return HabanaAttentionImpl - - @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: - return HabanaAttentionMetadata - - @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - return HabanaPagedAttention.get_kv_cache_shape(num_blocks, block_size, - num_kv_heads, head_size) - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: Dict[int, int], - ) -> None: - HabanaPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, - src_to_dst) - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: Dict[int, List[int]], - ) -> None: - HabanaPagedAttention.copy_blocks(kv_caches, src_to_dists) - - -@dataclass -class HabanaAttentionMetadata(HabanaPagedAttentionMetadata, AttentionMetadata): - """Metadata for HabanaAttentionbackend.""" - # Currently, input sequences can only contain all prompts - # or all decoding. True if all sequences are prompts. - is_prompt: bool - attn_bias: Optional[torch.Tensor] - seq_lens_tensor: Optional[torch.Tensor] - - -class HabanaAttentionImpl(AttentionImpl, torch.nn.Module): - """ - If the input tensors contain prompt tokens, the layout is as follows: - |<--------------- num_prefill_tokens ----------------->| - |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->| - - Otherwise, the layout is as follows: - |<----------------- num_decode_tokens ------------------>| - |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->| - - Generation tokens can contain padding when cuda-graph is used. - Currently, prompt tokens don't contain any padding. - - The prompts might have different lengths, while the generation tokens - always have length 1. 
- """ - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, - max_seq_len: int = 4096, - ) -> None: - super(AttentionImpl, self).__init__() - self.kv_cache_dtype = kv_cache_dtype - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.matmul_qk = Matmul() - self.softmax = Softmax() - self.matmul_av = Matmul() - self.k_cache = VLLMKVCache() - self.v_cache = VLLMKVCache() - self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads - self.sliding_window = sliding_window - self.alibi_slopes = alibi_slopes - if alibi_slopes is not None: - alibi_slopes_tensor = torch.tensor(alibi_slopes, - dtype=torch.bfloat16) - self.alibi_slopes = alibi_slopes_tensor - assert self.num_heads % self.num_kv_heads == 0 - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - - self.prefill_usefusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', - '0').lower() in ['1', 'true'] - if self.prefill_usefusedsdpa: - assert alibi_slopes is None, \ - 'Prefill with FusedSDPA not supported with alibi slopes!' - - suppored_head_sizes = HabanaPagedAttention.get_supported_head_sizes() - if head_size not in suppored_head_sizes: - raise ValueError( - f"Head size {head_size} is not supported by PagedAttention. " - f"Supported head sizes are: {suppored_head_sizes}.") - - def forward( - self, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: HabanaAttentionMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, - attn_type: AttentionType = AttentionType.DECODER, - ) -> torch.Tensor: - """Forward pass with xFormers and PagedAttention. - - Args: - query: shape = [num_tokens, num_heads * head_size] - key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] - attn_metadata: Metadata for attention. - Returns: - shape = [num_tokens, num_heads * head_size] - """ - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "HabanaAttentionImpl") - batch_size, seq_len, hidden_size = query.shape - _, seq_len_kv, _ = key.shape - - query = query.view(-1, self.num_heads, self.head_size) - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - block_indices = attn_metadata.block_indices - block_offsets = attn_metadata.block_offsets - if attn_metadata.is_prompt: - key = key.unflatten(0, (block_indices.size(0), -1)) - value = value.unflatten(0, (block_indices.size(0), -1)) - if kv_cache is not None: - key_cache, value_cache = HabanaPagedAttention.split_kv_cache( - kv_cache, self.num_kv_heads, self.head_size) - - # Reshape the input keys and values and store them in the cache. - # If kv_cache is not provided, the new key and value tensors are - # not cached. This happens during the initial memory profiling run. - key_cache = self.k_cache(key, key_cache, block_indices, - block_offsets) - value_cache = self.v_cache(value, value_cache, block_indices, - block_offsets) - - if attn_metadata.is_prompt: - # Prompt run. 
- if not self.prefill_usefusedsdpa: - # TODO: move this outside of model - assert attn_metadata.attn_bias is not None, \ - 'attn_bias must be set before calling model.forward!' - attn_bias = attn_metadata.attn_bias - if self.alibi_slopes is not None: - position_bias = _make_alibi_bias(self.alibi_slopes, - self.num_kv_heads, - attn_bias.dtype, - attn_bias.shape[-1]) - attn_bias = attn_bias.tile((1, self.num_kv_heads, 1, 1)) - attn_bias.add_(position_bias) - else: - attn_bias = None - - query_shape = (batch_size, seq_len, self.num_heads, self.head_size) - kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, - self.head_size) - out = ops.prompt_attention( - query.view(query_shape), - key.view(kv_shape), - value.view(kv_shape), - attn_bias=attn_bias, - p=0.0, - scale=self.scale, - matmul_qk_op=self.matmul_qk, - softmax_op=self.softmax, - matmul_av_op=self.matmul_av, - ) - output = out.reshape(batch_size, seq_len, hidden_size) - else: - # Decoding run. - output = HabanaPagedAttention.forward_decode( - query=query, - key_cache=key_cache, - value_cache=value_cache, - block_list=attn_metadata.block_list, - block_mapping=attn_metadata.block_mapping, - block_bias=attn_metadata.attn_bias, - scale=self.scale, - matmul_qk_op=self.matmul_qk, - matmul_av_op=self.matmul_av, - keys_fetch_func=self.k_cache.fetch_from_cache, - values_fetch_func=self.v_cache.fetch_from_cache) - # Reshape the output tensor. - return output.view(batch_size, seq_len, hidden_size) - - -def _make_alibi_bias( - alibi_slopes: torch.Tensor, - num_kv_heads: int, - dtype: torch.dtype, - seq_len: int, -) -> torch.Tensor: - bias = torch.arange(seq_len, dtype=dtype) - # NOTE(zhuohan): HF uses - # `bias = bias[None, :].repeat(seq_len, 1)` - # here. We find that both biases give the same results, but - # the bias below more accurately follows the original ALiBi - # paper. - # Calculate a matrix where each element represents ith element- jth - # element. - bias = bias[None, :] - bias[:, None] - - padded_len = (seq_len + 7) // 8 * 8 - num_heads = alibi_slopes.shape[0] - bias = torch.empty( - 1, # batch size - num_heads, - seq_len, - padded_len, - device=alibi_slopes.device, - dtype=dtype, - )[:, :, :, :seq_len].copy_(bias) - bias.mul_(alibi_slopes[:, None, None]) - if num_heads != num_kv_heads: - bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads)) - return bias diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py deleted file mode 100644 index 7f080e0727457..0000000000000 --- a/vllm/attention/ops/habana_paged_attn.py +++ /dev/null @@ -1,102 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company -############################################################################### - -from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple - -import torch -from vllm_hpu_extension import cache_ops, ops - -# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. 
-_PARTITION_SIZE = 512 - - -@dataclass -class HabanaPagedAttentionMetadata: - """Metadata for PagedAttention.""" - block_list: Optional[torch.Tensor] - block_mapping: Optional[torch.Tensor] - block_usage: Optional[torch.Tensor] - block_indices: Optional[torch.Tensor] - block_offsets: Optional[torch.Tensor] - - -class HabanaPagedAttention: - - @staticmethod - def get_supported_head_sizes() -> List[int]: - return [64, 80, 96, 112, 128, 256] - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - return (num_blocks, block_size, num_kv_heads, head_size) - - @staticmethod - def split_kv_cache( - kv_cache: torch.Tensor, - num_kv_heads: int, - head_size: int, - ) -> Tuple[torch.Tensor, torch.Tensor]: - key_cache = kv_cache[0] - value_cache = kv_cache[1] - return key_cache, value_cache - - @staticmethod - def write_to_paged_cache(key: torch.Tensor, value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - slot_mapping: torch.Tensor, kv_cache_dtype: str, - is_prompt: bool) -> None: - cache_ops.reshape_and_cache(key, value, key_cache, value_cache, - slot_mapping, kv_cache_dtype, is_prompt) - - @staticmethod - def forward_decode(**kwargs) -> torch.Tensor: - return ops.flat_pa(**kwargs) - - @staticmethod - def forward_prefix( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - block_tables: torch.Tensor, - subquery_start_loc: torch.Tensor, - seq_lens_tensor: torch.Tensor, - context_lens: torch.Tensor, - max_query_len: int, - alibi_slopes: Optional[torch.Tensor], - sliding_window: Optional[int], - ) -> torch.Tensor: - raise NotImplementedError( - "forward_prefix is not implemented for HabanaPagedAttention") - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: Dict[int, int], - ) -> None: - src_key_cache = src_kv_cache[0] - dst_key_cache = dst_kv_cache[0] - cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) - - src_value_cache = src_kv_cache[1] - dst_value_cache = dst_kv_cache[1] - cache_ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst) - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: Dict[int, List[int]], - ) -> None: - key_caches = [kv_cache[0] for kv_cache in kv_caches] - value_caches = [kv_cache[1] for kv_cache in kv_caches] - cache_ops.copy_blocks(key_caches, value_caches, src_to_dists) diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index c7a416a78519b..52d3dfa820752 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -22,7 +22,7 @@ class _Backend(enum.Enum): TORCH_SDPA = enum.auto() OPENVINO = enum.auto() FLASHINFER = enum.auto() - HABANA_ATTN = enum.auto() + HPU_ATTN = enum.auto() PALLAS = enum.auto() IPEX = enum.auto() @@ -143,11 +143,10 @@ def get_attn_backend( logger.info("Using Flashinfer backend.") from vllm.attention.backends.flashinfer import FlashInferBackend return FlashInferBackend - elif backend == _Backend.HABANA_ATTN: - logger.info("Using HabanaAttention backend.") - from vllm.attention.backends.habana_attn import ( # noqa: F401 - HabanaAttentionBackend) - return HabanaAttentionBackend + elif backend == _Backend.HPU_ATTN: + logger.info("Using HPUAttention backend.") + from vllm.attention.backends.hpu_attn import HPUAttentionBackend + return HPUAttentionBackend elif backend == _Backend.PALLAS: logger.info("Using Pallas backend.") from 
vllm.attention.backends.pallas import PallasAttentionBackend @@ -217,7 +216,7 @@ def which_attn_to_use( return _Backend.ROCM_FLASH if current_platform.is_hpu(): - return _Backend.HABANA_ATTN + return _Backend.HPU_ATTN # FlashAttn in NVIDIA GPUs. if selected_backend == _Backend.FLASH_ATTN: From 43959db5f463cb6da5ce3a36200354072ca12380 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 20:39:01 +0300 Subject: [PATCH 295/341] oopsie --- vllm/attention/backends/hpu_attn.py | 264 +++++++++++++++++++++++++++ vllm/attention/ops/hpu_paged_attn.py | 102 +++++++++++ 2 files changed, 366 insertions(+) create mode 100644 vllm/attention/backends/hpu_attn.py create mode 100644 vllm/attention/ops/hpu_paged_attn.py diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py new file mode 100644 index 0000000000000..c95bf29e0f8be --- /dev/null +++ b/vllm/attention/backends/hpu_attn.py @@ -0,0 +1,264 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +############################################################################### + +import os +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Type + +import torch +import vllm_hpu_extension.ops as ops +from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache + +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata, AttentionType) +from vllm.attention.backends.utils import CommonAttentionState +from vllm.attention.ops.hpu_paged_attn import (HPUPagedAttention, + HPUPagedAttentionMetadata) +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class HPUAttentionBackend(AttentionBackend): + + @staticmethod + def get_impl_cls() -> Type["HPUAttentionImpl"]: + return HPUAttentionImpl + + @staticmethod + def get_metadata_cls() -> Type["AttentionMetadata"]: + return HPUAttentionMetadata + + @staticmethod + def get_state_cls() -> Type["CommonAttentionState"]: + return CommonAttentionState + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return HPUPagedAttention.get_kv_cache_shape(num_blocks, block_size, + num_kv_heads, head_size) + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: Dict[int, int], + ) -> None: + HPUPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, + src_to_dst) + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: Dict[int, List[int]], + ) -> None: + HPUPagedAttention.copy_blocks(kv_caches, src_to_dists) + + +@dataclass +class HPUAttentionMetadata(HPUPagedAttentionMetadata, AttentionMetadata): + """Metadata for HPUAttentionbackend.""" + # Currently, input sequences can only contain all prompts + # or all decoding. True if all sequences are prompts. + is_prompt: bool + attn_bias: Optional[torch.Tensor] + seq_lens_tensor: Optional[torch.Tensor] + + +class HPUAttentionImpl(AttentionImpl, torch.nn.Module): + """ + If the input tensors contain prompt tokens, the layout is as follows: + |<--------------- num_prefill_tokens ----------------->| + |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->| + + Otherwise, the layout is as follows: + |<----------------- num_decode_tokens ------------------>| + |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->| + + Generation tokens can contain padding when cuda-graph is used. 
+ Currently, prompt tokens don't contain any padding. + + The prompts might have different lengths, while the generation tokens + always have length 1. + """ + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[List[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + blocksparse_params: Optional[Dict[str, Any]] = None, + max_seq_len: int = 4096, + ) -> None: + super(AttentionImpl, self).__init__() + self.kv_cache_dtype = kv_cache_dtype + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.matmul_qk = Matmul() + self.softmax = Softmax() + self.matmul_av = Matmul() + self.k_cache = VLLMKVCache() + self.v_cache = VLLMKVCache() + self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + self.sliding_window = sliding_window + self.alibi_slopes = alibi_slopes + if alibi_slopes is not None: + alibi_slopes_tensor = torch.tensor(alibi_slopes, + dtype=torch.bfloat16) + self.alibi_slopes = alibi_slopes_tensor + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + + self.prefill_usefusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', + '0').lower() in ['1', 'true'] + if self.prefill_usefusedsdpa: + assert alibi_slopes is None, \ + 'Prefill with FusedSDPA not supported with alibi slopes!' + + suppored_head_sizes = HPUPagedAttention.get_supported_head_sizes() + if head_size not in suppored_head_sizes: + raise ValueError( + f"Head size {head_size} is not supported by PagedAttention. " + f"Supported head sizes are: {suppored_head_sizes}.") + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: HPUAttentionMetadata, + k_scale: float = 1.0, + v_scale: float = 1.0, + attn_type: AttentionType = AttentionType.DECODER, + ) -> torch.Tensor: + """Forward pass with xFormers and PagedAttention. + + Args: + query: shape = [num_tokens, num_heads * head_size] + key: shape = [num_tokens, num_kv_heads * head_size] + value: shape = [num_tokens, num_kv_heads * head_size] + kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + attn_metadata: Metadata for attention. + Returns: + shape = [num_tokens, num_heads * head_size] + """ + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "HPUAttentionImpl") + batch_size, seq_len, hidden_size = query.shape + _, seq_len_kv, _ = key.shape + + query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + block_indices = attn_metadata.block_indices + block_offsets = attn_metadata.block_offsets + if attn_metadata.is_prompt: + key = key.unflatten(0, (block_indices.size(0), -1)) + value = value.unflatten(0, (block_indices.size(0), -1)) + if kv_cache is not None: + key_cache, value_cache = HPUPagedAttention.split_kv_cache( + kv_cache, self.num_kv_heads, self.head_size) + + # Reshape the input keys and values and store them in the cache. + # If kv_cache is not provided, the new key and value tensors are + # not cached. This happens during the initial memory profiling run. + key_cache = self.k_cache(key, key_cache, block_indices, + block_offsets) + value_cache = self.v_cache(value, value_cache, block_indices, + block_offsets) + + if attn_metadata.is_prompt: + # Prompt run. 
+ if not self.prefill_usefusedsdpa: + # TODO: move this outside of model + assert attn_metadata.attn_bias is not None, \ + 'attn_bias must be set before calling model.forward!' + attn_bias = attn_metadata.attn_bias + if self.alibi_slopes is not None: + position_bias = _make_alibi_bias(self.alibi_slopes, + self.num_kv_heads, + attn_bias.dtype, + attn_bias.shape[-1]) + attn_bias = attn_bias.tile((1, self.num_kv_heads, 1, 1)) + attn_bias.add_(position_bias) + else: + attn_bias = None + + query_shape = (batch_size, seq_len, self.num_heads, self.head_size) + kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, + self.head_size) + out = ops.prompt_attention( + query.view(query_shape), + key.view(kv_shape), + value.view(kv_shape), + attn_bias=attn_bias, + p=0.0, + scale=self.scale, + matmul_qk_op=self.matmul_qk, + softmax_op=self.softmax, + matmul_av_op=self.matmul_av, + ) + output = out.reshape(batch_size, seq_len, hidden_size) + else: + # Decoding run. + output = HPUPagedAttention.forward_decode( + query=query, + key_cache=key_cache, + value_cache=value_cache, + block_list=attn_metadata.block_list, + block_mapping=attn_metadata.block_mapping, + block_bias=attn_metadata.attn_bias, + scale=self.scale, + matmul_qk_op=self.matmul_qk, + matmul_av_op=self.matmul_av, + keys_fetch_func=self.k_cache.fetch_from_cache, + values_fetch_func=self.v_cache.fetch_from_cache) + # Reshape the output tensor. + return output.view(batch_size, seq_len, hidden_size) + + +def _make_alibi_bias( + alibi_slopes: torch.Tensor, + num_kv_heads: int, + dtype: torch.dtype, + seq_len: int, +) -> torch.Tensor: + bias = torch.arange(seq_len, dtype=dtype) + # NOTE(zhuohan): HF uses + # `bias = bias[None, :].repeat(seq_len, 1)` + # here. We find that both biases give the same results, but + # the bias below more accurately follows the original ALiBi + # paper. + # Calculate a matrix where each element represents ith element- jth + # element. + bias = bias[None, :] - bias[:, None] + + padded_len = (seq_len + 7) // 8 * 8 + num_heads = alibi_slopes.shape[0] + bias = torch.empty( + 1, # batch size + num_heads, + seq_len, + padded_len, + device=alibi_slopes.device, + dtype=dtype, + )[:, :, :, :seq_len].copy_(bias) + bias.mul_(alibi_slopes[:, None, None]) + if num_heads != num_kv_heads: + bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads)) + return bias diff --git a/vllm/attention/ops/hpu_paged_attn.py b/vllm/attention/ops/hpu_paged_attn.py new file mode 100644 index 0000000000000..7fbe26d83f320 --- /dev/null +++ b/vllm/attention/ops/hpu_paged_attn.py @@ -0,0 +1,102 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +############################################################################### + +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch +from vllm_hpu_extension import cache_ops, ops + +# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. 
+_PARTITION_SIZE = 512 + + +@dataclass +class HPUPagedAttentionMetadata: + """Metadata for PagedAttention.""" + block_list: Optional[torch.Tensor] + block_mapping: Optional[torch.Tensor] + block_usage: Optional[torch.Tensor] + block_indices: Optional[torch.Tensor] + block_offsets: Optional[torch.Tensor] + + +class HPUPagedAttention: + + @staticmethod + def get_supported_head_sizes() -> List[int]: + return [64, 80, 96, 112, 128, 256] + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return (num_blocks, block_size, num_kv_heads, head_size) + + @staticmethod + def split_kv_cache( + kv_cache: torch.Tensor, + num_kv_heads: int, + head_size: int, + ) -> Tuple[torch.Tensor, torch.Tensor]: + key_cache = kv_cache[0] + value_cache = kv_cache[1] + return key_cache, value_cache + + @staticmethod + def write_to_paged_cache(key: torch.Tensor, value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, kv_cache_dtype: str, + is_prompt: bool) -> None: + cache_ops.reshape_and_cache(key, value, key_cache, value_cache, + slot_mapping, kv_cache_dtype, is_prompt) + + @staticmethod + def forward_decode(**kwargs) -> torch.Tensor: + return ops.flat_pa(**kwargs) + + @staticmethod + def forward_prefix( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_tables: torch.Tensor, + subquery_start_loc: torch.Tensor, + seq_lens_tensor: torch.Tensor, + context_lens: torch.Tensor, + max_query_len: int, + alibi_slopes: Optional[torch.Tensor], + sliding_window: Optional[int], + ) -> torch.Tensor: + raise NotImplementedError( + "forward_prefix is not implemented for HPUPagedAttention") + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: Dict[int, int], + ) -> None: + src_key_cache = src_kv_cache[0] + dst_key_cache = dst_kv_cache[0] + cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) + + src_value_cache = src_kv_cache[1] + dst_value_cache = dst_kv_cache[1] + cache_ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst) + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: Dict[int, List[int]], + ) -> None: + key_caches = [kv_cache[0] for kv_cache in kv_caches] + value_caches = [kv_cache[1] for kv_cache in kv_caches] + cache_ops.copy_blocks(key_caches, value_caches, src_to_dists) From b8404ad8a20c4c58e7b6562462d7bb56fcdf7ffc Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 20:40:11 +0300 Subject: [PATCH 296/341] format.sh --- vllm/attention/backends/hpu_attn.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index c95bf29e0f8be..17201fe6e1cd6 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -14,7 +14,7 @@ AttentionMetadata, AttentionType) from vllm.attention.backends.utils import CommonAttentionState from vllm.attention.ops.hpu_paged_attn import (HPUPagedAttention, - HPUPagedAttentionMetadata) + HPUPagedAttentionMetadata) from vllm.logger import init_logger logger = init_logger(__name__) @@ -42,7 +42,7 @@ def get_kv_cache_shape( head_size: int, ) -> Tuple[int, ...]: return HPUPagedAttention.get_kv_cache_shape(num_blocks, block_size, - num_kv_heads, head_size) + num_kv_heads, head_size) @staticmethod def swap_blocks( @@ -50,8 +50,7 @@ def 
swap_blocks( dst_kv_cache: torch.Tensor, src_to_dst: Dict[int, int], ) -> None: - HPUPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, - src_to_dst) + HPUPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) @staticmethod def copy_blocks( From d38564f05226827cbd7f319ea1344994da030248 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 20:45:37 +0300 Subject: [PATCH 297/341] fix comment length --- vllm/model_executor/layers/logits_processor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 4f3544e36a283..4a8e8d9179981 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -109,8 +109,9 @@ def _prune_hidden_states( hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> torch.Tensor: - # NOTE(kzawora): The if guard is needed for Gaudi - in some scenarios (warmup, - # profile_run) we might not have selected_token_indices, so we skip pruning. + # NOTE(kzawora): The if guard is needed for Gaudi - in some scenarios + # (warmup, profile_run) we might not have selected_token_indices, + # so we skip pruning. if sampling_metadata.selected_token_indices is not None: return hidden_states.index_select( 0, sampling_metadata.selected_token_indices) From 33c1db02941c336d3ba25f2579a2f9a3c2a2384f Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 20:50:27 +0300 Subject: [PATCH 298/341] fix comment --- vllm/model_executor/layers/logits_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 4a8e8d9179981..bdfef67963e93 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -109,7 +109,7 @@ def _prune_hidden_states( hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> torch.Tensor: - # NOTE(kzawora): The if guard is needed for Gaudi - in some scenarios + # NOTE(kzawora): The if guard is needed for Gaudi - in some scenarios # (warmup, profile_run) we might not have selected_token_indices, # so we skip pruning. if sampling_metadata.selected_token_indices is not None: From 05777e02cbfe6651c53f0b55700375549e87ef82 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 21:02:22 +0300 Subject: [PATCH 299/341] Lazily import HPU-dependent components --- vllm/executor/hpu_executor.py | 4 +--- vllm/model_executor/layers/rotary_embedding.py | 7 +++---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/vllm/executor/hpu_executor.py b/vllm/executor/hpu_executor.py index cc5609ebe5c8e..34879bc4e7ef5 100644 --- a/vllm/executor/hpu_executor.py +++ b/vllm/executor/hpu_executor.py @@ -6,8 +6,6 @@ import os from typing import Any, Dict, List, Optional, Set, Tuple -from vllm_hpu_extension.profiler import HabanaMemoryProfiler - from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -86,7 +84,7 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: # remains to abstract away the device for non-GPU configurations. 
logger.info("# HPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, num_cpu_blocks) - + from vllm_hpu_extension.profiler import HabanaMemoryProfiler with HabanaMemoryProfiler() as cache_init_m: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) msg = f"init_cache_engine took {cache_init_m.get_summary_string()}" diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 30bcf954c99b5..85cd700c978ea 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -30,10 +30,6 @@ from vllm.model_executor.custom_op import CustomOp from vllm.platforms import current_platform -if current_platform.is_hpu(): - from vllm_hpu_extension.rotary_embed import (HpuLlama3RotaryEmbedding, - HpuRotaryEmbedding) - def _rotate_neox(x: torch.Tensor) -> torch.Tensor: x1 = x[..., :x.shape[-1] // 2] @@ -923,6 +919,7 @@ def get_rope( if rope_scaling is None: if current_platform.is_hpu(): + from vllm_hpu_extension.rotary_embed import HpuRotaryEmbedding rotary_emb = HpuRotaryEmbedding(head_size, rotary_dim, max_position, @@ -945,6 +942,8 @@ def get_rope( original_max_position = rope_scaling[ "original_max_position_embeddings"] if current_platform.is_hpu(): + from vllm_hpu_extension.rotary_embed import ( + HpuLlama3RotaryEmbedding) rotary_emb = HpuLlama3RotaryEmbedding( head_size, rotary_dim, From 1f6de5df8ed22a5ffaf7558e83d8b04a86728f27 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 7 Oct 2024 09:29:21 +0200 Subject: [PATCH 300/341] Lazily import HPU-dependent components (#363) --- vllm/executor/hpu_executor.py | 4 +--- vllm/model_executor/layers/rotary_embedding.py | 7 +++---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/vllm/executor/hpu_executor.py b/vllm/executor/hpu_executor.py index cc5609ebe5c8e..34879bc4e7ef5 100644 --- a/vllm/executor/hpu_executor.py +++ b/vllm/executor/hpu_executor.py @@ -6,8 +6,6 @@ import os from typing import Any, Dict, List, Optional, Set, Tuple -from vllm_hpu_extension.profiler import HabanaMemoryProfiler - from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -86,7 +84,7 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: # remains to abstract away the device for non-GPU configurations. 
logger.info("# HPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, num_cpu_blocks) - + from vllm_hpu_extension.profiler import HabanaMemoryProfiler with HabanaMemoryProfiler() as cache_init_m: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) msg = f"init_cache_engine took {cache_init_m.get_summary_string()}" diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 30bcf954c99b5..85cd700c978ea 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -30,10 +30,6 @@ from vllm.model_executor.custom_op import CustomOp from vllm.platforms import current_platform -if current_platform.is_hpu(): - from vllm_hpu_extension.rotary_embed import (HpuLlama3RotaryEmbedding, - HpuRotaryEmbedding) - def _rotate_neox(x: torch.Tensor) -> torch.Tensor: x1 = x[..., :x.shape[-1] // 2] @@ -923,6 +919,7 @@ def get_rope( if rope_scaling is None: if current_platform.is_hpu(): + from vllm_hpu_extension.rotary_embed import HpuRotaryEmbedding rotary_emb = HpuRotaryEmbedding(head_size, rotary_dim, max_position, @@ -945,6 +942,8 @@ def get_rope( original_max_position = rope_scaling[ "original_max_position_embeddings"] if current_platform.is_hpu(): + from vllm_hpu_extension.rotary_embed import ( + HpuLlama3RotaryEmbedding) rotary_emb = HpuLlama3RotaryEmbedding( head_size, rotary_dim, From ad08dd4e6616206398907c14dee589ffa7081df4 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 7 Oct 2024 09:29:37 +0200 Subject: [PATCH 301/341] [Refactor] Rename HabanaAttention -> HPUAttention (#362) I've missed the attention backend in https://github.com/HabanaAI/vllm-fork/pull/359 --- .../backends/{habana_attn.py => hpu_attn.py} | 37 +++++++++---------- ...habana_paged_attn.py => hpu_paged_attn.py} | 6 +-- vllm/attention/selector.py | 13 +++---- 3 files changed, 27 insertions(+), 29 deletions(-) rename vllm/attention/backends/{habana_attn.py => hpu_attn.py} (88%) rename vllm/attention/ops/{habana_paged_attn.py => hpu_paged_attn.py} (95%) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/hpu_attn.py similarity index 88% rename from vllm/attention/backends/habana_attn.py rename to vllm/attention/backends/hpu_attn.py index dad33fefc51f3..17201fe6e1cd6 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -13,22 +13,22 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import CommonAttentionState -from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention, - HabanaPagedAttentionMetadata) +from vllm.attention.ops.hpu_paged_attn import (HPUPagedAttention, + HPUPagedAttentionMetadata) from vllm.logger import init_logger logger = init_logger(__name__) -class HabanaAttentionBackend(AttentionBackend): +class HPUAttentionBackend(AttentionBackend): @staticmethod - def get_impl_cls() -> Type["HabanaAttentionImpl"]: - return HabanaAttentionImpl + def get_impl_cls() -> Type["HPUAttentionImpl"]: + return HPUAttentionImpl @staticmethod def get_metadata_cls() -> Type["AttentionMetadata"]: - return HabanaAttentionMetadata + return HPUAttentionMetadata @staticmethod def get_state_cls() -> Type["CommonAttentionState"]: @@ -41,8 +41,8 @@ def get_kv_cache_shape( num_kv_heads: int, head_size: int, ) -> Tuple[int, ...]: - return HabanaPagedAttention.get_kv_cache_shape(num_blocks, block_size, - num_kv_heads, head_size) + return 
HPUPagedAttention.get_kv_cache_shape(num_blocks, block_size, + num_kv_heads, head_size) @staticmethod def swap_blocks( @@ -50,20 +50,19 @@ def swap_blocks( dst_kv_cache: torch.Tensor, src_to_dst: Dict[int, int], ) -> None: - HabanaPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, - src_to_dst) + HPUPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) @staticmethod def copy_blocks( kv_caches: List[torch.Tensor], src_to_dists: Dict[int, List[int]], ) -> None: - HabanaPagedAttention.copy_blocks(kv_caches, src_to_dists) + HPUPagedAttention.copy_blocks(kv_caches, src_to_dists) @dataclass -class HabanaAttentionMetadata(HabanaPagedAttentionMetadata, AttentionMetadata): - """Metadata for HabanaAttentionbackend.""" +class HPUAttentionMetadata(HPUPagedAttentionMetadata, AttentionMetadata): + """Metadata for HPUAttentionbackend.""" # Currently, input sequences can only contain all prompts # or all decoding. True if all sequences are prompts. is_prompt: bool @@ -71,7 +70,7 @@ class HabanaAttentionMetadata(HabanaPagedAttentionMetadata, AttentionMetadata): seq_lens_tensor: Optional[torch.Tensor] -class HabanaAttentionImpl(AttentionImpl, torch.nn.Module): +class HPUAttentionImpl(AttentionImpl, torch.nn.Module): """ If the input tensors contain prompt tokens, the layout is as follows: |<--------------- num_prefill_tokens ----------------->| @@ -126,7 +125,7 @@ def __init__( assert alibi_slopes is None, \ 'Prefill with FusedSDPA not supported with alibi slopes!' - suppored_head_sizes = HabanaPagedAttention.get_supported_head_sizes() + suppored_head_sizes = HPUPagedAttention.get_supported_head_sizes() if head_size not in suppored_head_sizes: raise ValueError( f"Head size {head_size} is not supported by PagedAttention. " @@ -138,7 +137,7 @@ def forward( key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, - attn_metadata: HabanaAttentionMetadata, + attn_metadata: HPUAttentionMetadata, k_scale: float = 1.0, v_scale: float = 1.0, attn_type: AttentionType = AttentionType.DECODER, @@ -158,7 +157,7 @@ def forward( raise NotImplementedError("Encoder self-attention and " "encoder/decoder cross-attention " "are not implemented for " - "HabanaAttentionImpl") + "HPUAttentionImpl") batch_size, seq_len, hidden_size = query.shape _, seq_len_kv, _ = key.shape @@ -171,7 +170,7 @@ def forward( key = key.unflatten(0, (block_indices.size(0), -1)) value = value.unflatten(0, (block_indices.size(0), -1)) if kv_cache is not None: - key_cache, value_cache = HabanaPagedAttention.split_kv_cache( + key_cache, value_cache = HPUPagedAttention.split_kv_cache( kv_cache, self.num_kv_heads, self.head_size) # Reshape the input keys and values and store them in the cache. @@ -216,7 +215,7 @@ def forward( output = out.reshape(batch_size, seq_len, hidden_size) else: # Decoding run. 
- output = HabanaPagedAttention.forward_decode( + output = HPUPagedAttention.forward_decode( query=query, key_cache=key_cache, value_cache=value_cache, diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/hpu_paged_attn.py similarity index 95% rename from vllm/attention/ops/habana_paged_attn.py rename to vllm/attention/ops/hpu_paged_attn.py index 7f080e0727457..7fbe26d83f320 100644 --- a/vllm/attention/ops/habana_paged_attn.py +++ b/vllm/attention/ops/hpu_paged_attn.py @@ -13,7 +13,7 @@ @dataclass -class HabanaPagedAttentionMetadata: +class HPUPagedAttentionMetadata: """Metadata for PagedAttention.""" block_list: Optional[torch.Tensor] block_mapping: Optional[torch.Tensor] @@ -22,7 +22,7 @@ class HabanaPagedAttentionMetadata: block_offsets: Optional[torch.Tensor] -class HabanaPagedAttention: +class HPUPagedAttention: @staticmethod def get_supported_head_sizes() -> List[int]: @@ -76,7 +76,7 @@ def forward_prefix( sliding_window: Optional[int], ) -> torch.Tensor: raise NotImplementedError( - "forward_prefix is not implemented for HabanaPagedAttention") + "forward_prefix is not implemented for HPUPagedAttention") @staticmethod def swap_blocks( diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index c7a416a78519b..52d3dfa820752 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -22,7 +22,7 @@ class _Backend(enum.Enum): TORCH_SDPA = enum.auto() OPENVINO = enum.auto() FLASHINFER = enum.auto() - HABANA_ATTN = enum.auto() + HPU_ATTN = enum.auto() PALLAS = enum.auto() IPEX = enum.auto() @@ -143,11 +143,10 @@ def get_attn_backend( logger.info("Using Flashinfer backend.") from vllm.attention.backends.flashinfer import FlashInferBackend return FlashInferBackend - elif backend == _Backend.HABANA_ATTN: - logger.info("Using HabanaAttention backend.") - from vllm.attention.backends.habana_attn import ( # noqa: F401 - HabanaAttentionBackend) - return HabanaAttentionBackend + elif backend == _Backend.HPU_ATTN: + logger.info("Using HPUAttention backend.") + from vllm.attention.backends.hpu_attn import HPUAttentionBackend + return HPUAttentionBackend elif backend == _Backend.PALLAS: logger.info("Using Pallas backend.") from vllm.attention.backends.pallas import PallasAttentionBackend @@ -217,7 +216,7 @@ def which_attn_to_use( return _Backend.ROCM_FLASH if current_platform.is_hpu(): - return _Backend.HABANA_ATTN + return _Backend.HPU_ATTN # FlashAttn in NVIDIA GPUs. if selected_backend == _Backend.FLASH_ATTN: From e00750e2b24b433d157b514725a69ed4e0e58f70 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 7 Oct 2024 09:30:12 +0200 Subject: [PATCH 302/341] Use BF16 on HPU by default (#361) We don't *officially* support FP16, and for the most part, we use BF16 wherever we can. This removes the need of specifying `--dtype bfloat16` - when `dtype` is not provided (is `auto`), and model default data type is `float16`, we cast it to `bfloat16` for HPU. --- vllm/config.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/config.py b/vllm/config.py index 786ed1586a3ea..b3329f1c449ff 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1635,6 +1635,13 @@ def _get_and_verify_dtype( torch_dtype = torch.float16 else: torch_dtype = config_dtype + + if current_platform.is_hpu() and config_dtype == torch.float16: + logger.info( + "For HPU, we cast models to bfloat16 instead of" + "using float16 by default. 
Please specify `dtype` if you " + "want to use float16.") + torch_dtype = torch.bfloat16 else: if dtype not in _STR_DTYPE_TO_TORCH_DTYPE: raise ValueError(f"Unknown dtype: {dtype}") From db5aed61529d04b0604a07728a3ce9eb95a2072d Mon Sep 17 00:00:00 2001 From: Michal Adamczyk Date: Mon, 7 Oct 2024 11:58:01 +0200 Subject: [PATCH 303/341] Set vllm-hpu-extension to 36c7f9c (#365) This includes: https://github.com/HabanaAI/vllm-hpu-extension/pull/8 (BlockSoftmax: fix guard value for fp16) --- requirements-hpu.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 602a5060c29aa..8f7f0339b02e3 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -8,4 +8,4 @@ pandas tabulate setuptools>=61 setuptools-scm>=8 -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@bb56d3b +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@36c7f9c From bb4c23e284f73ed00748a43819432574a96384d8 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 7 Oct 2024 16:54:47 +0300 Subject: [PATCH 304/341] format.sh --- vllm/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/utils.py b/vllm/utils.py index 0bfc9b0adca3c..bf8923e532334 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1424,6 +1424,7 @@ def import_wrapper(name, *args, **kwargs): import habana_frameworks.torch as htorch htorch.utils.internal.is_lazy.return_value = False + def get_beam_search_score( tokens: List[int], cumulative_logprob: float, From 563184ab2fc2ab34b5d4ea7b67d146d64538f5c4 Mon Sep 17 00:00:00 2001 From: Yan Tomsinsky <73292515+Yantom1@users.noreply.github.com> Date: Mon, 7 Oct 2024 17:03:36 +0300 Subject: [PATCH 305/341] Fix hpu_set_env call in load_model in vllm (#364) FILL IN THE PR DESCRIPTION HERE FIX #xxxx (*link existing issues this PR will resolve*) **BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE** ---
--- vllm/model_executor/layers/quantization/utils/w8a8_utils.py | 2 +- vllm/model_executor/model_loader/loader.py | 4 ++-- vllm/worker/hpu_model_runner.py | 4 +++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index bf1aa6fbd5dca..0abb4e0f10546 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -137,7 +137,7 @@ def apply_fp8_linear( qinput, x_scale = ops.scaled_fp8_quant( input, input_scale, - num_token_padding=17, + batch_dim_padding=17, use_per_token_if_dynamic=use_per_token_if_dynamic) per_tensor_weights = (weight_scale.numel() == 1) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 876da67c02436..618800dee5fbe 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -98,8 +98,8 @@ def _get_quantization_config( """Get the quantization config.""" if model_config.quantization is not None: quant_config = get_quant_config(model_config, load_config) - capability_tuple = current_platform.get_device_capability() - + capability_tuple = current_platform.get_device_capability() \ + if current_platform.is_cuda_alike() else None if capability_tuple is not None: capability = capability_tuple.to_int() if capability < quant_config.get_min_capability(): diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index b1b62e6bde7f6..d3fa9c287234c 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -591,7 +591,9 @@ def _set_gc_threshold(self) -> None: def load_model(self) -> None: import habana_frameworks.torch.core as htcore - htcore.hpu_set_env() + if self.model_config.quantization == 'inc' or \ + self.model_config.quantization == 'fp8': + htcore.hpu_set_env() with HabanaMemoryProfiler() as m: with HabanaMemoryProfiler() as m_getmodel: self.model = get_model(model_config=self.model_config, From 0e46492dc834b71a82e5bbeb097abbc364717151 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Kuligowski?= Date: Tue, 8 Oct 2024 10:15:08 +0200 Subject: [PATCH 306/341] Update offline_inference_fakehpu.py Beam search was removed from SamplingParams. In this example it was set to False, with this commit I removed it --- examples/offline_inference_fakehpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/offline_inference_fakehpu.py b/examples/offline_inference_fakehpu.py index 972d84b60b318..248b5740fa35e 100644 --- a/examples/offline_inference_fakehpu.py +++ b/examples/offline_inference_fakehpu.py @@ -21,7 +21,7 @@ "Wales" ] # Create a sampling params object. -sampling_params = SamplingParams(temperature=0, n=1, use_beam_search=False) +sampling_params = SamplingParams(temperature=0, n=1) # Create an LLM. llm = LLM(model="facebook/opt-125m", max_model_len=32, max_num_seqs=4) From 6028354b838dd1e0670925bd4b1757e728c7b9b9 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Tue, 8 Oct 2024 12:23:36 +0200 Subject: [PATCH 307/341] Timeout adjusted in MLLMEngine (#368) Currently in Multiprocess LLMEngine there is a polling timeout fixed to 10000 ms . This may not be good when we are running torch compiled models that happen to compile (we did not have particular configuration -- shape -- model warmed up during warmup phase). So torch compilation if happens after warmup then 10000ms is not enough. 
So It would be good to have a way to modify fixed timeout. Changes disscussed here are replacing fixed timeout of 10000 ms with value as provided with VLLM_RPC_TIMEOUT . Please suggest if separate env var should be made. Co-authored-by: Jacek Czaja --- vllm/engine/multiprocessing/engine.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 3501f12c065cf..8446d23604195 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -33,7 +33,6 @@ logger = init_logger(__name__) -POLLING_TIMEOUT_MS = 10000 HEALTHY_RESPONSE = (pickle.dumps(VLLM_RPC_SUCCESS_STR), ) @@ -207,7 +206,7 @@ def run_engine_loop(self): self._alive() if not self.engine.has_unfinished_requests(): # Poll until there is work to do. - while self.input_socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: + while self.input_socket.poll(timeout=VLLM_RPC_TIMEOUT) == 0: self._alive() self.engine.do_log_stats() logger.debug("Waiting for new requests in engine loop.") From 64369fdff907ae2e3b4194a8aa17d71ce943d25c Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 8 Oct 2024 14:34:03 +0200 Subject: [PATCH 308/341] Add Jenkins test definitions (#369) --- .../configs/Meta-Llama-3-70B-Instruct.yaml | 12 ++ .../configs/Meta-Llama-3-8B-Instruct.yaml | 12 ++ .../configs/Meta-Llama-3.1-8B-Instruct.yaml | 15 +++ .../lm-eval-harness/configs/models-large.txt | 1 + .../lm-eval-harness/configs/models-small.txt | 2 + .../run-lm-eval-gsm-vllm-baseline.sh | 51 ++++++++ .jenkins/lm-eval-harness/run-tests.sh | 69 +++++++++++ .../test_lm_eval_correctness.py | 115 ++++++++++++++++++ .jenkins/requirements-test-hpu.txt | 2 + .jenkins/test_config.yaml | 24 ++++ 10 files changed, 303 insertions(+) create mode 100644 .jenkins/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml create mode 100644 .jenkins/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml create mode 100644 .jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct.yaml create mode 100644 .jenkins/lm-eval-harness/configs/models-large.txt create mode 100644 .jenkins/lm-eval-harness/configs/models-small.txt create mode 100644 .jenkins/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh create mode 100644 .jenkins/lm-eval-harness/run-tests.sh create mode 100644 .jenkins/lm-eval-harness/test_lm_eval_correctness.py create mode 100644 .jenkins/requirements-test-hpu.txt create mode 100644 .jenkins/test_config.yaml diff --git a/.jenkins/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml b/.jenkins/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml new file mode 100644 index 0000000000000..38965c6197c55 --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml @@ -0,0 +1,12 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5 +model_name: "/mnt/weka/data/pytorch/llama3/Meta-Llama-3-70B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.892 + - name: "exact_match,flexible-extract" + value: 0.892 +limit: 250 +num_fewshot: 5 +dtype: "bfloat16" diff --git a/.jenkins/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml b/.jenkins/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml new file mode 100644 index 0000000000000..9fe7d634b887b --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml @@ -0,0 +1,12 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m 
meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1 +model_name: "/mnt/weka/data/pytorch/llama3/Meta-Llama-3-8B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.756 + - name: "exact_match,flexible-extract" + value: 0.752 +limit: 250 +num_fewshot: 5 +dtype: "bfloat16" diff --git a/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct.yaml b/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct.yaml new file mode 100644 index 0000000000000..e2458a8ea4f1c --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct.yaml @@ -0,0 +1,15 @@ +# FIXME(kzawora): these scores were generated using vLLM on HPU, we need to confirm them on HF +# VLLM_SKIP_WARMUP=true bash run-lm-eval-gsm-cot-llama-vllm-baseline.sh -m "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct" -b 128 -l 1319 -f 8 -t 1 +model_name: "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct" +tasks: +- name: "gsm8k_cot_llama" + metrics: + - name: "exact_match,strict-match" + value: 0.8317 + - name: "exact_match,flexible-extract" + value: 0.8355 +limit: null +num_fewshot: 8 +dtype: "bfloat16" +fewshot_as_multiturn: true +apply_chat_template: true \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/configs/models-large.txt b/.jenkins/lm-eval-harness/configs/models-large.txt new file mode 100644 index 0000000000000..ca2548d1234a8 --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/models-large.txt @@ -0,0 +1 @@ +Meta-Llama-3-70B-Instruct.yaml \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/configs/models-small.txt b/.jenkins/lm-eval-harness/configs/models-small.txt new file mode 100644 index 0000000000000..d8ae241e58ad3 --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/models-small.txt @@ -0,0 +1,2 @@ +Meta-Llama-3-8B-Instruct.yaml +Meta-Llama-3.1-8B-Instruct.yaml \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.jenkins/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh new file mode 100644 index 0000000000000..65128d6b437e1 --- /dev/null +++ b/.jenkins/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on GSM for vllm. +# We use this for fp8, which HF does not support. +# +# Make sure you have lm-eval-harness installed: +# pip install lm-eval==0.4.3 + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using huggingface transformers." + echo "This pathway is intended to be used to create baselines for " + echo "our automated nm-test-accuracy workflow" + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -b - batch size to run the evaluation at" + echo " -l - limit number of samples to run" + echo " -f - number of fewshot samples to use" + echo " -t - tensor parallel size to run at" + echo +} + +while getopts "m:b:l:f:t:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + f ) + FEWSHOT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? 
) + usage + exit 1 + ;; + esac +done + +lm_eval --model vllm \ + --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096,dtype=bfloat16 \ + --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ + --batch_size $BATCH_SIZE diff --git a/.jenkins/lm-eval-harness/run-tests.sh b/.jenkins/lm-eval-harness/run-tests.sh new file mode 100644 index 0000000000000..8c51606c4a2dd --- /dev/null +++ b/.jenkins/lm-eval-harness/run-tests.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using vllm and compares to " + echo "precomputed baseline (measured by HF transformers.)" + echo + echo "usage: ${0} " + echo + echo " -c - path to the test data config (e.g. configs/small-models.txt)" + echo " -t - tensor parallel size" + echo +} + +SUCCESS=0 + +while getopts "c:t:j:" OPT; do + case ${OPT} in + c ) + CONFIG="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +# Parse list of configs. +IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG + +for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" +do + LOCAL_SUCCESS=0 + + echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE===" + + export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG} + export LM_EVAL_TP_SIZE=$TP_SIZE + export PT_HPU_ENABLE_LAZY_COLLECTIVES=true + export VLLM_SKIP_WARMUP=true + RANDOM_SUFFIX=$(tr -dc A-Za-z0-9 Date: Tue, 8 Oct 2024 17:04:11 +0200 Subject: [PATCH 309/341] Make workaround for SW-204785 broader (#374) PT bridge bug in recent Synapse builds causes PyTest to return 0 unconditionally. Previous workaround fixed that issue if comparison failed, but left out a case in which vLLM (or anything else) actually crashes during the test execution. This patch broadens the workaround to catch any exceptions and add atexit callback when any test fails. --- .../test_lm_eval_correctness.py | 83 ++++++++++--------- 1 file changed, 43 insertions(+), 40 deletions(-) diff --git a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py index be90872d8cf6d..fd4532196e36f 100644 --- a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py +++ b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py @@ -64,7 +64,7 @@ def report_performance(task, input_lens, output_lens, time): context_lens = [i + o for i, o in zip(input_lens, output_lens)] gen_tput = sum(output_lens) / time msg = ( - f'{task} | average generation throughput: {gen_tput:.2f} tokens/s \n' # noqa: G004 + f'{task} | estimated average generation throughput: {gen_tput:.2f} tokens/s \n' # noqa: G004, E501 f'{task} | input_tokens | min: {min(input_lens)} | max: {max(input_lens)} | mean: {statistics.mean(input_lens):.2f} | stddev: {statistics.stdev(input_lens):.2f}\n' # noqa: E501 f'{task} | output_tokens | min: {min(output_lens)} | max: {max(output_lens)} | mean: {statistics.mean(output_lens):.2f} | stddev: {statistics.stdev(output_lens):.2f}\n' # noqa: E501 f'{task} | context_length | min: {min(context_lens)} | max: {max(context_lens)} | mean: {statistics.mean(context_lens):.2f} | stddev: {statistics.stdev(context_lens):.2f}' # noqa: E501 @@ -73,43 +73,46 @@ def report_performance(task, input_lens, output_lens, time): def test_lm_eval_correctness(): - eval_config = yaml.safe_load( - Path(TEST_DATA_FILE).read_text(encoding="utf-8")) - - # Launch eval requests. 
- start_time = time.perf_counter() - results = launch_lm_eval(eval_config) - total_time = time.perf_counter() - start_time - - tokenizer = vllm.transformers_utils.tokenizer.get_tokenizer( - eval_config['model_name']) - - # Confirm scores match ground truth. - for task in eval_config["tasks"]: - - samples = results['samples'][task["name"]] - tokenized_inputs = [ - tokenizer(x['arguments'][0][0])['input_ids'] for x in samples - ] - tokenized_inputs_lens = [len(x) for x in tokenized_inputs] - tokenized_outputs = [ - list( - itertools.chain.from_iterable( - tokenizer(list(itertools.chain.from_iterable( - x['resps'])))['input_ids'])) for x in samples - ] - tokenized_outputs_lens = [len(x) for x in tokenized_outputs] - report_performance(task['name'], tokenized_inputs_lens, - tokenized_outputs_lens, total_time) - - for metric in task["metrics"]: - ground_truth = metric["value"] - measured_value = results["results"][task["name"]][metric["name"]] - print(f'{task["name"]} | {metric["name"]}: ' - f'ground_truth={ground_truth} | measured={measured_value}') - try: + try: + eval_config = yaml.safe_load( + Path(TEST_DATA_FILE).read_text(encoding="utf-8")) + + # Launch eval requests. + start_time = time.perf_counter() + results = launch_lm_eval(eval_config) + total_time = time.perf_counter() - start_time + + tokenizer = vllm.transformers_utils.tokenizer.get_tokenizer( + eval_config['model_name']) + + # Confirm scores match ground truth. + for task in eval_config["tasks"]: + + samples = results['samples'][task["name"]] + tokenized_inputs = [ + tokenizer(x['arguments'][0][0])['input_ids'] for x in samples + ] + tokenized_inputs_lens = [len(x) for x in tokenized_inputs] + tokenized_outputs = [ + list( + itertools.chain.from_iterable( + tokenizer( + list(itertools.chain.from_iterable( + x['resps'])))['input_ids'])) for x in samples + ] + tokenized_outputs_lens = [len(x) for x in tokenized_outputs] + report_performance(task['name'], tokenized_inputs_lens, + tokenized_outputs_lens, total_time) + + for metric in task["metrics"]: + ground_truth = metric["value"] + measured_value = results["results"][task["name"]][ + metric["name"]] + print( + f'{task["name"]} | {metric["name"]}: ' + f'ground_truth={ground_truth} | measured={measured_value}') assert numpy.isclose(ground_truth, measured_value, rtol=RTOL) - except AssertionError as exc: - # nasty workaround for HPU PT bridge bug (SW-204785) - atexit.register(fail_on_exit) - raise exc + except Exception as exc: + # nasty workaround for a nasty HPU PT bridge bug (SW-204785) + atexit.register(fail_on_exit) + raise exc From ca98daec894114a0db87d86d6148b72879893b90 Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Wed, 9 Oct 2024 19:00:56 +0300 Subject: [PATCH 310/341] Fix LoRA tests by handling broken imports --- tests/lora/test_lora_hpu.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/lora/test_lora_hpu.py b/tests/lora/test_lora_hpu.py index a59cfe875ef9c..2f4f7f31c0100 100644 --- a/tests/lora/test_lora_hpu.py +++ b/tests/lora/test_lora_hpu.py @@ -1,8 +1,7 @@ import pytest import torch from vllm_hpu_extension.ops import LoraMask - -from vllm.hpu.punica_hpu import GaudiPunicaWrapper +from vllm_hpu_extension.punica_hpu import GaudiPunicaWrapper from .utils import DummyLoRAManager From b70c1a5e9fbe29b37ae1b2a79ea953ccf613587d Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 10 Oct 2024 15:11:58 +0200 Subject: [PATCH 311/341] [CI] Report test name, add properties to JUnitXML (#377) --- .jenkins/lm-eval-harness/run-tests.sh | 2 +- 
.../test_lm_eval_correctness.py | 77 +++++++++++++++++-- 2 files changed, 72 insertions(+), 7 deletions(-) diff --git a/.jenkins/lm-eval-harness/run-tests.sh b/.jenkins/lm-eval-harness/run-tests.sh index 8c51606c4a2dd..09d507d404ede 100644 --- a/.jenkins/lm-eval-harness/run-tests.sh +++ b/.jenkins/lm-eval-harness/run-tests.sh @@ -48,7 +48,7 @@ do LOG_DIR=$TEST_RESULTS_DIR LOG_FILENAME="$test_${MODEL_CONFIG}_${RANDOM_SUFFIX}.xml" LOG_PATH="${LOG_DIR}/${LOG_FILENAME}" - JUNIT_SUFFIX="--junitxml=${LOG_PATH}" + JUNIT_SUFFIX="-o junit_family=xunit1 --junitxml=${LOG_PATH}" fi pytest -s test_lm_eval_correctness.py $JUNIT_SUFFIX || LOCAL_SUCCESS=$? diff --git a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py index fd4532196e36f..9c6d0ee48caf5 100644 --- a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py +++ b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py @@ -59,24 +59,81 @@ def launch_lm_eval(eval_config): return results -def report_performance(task, input_lens, output_lens, time): +def report_performance(task, input_lens, output_lens, time, record_property): assert len(input_lens) == len(output_lens) context_lens = [i + o for i, o in zip(input_lens, output_lens)] gen_tput = sum(output_lens) / time + all_lens = [input_lens, output_lens, context_lens] + min_input_tokens, min_output_tokens, min_context_tokens = [ + min(x) for x in all_lens + ] + max_input_tokens, max_output_tokens, max_context_tokens = [ + max(x) for x in all_lens + ] + mean_input_tokens, mean_output_tokens, mean_context_tokens = [ + statistics.mean(x) for x in all_lens + ] + stddev_input_tokens, stddev_output_tokens, stddev_context_tokens = [ + statistics.stdev(x) for x in all_lens + ] msg = ( f'{task} | estimated average generation throughput: {gen_tput:.2f} tokens/s \n' # noqa: G004, E501 - f'{task} | input_tokens | min: {min(input_lens)} | max: {max(input_lens)} | mean: {statistics.mean(input_lens):.2f} | stddev: {statistics.stdev(input_lens):.2f}\n' # noqa: E501 - f'{task} | output_tokens | min: {min(output_lens)} | max: {max(output_lens)} | mean: {statistics.mean(output_lens):.2f} | stddev: {statistics.stdev(output_lens):.2f}\n' # noqa: E501 - f'{task} | context_length | min: {min(context_lens)} | max: {max(context_lens)} | mean: {statistics.mean(context_lens):.2f} | stddev: {statistics.stdev(context_lens):.2f}' # noqa: E501 + f'{task} | input_tokens | min: {min_input_tokens} | max: {max_input_tokens} | mean: {mean_input_tokens:.2f} | stddev: {stddev_input_tokens:.2f}\n' # noqa: E501 + f'{task} | output_tokens | min: {min_output_tokens} | max: {max_output_tokens} | mean: {mean_output_tokens:.2f} | stddev: {stddev_output_tokens:.2f}\n' # noqa: E501 + f'{task} | context_length | min: {min_context_tokens} | max: {max_context_tokens} | mean: {mean_context_tokens:.2f} | stddev: {stddev_context_tokens:.2f}' # noqa: E501 ) + + # Log all of these stats to JUnitXML + record_property(f"{task}_gen_tput", gen_tput) + record_property(f"{task}_input_tokens_min", min_input_tokens) + record_property(f"{task}_input_tokens_max", max_input_tokens) + record_property(f"{task}_input_tokens_mean", mean_input_tokens) + record_property(f"{task}_input_tokens_stddev", stddev_input_tokens) + + record_property(f"{task}_output_tokens_min", min_output_tokens) + record_property(f"{task}_output_tokens_max", max_output_tokens) + record_property(f"{task}_output_tokens_mean", mean_output_tokens) + record_property(f"{task}_output_tokens_stddev", stddev_output_tokens) + + 
record_property(f"{task}_context_tokens_min", min_context_tokens) + record_property(f"{task}_context_tokens_max", max_context_tokens) + record_property(f"{task}_context_tokens_mean", mean_context_tokens) + record_property(f"{task}_context_tokens_stddev", stddev_context_tokens) + print(msg) -def test_lm_eval_correctness(): +def get_current_gaudi_platform(): + """ + Inspired by: https://github.com/HabanaAI/Model-References/blob/a87c21f14f13b70ffc77617b9e80d1ec989a3442/PyTorch/computer_vision/classification/torchvision/utils.py#L274 + """ + import habana_frameworks.torch.utils.experimental as htexp + + device_type = htexp._get_device_type() + + if device_type == htexp.synDeviceType.synDeviceGaudi: + return "Gaudi1" + elif device_type == htexp.synDeviceType.synDeviceGaudi2: + return "Gaudi2" + elif device_type == htexp.synDeviceType.synDeviceGaudi3: + return "Gaudi3" + else: + raise ValueError( + f"Unsupported device: the device type is {device_type}.") + + +def test_lm_eval_correctness(record_xml_attribute, record_property): try: eval_config = yaml.safe_load( Path(TEST_DATA_FILE).read_text(encoding="utf-8")) + # Record JUnitXML test name + tasks_str = '_'.join([t['name'] for t in eval_config["tasks"]]) + platform = get_current_gaudi_platform() + testname = (f'test_{Path(TEST_DATA_FILE).stem}_{tasks_str}_{platform}_' + f'tp{TP_SIZE}') + record_xml_attribute("name", testname) + # Launch eval requests. start_time = time.perf_counter() results = launch_lm_eval(eval_config) @@ -102,7 +159,8 @@ def test_lm_eval_correctness(): ] tokenized_outputs_lens = [len(x) for x in tokenized_outputs] report_performance(task['name'], tokenized_inputs_lens, - tokenized_outputs_lens, total_time) + tokenized_outputs_lens, total_time, + record_property) for metric in task["metrics"]: ground_truth = metric["value"] @@ -111,6 +169,13 @@ def test_lm_eval_correctness(): print( f'{task["name"]} | {metric["name"]}: ' f'ground_truth={ground_truth} | measured={measured_value}') + + # Record ground truth and measured value to JUnitXML + record_property( + f"{task['name']}_{metric['name']}_ground_truth", + ground_truth) + record_property(f"{task['name']}_{metric['name']}_measured", + measured_value) assert numpy.isclose(ground_truth, measured_value, rtol=RTOL) except Exception as exc: # nasty workaround for a nasty HPU PT bridge bug (SW-204785) From 49444bce5edacdfa9ba16f721a1bf29afa7d73c1 Mon Sep 17 00:00:00 2001 From: Karol Damaszke Date: Fri, 11 Oct 2024 10:45:53 +0200 Subject: [PATCH 312/341] Disable performance counters if profiler is not enabled (#383) Currently, if `HabanaHighLevelProfiler` is not enabled, `HabanaProfilerCounterHelper` collects the statistics that will not be used later. This creates additional host overhead that can be removed. This change will only allow performance statistics to be collected when the profiler is enabled. 
Potential gain on `prepare_model_input` is illustrated by before/after profiler screenshots in the original PR (not reproduced here). --- vllm/worker/hpu_model_runner.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index d3fa9c287234c..b50e9451ea09c 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -521,7 +521,6 @@ def __init__( self.prompt_adapter_config = prompt_adapter_config self.return_hidden_states = return_hidden_states self.observability_config = observability_config - self.profiler = HabanaHighLevelProfiler() self.sliding_window = (model_config.get_sliding_window() if model_config is not None else None) @@ -557,6 +556,7 @@ def __init__( self.inc_initialized_successfully = False # Profiler stats + self.profiler = HabanaHighLevelProfiler() self.profiler_counter_helper = HabanaProfilerCounterHelper() self.seen_configs: set = set() self._mem_margin: Optional[int] = None @@ -1767,8 +1767,9 @@ def prepare_model_input( """ with self.profiler.record_event('internal', 'prepare_input_tensors'): assert seq_group_metadata_list is not None - self.profiler_counter_helper.capture_seq_group_metadata_stats( - seq_group_metadata_list=seq_group_metadata_list) + if self.profiler.enabled: + self.profiler_counter_helper.capture_seq_group_metadata_stats( + seq_group_metadata_list=seq_group_metadata_list) model_input, sampling_metadata = self.prepare_input_tensors( seq_group_metadata_list) assert model_input.attn_metadata is not None From d6bd37505c82f305124813a4e2be8b0b945833fd Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Fri, 11 Oct 2024 08:16:27 +0300 Subject: [PATCH 313/341] Remove constraints for bucket creation during warmup in LoRA --- vllm/worker/hpu_model_runner.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index d3fa9c287234c..7d6d0cd8af4f7 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -672,9 +672,6 @@ def _is_valid_bucket(self, bucket): def _setup_buckets(self) -> None: align_bs = lambda x: min(self.max_num_seqs, x) max_bucket_cfg = 64 - if self.lora_config and \ - max_bucket_cfg > self.max_num_batched_tokens // self.block_size: - max_bucket_cfg = self.max_num_batched_tokens // self.block_size #FIXME: The default values should be max_model_len max_prompt_seq = 1024 max_decode_seq = 2048 @@ -1480,11 +1477,6 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: self.prompt_buckets, prompt_omitted_buckets = generate_prompt_buckets( self.prompt_bs_bucket_cfg, self.prompt_seq_bucket_cfg, self.max_num_batched_tokens) - if self.lora_config: - self.prompt_buckets[:] = [ - bucket for bucket in self.prompt_buckets - if self._is_valid_bucket(bucket) - ] msg = ( f"Generated {len(self.prompt_buckets)} " @@ -1502,11 +1494,6 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: self.decode_buckets = generate_decode_buckets( self.decode_bs_bucket_cfg, self.decode_block_bucket_cfg, max_blocks) - if self.lora_config: - self.decode_buckets[:] = [ - bucket for bucket in self.decode_buckets - if self._is_valid_bucket(bucket) - ] logger.info("Generated %d decode buckets [bs, total_blocks]: %s", len(self.decode_buckets), list(sorted(self.decode_buckets))) From d8f2aa70198a07836ee91df23aa6a234f700c955 Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Mon, 14 Oct 2024 12:50:58 +0530 Subject: [PATCH 314/341] seed_everything function doesn't handle HPU (#384) This PR adds manual seed
setting for HPU in the function `seed_everything`. Previously the torch.manual_seed was getting set to the given seed, which got removed in the following PR https://github.com/HabanaAI/vllm-fork/commit/6ffa3f314c59e42238f1c5f923ff2839e0af9698 --- vllm/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/utils.py b/vllm/utils.py index 2ff9668d9a463..6b325458e62a9 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -413,6 +413,9 @@ def seed_everything(seed: int) -> None: if is_xpu(): torch.xpu.manual_seed_all(seed) + if current_platform.is_hpu(): + torch.hpu.manual_seed_all(seed) + def random_uuid() -> str: return str(uuid.uuid4().hex) From 03b407bdf2be755b29ccc72dfadb23ebad8b7b4c Mon Sep 17 00:00:00 2001 From: Ruheena Suhani Shaik Date: Mon, 14 Oct 2024 13:02:23 +0530 Subject: [PATCH 315/341] Fixed lora_manager tests with hpu_model_runner (#386) lora_manager tests have been fixed with the recent changes of hpu_model_runner from habana_model_runner --- tests/lora/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 65e38b2e4e6e4..d5ce1906c40c1 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -262,7 +262,7 @@ def get_model_patched(*, model_config, device_config, **kwargs): **kwargs) if current_platform.is_hpu(): - with patch("vllm.worker.habana_model_runner.get_model", + with patch("vllm.worker.hpu_model_runner.get_model", get_model_patched): engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) else: From ebd42c4bc9314c9d3fa240fe19462bb3df4704ce Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 14 Oct 2024 14:27:30 +0200 Subject: [PATCH 316/341] Reformat README_GAUDI.md (#389) This PR removes the awkward line breaks in README_GAUDI.md and uses appropriate markdown formatting instead of RST. Rendered document should look the same. --- README_GAUDI.md | 560 ++++++++++++++---------------------------------- 1 file changed, 161 insertions(+), 399 deletions(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index 483b6e6cda741..08458251a753d 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -1,217 +1,126 @@ -vLLM with Intel® Gaudi® AI Accelerators -======================================= +# vLLM with Intel® Gaudi® AI Accelerators -This README provides instructions on running vLLM with Intel Gaudi -devices. +This README provides instructions on running vLLM with Intel Gaudi devices. -Requirements and Installation -============================= +# Requirements and Installation -Please follow the instructions provided in the [Gaudi Installation -Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) -to set up the environment. To achieve the best performance, please -follow the methods outlined in the [Optimizing Training Platform -Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). +Please follow the instructions provided in the [Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) to set up the environment. To achieve the best performance, please follow the methods outlined in the [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). 
-Requirements ------------- +## Requirements -- OS: Ubuntu 22.04 LTS -- Python: 3.10 -- Intel Gaudi accelerator -- Intel Gaudi software version 1.17.0 +- OS: Ubuntu 22.04 LTS +- Python: 3.10 +- Intel Gaudi accelerator +- Intel Gaudi software version 1.17.0 To verify that the Intel Gaudi software was correctly installed, run: -``` {.console} +```{.console} $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core and habanalabs-thunk are installed $ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed $ pip list | grep neural # verify that neural-compressor is installed ``` -Refer to [Intel Gaudi Software Stack -Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) -for more details. +Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) for more details. -Run Docker Image ----------------- +## Run Docker Image -It is highly recommended to use the latest Docker image from Intel Gaudi -vault. Refer to the [Intel Gaudi -documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) -for more details. +It is highly recommended to use the latest Docker image from Intel Gaudi vault. Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) for more details. Use the following commands to run a Docker image: -``` {.console} +```{.console} $ docker pull vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest ``` -Build and Install vLLM ----------------------- +## Build and Install vLLM -Currently, the latest features and performance optimizations are -developed in Gaudi\'s [vLLM-fork](https://github.com/HabanaAI/vllm-fork) -and we periodically upstream them to vLLM main repo. To install latest -[HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the -following: +Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: -``` {.console} +```{.console} $ git clone https://github.com/HabanaAI/vllm-fork.git $ cd vllm-fork $ git checkout habana_main $ pip install -e . 
``` -Supported Features -================== - -- [Offline batched - inference](https://github.com/HabanaAI/vllm-fork/blob/habana_main/docs/source/getting_started/quickstart.rst#offline-batched-inference) -- Online inference via [OpenAI-Compatible - Server](https://github.com/HabanaAI/vllm-fork/blob/habana_main/docs/source/getting_started/quickstart.rst#openai-compatible-server) -- HPU autodetection - no need to manually select device within vLLM -- Paged KV cache with algorithms enabled for Intel Gaudi accelerators -- Custom Intel Gaudi implementations of Paged Attention, KV cache ops, - prefill attention, Root Mean Square Layer Normalization, Rotary - Positional Encoding -- Tensor parallelism support for multi-card inference -- Inference with [HPU - Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) - for accelerating low-batch latency and throughput -- Attention with Linear Biases (ALiBi) -- INC quantization - -Unsupported Features -==================== - -- Beam search -- LoRA adapters -- AWQ quantization -- Prefill chunking (mixed-batch inferencing) - -Supported Configurations -======================== - -The following configurations have been validated to be function with -Gaudi2 devices. Configurations that are not listed may or may not work. - -- [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b) - with tensor parallelism on 8x HPU, BF16 datatype with random or - greedy sampling -- [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) - with tensor parallelism on 8x HPU, BF16 datatype with random or - greedy sampling -- [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) - with tensor parallelism on 8x HPU, BF16 datatype with random or - greedy sampling -- [meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) - with tensor parallelism on 8x HPU, BF16 datatype with random or - greedy sampling -- [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) - with tensor parallelism on 8x HPU, BF16 datatype with random or - greedy sampling -- [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) - with tensor parallelism on 8x HPU, BF16 datatype with random or - 
greedy sampling -- [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) - on single HPU or with tensor parallelism on 2x HPU, BF16 datatype - with random or greedy sampling -- [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) - with tensor parallelism on 2x HPU, BF16 datatype with random or - greedy sampling - -Performance Tuning -================ - -Execution modes ------------------------------ - -Currently in vLLM for HPU we support four execution modes, depending on -selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment -variable), and `--enforce-eager` flag. - -| `PT_HPU_LAZY_MODE` | `enforce_eager` | execution mode | -|--- |--- |--- | -| 0 | 0 | torch.compile | -| 0 | 1 | PyTorch eager mode | -| 1 | 0 | HPU Graphs | -| 1 | 1 | PyTorch lazy mode | - - -> [!WARNING] -> In 1.17.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly -> experimental and should be only used for validating functional -> correctness. Their performance will be improved in the next releases. -> For obtaining the best performance in 1.17.0, please use HPU Graphs, or -> PyTorch lazy mode. - -Bucketing mechanism ------------------------------ - -Intel Gaudi accelerators work best when operating on models with fixed -tensor shapes. [Intel Gaudi Graph -Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) -is responsible for generating optimized binary code that implements the -given model topology on Gaudi. In its default configuration, the -produced binary code may be heavily dependent on input and output tensor -shapes, and can require graph recompilation when encountering -differently shaped tensors within the same topology. While the resulting -binaries utilize Gaudi efficiently, the compilation itself may introduce -a noticeable overhead in end-to-end execution. In a dynamic inference -serving scenario, there is a need to minimize the number of graph -compilations and reduce the risk of graph compilation occurring during -server runtime. Currently it is achieved by \"bucketing\" model\'s -forward pass across two dimensions - `batch_size` and `sequence_length`. +# Supported Features -> [!NOTE] -> Bucketing allows us to reduce the number of required graphs -> significantly, but it does not handle any graph compilation and device -> code generation - this is done in warmup and HPUGraph capture phase. 
+- [Offline batched inference](https://github.com/HabanaAI/vllm-fork/blob/habana_main/docs/source/getting_started/quickstart.rst#offline-batched-inference) +- Online inference via [OpenAI-Compatible Server](https://github.com/HabanaAI/vllm-fork/blob/habana_main/docs/source/getting_started/quickstart.rst#openai-compatible-server) +- HPU autodetection - no need to manually select device within vLLM +- Paged KV cache with algorithms enabled for Intel Gaudi accelerators +- Custom Intel Gaudi implementations of Paged Attention, KV cache ops, prefill attention, Root Mean Square Layer Normalization, Rotary Positional Encoding +- Tensor parallelism support for multi-card inference +- Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) for accelerating low-batch latency and throughput +- Attention with Linear Biases (ALiBi) +- INC quantization + +# Unsupported Features + +- Beam search +- LoRA adapters +- AWQ quantization +- Prefill chunking (mixed-batch inferencing) + +# Supported Configurations + +The following configurations have been validated to be function with Gaudi2 devices. Configurations that are not listed may or may not work. + +- [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) on single HPU or with tensor 
parallelism on 2x HPU, BF16 datatype with random or greedy sampling +- [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) with tensor parallelism on 2x HPU, BF16 datatype with random or greedy sampling + +# Performance Tuning + +## Execution modes + +Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag. + +| `PT_HPU_LAZY_MODE` | `enforce_eager` | execution mode | +| ------------------ | --------------- | ------------------ | +| 0 | 0 | torch.compile | +| 0 | 1 | PyTorch eager mode | +| 1 | 0 | HPU Graphs | +| 1 | 1 | PyTorch lazy mode | + +> [!WARNING] +> In 1.17.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.17.0, please use HPU Graphs, or PyTorch lazy mode. + +## Bucketing mechanism + +Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - `batch_size` and `sequence_length`. + +> [!NOTE] +> Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. -Bucketing ranges are determined with 3 parameters - `min`, `step` and -`max`. They can be set separately for prompt and decode phase, and for -batch size and sequence length dimension. These parameters can be -observed in logs during vLLM startup: +Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. 
These parameters can be observed in logs during vLLM startup: -``` {.} +```{.} INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] ``` -`min` determines the lowest value of the bucket. `step` determines the -interval between buckets, and `max` determines the upper bound of the -bucket. Furthermore, interval between `min` and `step` has special -handling - `min` gets multiplied by consecutive powers of two, until -`step` gets reached. We call this the ramp-up phase and it is used for -handling lower batch sizes with minimum wastage, while allowing larger -padding on larger batch sizes. +`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling - `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes. Example (with ramp-up) -``` {.} +```{.} min = 2, step = 32, max = 64 => ramp_up = (2, 4, 8, 16) => stable = (32, 64) @@ -220,53 +129,28 @@ min = 2, step = 32, max = 64 Example (without ramp-up) -``` {.} +```{.} min = 128, step = 128, max = 512 => ramp_up = () => stable = (128, 256, 384, 512) => buckets = ramp_up + stable => (128, 256, 384, 512) ``` -In the logged scenario, 24 buckets were generated for prompt (prefill) -runs, and 48 buckets for decode runs. Each bucket corresponds to a -separate optimized device binary for a given model with specified tensor -shapes. Whenever a batch of requests is processed, it is padded across -batch and sequence length dimension to the smallest possible bucket. - -> [!WARNING] -> If a request exceeds maximum bucket size in any dimension, it will be -> processed without padding, and its processing may require a graph -> compilation, potentially significantly increasing end-to-end latency. -> The boundaries of the buckets are user-configurable via environment -> variables, and upper bucket boundaries can be increased to avoid such -> scenario. 
- -As an example, if a request of 3 sequences, with max sequence length of -412 comes in to an idle vLLM server, it will be padded executed as -`(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be -padded to 4 (closest batch\_size dimension higher than 3), and max -sequence length will be padded to 512 (closest sequence length dimension -higher than 412). After prefill stage, it will be executed as `(4, 512)` -decode bucket and will continue as that bucket until either batch -dimension changes (due to request being finished) - in which case it -will become a `(2, 512)` bucket, or context length increases above 512 -tokens, in which case it will become `(4, 640)` bucket. +In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. + +> [!WARNING] +> If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario. + +As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket. > [!NOTE] -> Bucketing is transparent to a client - padding in sequence length -> dimension is never returned to the client, and padding in batch -> dimension does not create new requests. +> Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. -Warmup ------- +## Warmup -Warmup is an optional, but highly recommended step occurring before vLLM -server starts listening. It executes a forward pass for each bucket with -dummy data. The goal is to pre-compile all graphs and not incur any -graph compilation overheads within bucket boundaries during server -runtime. Each warmup step is logged during vLLM startup: +Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. 
Each warmup step is logged during vLLM startup: -``` {.} +```{.} INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB @@ -280,100 +164,30 @@ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB ``` -This example uses the same buckets as in *Bucketing mechanism* section. -Each output line corresponds to execution of a single bucket. When -bucket is executed for the first time, its graph is compiled and can be -reused later on, skipping further graph compilations. - -> [!TIP] -> Compiling all the buckets might take some time and can be turned off -> with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if -> you do that, you may face graph compilations once executing a given -> bucket for the first time. It is fine to disable warmup for development, -> but it\'s highly recommended to enable it in deployment. - -HPU Graph capture ------------------------------ - -[HPU -Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) -are currently the most performant execution method of vLLM on Intel -Gaudi. When HPU Graphs are enabled, execution graphs will be traced -(recorded) ahead of time (after performing warmup), to be later replayed -during inference, significantly reducing host overheads. Recording can -take large amounts of memory, which needs to be taken into account when -allocating KV cache. Enabling HPU Graphs will impact the number of -available KV cache blocks, but vLLM provides user-configurable variables -to control memory management. - -When HPU Graphs are being used, they share the common memory pool -(\"usable memory\") as KV cache, determined by `gpu_memory_utilization` -flag (`0.9` by default). Before KV cache gets allocated, model weights -are loaded onto the device, and a forward pass of the model is executed -on dummy data, to estimate memory usage. Only after that, -`gpu_memory_utilization` flag is utilized - at its default value, will -mark 90% of free device memory at that point as usable. Next, KV cache -gets allocated, model is warmed up, and HPU Graphs are captured. -Environment variable `VLLM_GRAPH_RESERVED_MEM` defines the ratio of -memory reserved for HPU Graphs capture. With its default value -(`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory will be reserved -for graph capture (later referred to as \"usable graph memory\"), and -the remaining 90% will be utilized for KV cache. Environment variable -`VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory -reserved for prefill and decode graphs. By default -(`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages have equal memory -constraints. Lower value corresponds to less usable graph memory -reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will -reserve 20% of usable graph memory for prefill graphs, and 80% of usable -graph memory for decode graphs. +This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. 
-> [!NOTE] -> `gpu_memory_utilization` does not correspond to the absolute memory -> usage across HPU. It specifies the memory margin after loading the model -> and performing a profile run. If device has 100 GiB of total memory, and -> 50 GiB of free memory after loading model weights and executing -> profiling run, `gpu_memory_utilization` at its default value will mark -> 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total -> device memory. - -User can also configure the strategy for capturing HPU Graphs for prompt -and decode stages separately. Strategy affects the order of capturing -graphs. There are two strategies implemented: - `max_bs` - graph capture -queue will sorted in descending order by their batch sizes. Buckets with -equal batch sizes are sorted by sequence length in ascending order (e.g. -`(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, -`(1,256)`), default strategy for decode - `min_tokens` - graph capture -queue will be sorted in ascending order by the number of tokens each -graph processes (`batch_size*sequence_length`), default strategy for -prompt - -When there\'s large amount of requests pending, vLLM scheduler will -attempt to fill the maximum batch size for decode as soon as possible. -When a request is finished, decode batch size decreases. When that -happens, vLLM will attempt to schedule a prefill iteration for requests -in the waiting queue, to fill the decode batch size to its previous -state. This means that in a full load scenario, decode batch size is -often at its maximum, which makes large batch size HPU Graphs crucial to -capture, as reflected by `max_bs` strategy. On the other hand, prefills -will be executed most frequently with very low batch sizes (1-4), which -is reflected in `min_tokens` strategy. +> [!TIP] +> Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. -> [!NOTE] -> `VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by -> graphs for each stage (prefill and decode). vLLM will first attempt to -> use up entirety of usable prefill graph memory (usable graph memory \* -> `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it -> will attempt do the same for decode graphs and usable decode graph -> memory pool. If one stage is fully captured, and there is unused memory -> left within usable graph memory pool, vLLM will attempt further graph -> capture for the other stage, until no more HPU Graphs can be captured -> without exceeding reserved memory pool. The behavior on that mechanism -> can be observed in the example below. - -Each described step is logged by vLLM server, as follows (negative -values correspond to memory being released): - -``` {.} +## HPU Graph capture + +[HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. 
Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management. + +When HPU Graphs are being used, they share the common memory pool ("usable memory") as KV cache, determined by `gpu_memory_utilization` flag (`0.9` by default). Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage. Only after that, `gpu_memory_utilization` flag is utilized - at its default value, will mark 90% of free device memory at that point as usable. Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. Environment variable `VLLM_GRAPH_RESERVED_MEM` defines the ratio of memory reserved for HPU Graphs capture. With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache. Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages have equal memory constraints. Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. + +> [!NOTE] +> `gpu_memory_utilization` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory. + +User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented: - `max_bs` - graph capture queue will sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode - `min_tokens` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (`batch_size*sequence_length`), default strategy for prompt + +When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in `min_tokens` strategy. + +> [!NOTE] +> `VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). 
vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below. + +Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): + +```{.} INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] @@ -405,111 +219,59 @@ INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, alloca INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) ``` -Recommended vLLM Parameters ------------------------------ +## Recommended vLLM Parameters -- We recommend running inference on Gaudi 2 with `block_size` of 128 - for BF16 data type. Using default values (16, 32) might lead to - sub-optimal performance due to Matrix Multiplication Engine - under-utilization (see [Gaudi - Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). -- For max throughput on Llama 7B, we recommend running with batch size - of 128 or 256 and max context length of 2048 with HPU Graphs - enabled. If you encounter out-of-memory issues, see troubleshooting - section. +- We recommend running inference on Gaudi 2 with `block_size` of 128 for BF16 data type. Using default values (16, 32) might lead to sub-optimal performance due to Matrix Multiplication Engine under-utilization (see [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). +- For max throughput on Llama 7B, we recommend running with batch size of 128 or 256 and max context length of 2048 with HPU Graphs enabled. If you encounter out-of-memory issues, see troubleshooting section. -Environment variables ------------------------------ +## Environment variables **Diagnostic and profiling knobs:** -- `VLLM_PROFILER_ENABLED`: if `true`, high level profiler will be - enabled. Resulting JSON traces can be viewed in - [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). Disabled - by default. -- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: if `true`, will log graph - compilations per each vLLM engine step, only when there was any - - highly recommended to use alongside `PT_HPU_METRICS_GC_DETAILS=1`. - Disabled by default. -- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true`, will log graph - compilations per each vLLM engine step, always, even if there were - none. Disabled by default. -- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true`, will log cpu fallbacks - per each vLLM engine step, only when there was any. Disabled by - default. 
-- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, will log cpu - fallbacks per each vLLM engine step, always, even if there were - none. Disabled by default. +- `VLLM_PROFILER_ENABLED`: if `true`, high level profiler will be enabled. Resulting JSON traces can be viewed in [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). Disabled by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: if `true`, will log graph compilations per each vLLM engine step, only when there was any - highly recommended to use alongside `PT_HPU_METRICS_GC_DETAILS=1`. Disabled by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true`, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true`, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default. **Performance tuning knobs:** -- `VLLM_SKIP_WARMUP`: if `true`, warmup will be skipped, `false` by - default -- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for - HPUGraph capture, `0.1` by default -- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory - dedicated for prompt graphs, `0.3` by default -- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt - graph capture, `min_tokens` or `max_bs`, `min_tokens` by default -- `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode - graph capture, `min_tokens` or `max_bs`, `max_bs` by default -- `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment - variables configuring ranges of bucketing mechanism - - `{phase}` is either `PROMPT` or `DECODE` - - `{dim}` is either `BS`, `SEQ` or `BLOCK` - - `{param}` is either `MIN`, `STEP` or `MAX` - - Default values: - - Prompt: - - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` - - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` - - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): - `min(max_num_seqs, 64)` - - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): - `block_size` - - sequence length step - (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` - - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): - `max_model_len` - - - Decode: - - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` - - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): - `min(max_num_seqs, 32)` - - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): - `max_num_seqs` - - block size min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): - `block_size` - - block size step - (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` - - block size max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): - `max(128, (max_num_seqs*max_model_len)/block_size)` - -Additionally, there are HPU PyTorch Bridge environment variables -impacting vLLM execution: - -- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be - used, if `1` PyTorch Lazy backend for Gaudi will be used, `1` is - default -- `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor - parallel inference with HPU Graphs - -Troubleshooting: Tweaking HPU Graphs -==================================== - -If you experience device out-of-memory issues or want to attempt -inference at higher batch sizes, try tweaking HPU Graphs by following -the below: - -- Tweak `gpu_memory_utilization` knob. 
It will decrease the allocation - of KV cache, leaving some headroom for capturing graphs with larger - batch size. By default `gpu_memory_utilization` is set to 0.9. It - attempts to allocate \~90% of HBM left for KV cache after short - profiling run. Note that decreasing reduces the number of KV cache - blocks you have available, and therefore reduces the effective - maximum number of tokens you can handle at a given time. -- If this method is not efficient, you can disable `HPUGraph` - completely. With HPU Graphs disabled, you are trading latency and - throughput at lower batches for potentially higher throughput on - higher batches. You can do that by adding `--enforce-eager` flag to - server (for online inference), or by passing `enforce_eager=True` - argument to LLM constructor (for offline inference). +- `VLLM_SKIP_WARMUP`: if `true`, warmup will be skipped, `false` by default +- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for HPUGraph capture, `0.1` by default +- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory dedicated for prompt graphs, `0.3` by default +- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt graph capture, `min_tokens` or `max_bs`, `min_tokens` by default +- `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode graph capture, `min_tokens` or `max_bs`, `max_bs` by default +- `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism + - `{phase}` is either `PROMPT` or `DECODE` + - `{dim}` is either `BS`, `SEQ` or `BLOCK` + - `{param}` is either `MIN`, `STEP` or `MAX` + - Default values: + - Prompt: + + - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` + - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` + - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)` + - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` + - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len` + + - Decode: + + - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` + - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` + - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` + - block size min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size` + - block size step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` + - block size max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)` + +Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: + +- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will be used, `1` is default +- `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs + +# Troubleshooting: Tweaking HPU Graphs + +If you experience device out-of-memory issues or want to attempt inference at higher batch sizes, try tweaking HPU Graphs by following the below: + +- Tweak `gpu_memory_utilization` knob. It will decrease the allocation of KV cache, leaving some headroom for capturing graphs with larger batch size. By default `gpu_memory_utilization` is set to 0.9. It attempts to allocate ~90% of HBM left for KV cache after short profiling run. Note that decreasing reduces the number of KV cache blocks you have available, and therefore reduces the effective maximum number of tokens you can handle at a given time. 
+- If this method is not efficient, you can disable `HPUGraph` completely. With HPU Graphs disabled, you are trading latency and throughput at lower batches for potentially higher throughput on higher batches. You can do that by adding `--enforce-eager` flag to server (for online inference), or by passing `enforce_eager=True` argument to LLM constructor (for offline inference). From 2d2bf7a03d3d7461c21029910370379644d22aa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Kot=C5=82owski?= Date: Mon, 14 Oct 2024 14:32:12 +0200 Subject: [PATCH 317/341] [CI] Prepare separate Jenkins tests for torch compile mode (#388) --- .jenkins/test_config_t_compile.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 .jenkins/test_config_t_compile.yaml diff --git a/.jenkins/test_config_t_compile.yaml b/.jenkins/test_config_t_compile.yaml new file mode 100644 index 0000000000000..58fcb45a7edfb --- /dev/null +++ b/.jenkins/test_config_t_compile.yaml @@ -0,0 +1,16 @@ +# test_config_t_compile.yaml +stages: + - name: test_gsm8k_small_models_tcompile + steps: + - name: gsm8k_small_g3_tp1_tc + flavor: g3 + command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-small.txt -t 1 + - name: gsm8k_small_g3_tp2_tc + flavor: g3.s + command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-small.txt -t 2 + - name: gsm8k_small_g2_tp1_tc + flavor: g2 + command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-small.txt -t 1 + - name: gsm8k_small_g2_tp2_tc + flavor: g2.s + command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-small.txt -t 2 From 9df1d4abda18c7b0eec4c8e4edf9437fbb3f7ea0 Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Mon, 14 Oct 2024 18:04:13 +0530 Subject: [PATCH 318/341] Remove workaround added to resolve multi-card stall issue (#387) This PR removes additional `multiprocessing.Process` object created as a workaround for resolving multi-card stall issue. 
--- tests/lora/test_llama_hpu.py | 18 +++--------------- tests/lora/test_multilora_hpu.py | 19 +++---------------- 2 files changed, 6 insertions(+), 31 deletions(-) diff --git a/tests/lora/test_llama_hpu.py b/tests/lora/test_llama_hpu.py index dfd551f2ca043..5571d727ef8e2 100644 --- a/tests/lora/test_llama_hpu.py +++ b/tests/lora/test_llama_hpu.py @@ -1,4 +1,3 @@ -from multiprocessing import Process from typing import List from conftest import cleanup @@ -78,23 +77,12 @@ def _test_llama_lora(sql_lora_files, tp_size): def test_llama_lora_1x(sql_lora_files): - p = Process(target=_test_llama_lora, args=(sql_lora_files, 1)) - p.start() - p.join() - assert p.exitcode == 0 + _test_llama_lora(sql_lora_files, 1) def test_llama_lora_2x(sql_lora_files): - # Work-around to resolve stalling issue in multi-card scenario - p = Process(target=_test_llama_lora, args=(sql_lora_files, 2)) - p.start() - p.join() - assert p.exitcode == 0 + _test_llama_lora(sql_lora_files, 2) def test_llama_lora_4x(sql_lora_files): - # Work-around to resolve stalling issue in multi-card scenario - p = Process(target=_test_llama_lora, args=(sql_lora_files, 4)) - p.start() - p.join() - assert p.exitcode == 0 + _test_llama_lora(sql_lora_files, 4) diff --git a/tests/lora/test_multilora_hpu.py b/tests/lora/test_multilora_hpu.py index 64eda037ff059..d035761923dd6 100644 --- a/tests/lora/test_multilora_hpu.py +++ b/tests/lora/test_multilora_hpu.py @@ -1,4 +1,3 @@ -from multiprocessing import Process from typing import List, Optional, Tuple from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams @@ -107,24 +106,12 @@ def _test_llama_multilora(sql_lora_files, tp_size): def test_llama_multilora_1x(sql_lora_files): - # Work-around to resolve stalling issue in multi-card scenario - p = Process(target=_test_llama_multilora, args=(sql_lora_files, 1)) - p.start() - p.join() - assert p.exitcode == 0 + _test_llama_multilora(sql_lora_files, 1) def test_llama_multilora_2x(sql_lora_files): - # Work-around to resolve stalling issue in multi-card scenario - p = Process(target=_test_llama_multilora, args=(sql_lora_files, 2)) - p.start() - p.join() - assert p.exitcode == 0 + _test_llama_multilora(sql_lora_files, 2) def test_llama_multilora_4x(sql_lora_files): - # Work-around to resolve stalling issue in multi-card scenario - p = Process(target=_test_llama_multilora, args=(sql_lora_files, 4)) - p.start() - p.join() - assert p.exitcode == 0 + _test_llama_multilora(sql_lora_files, 4) From 9777c9f8538b497d5f6cb986d4535db0185edf49 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 14 Oct 2024 14:46:11 +0200 Subject: [PATCH 319/341] Update SynapseAI version in README & Dockerfile (#390) --- Dockerfile.hpu | 4 +--- README_GAUDI.md | 8 ++++---- docs/source/getting_started/gaudi-installation.rst | 8 ++++---- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/Dockerfile.hpu b/Dockerfile.hpu index ab714cdac4670..f481c8c6a57bf 100644 --- a/Dockerfile.hpu +++ b/Dockerfile.hpu @@ -1,4 +1,4 @@ -FROM vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest +FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest COPY ./ /workspace/vllm @@ -13,6 +13,4 @@ RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install WORKDIR /workspace/ -RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks - ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/README_GAUDI.md b/README_GAUDI.md index 
08458251a753d..555cd1738b909 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -11,7 +11,7 @@ Please follow the instructions provided in the [Gaudi Installation Guide](https: - OS: Ubuntu 22.04 LTS - Python: 3.10 - Intel Gaudi accelerator -- Intel Gaudi software version 1.17.0 +- Intel Gaudi software version 1.18.0 To verify that the Intel Gaudi software was correctly installed, run: @@ -31,8 +31,8 @@ It is highly recommended to use the latest Docker image from Intel Gaudi vault. Use the following commands to run a Docker image: ```{.console} -$ docker pull vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest -$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest +$ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest ``` ## Build and Install vLLM @@ -98,7 +98,7 @@ Currently in vLLM for HPU we support four execution modes, depending on selected | 1 | 1 | PyTorch lazy mode | > [!WARNING] -> In 1.17.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.17.0, please use HPU Graphs, or PyTorch lazy mode. +> In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. ## Bucketing mechanism diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index c9df862197f0a..111bab2494990 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -18,7 +18,7 @@ Requirements - OS: Ubuntu 22.04 LTS - Python: 3.10 - Intel Gaudi accelerator -- Intel Gaudi software version 1.17.0 +- Intel Gaudi software version 1.18.0 To verify that the Intel Gaudi software was correctly installed, run: @@ -45,8 +45,8 @@ Use the following commands to run a Docker image: .. code:: console - $ docker pull vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest - $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest + $ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest Build and Install vLLM --------------------------- @@ -157,7 +157,7 @@ Currently in vLLM for HPU we support four execution modes, depending on selected - PyTorch lazy mode .. 
warning:: - In 1.17.0, all modes utilizing ``PT_HPU_LAZY_MODE=0`` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.17.0, please use HPU Graphs, or PyTorch lazy mode. + In 1.18.0, all modes utilizing ``PT_HPU_LAZY_MODE=0`` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. Bucketing mechanism From 9ac52ab11186926530648385135b8a8f7eadfe7f Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 14 Oct 2024 17:01:02 +0300 Subject: [PATCH 320/341] fix attention backend selector: --- vllm/worker/hpu_model_runner.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 99dc326612588..f81e4aa59b289 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -541,13 +541,12 @@ def __init__( self.kv_cache_dtype = kv_cache_dtype self.attn_backend = get_attn_backend( - self.model_config.get_num_attention_heads(self.parallel_config), self.model_config.get_head_size(), - self.model_config.get_num_kv_heads(self.parallel_config), self.model_config.get_sliding_window(), self.model_config.dtype, self.kv_cache_dtype, self.block_size, + self.model_config.is_attention_free, ) # Lazy initialization From 55dd07e949db7fb2839c4d91b175ea76985a3257 Mon Sep 17 00:00:00 2001 From: Dudi Lester <160421192+dudilester@users.noreply.github.com> Date: Tue, 15 Oct 2024 10:46:45 +0300 Subject: [PATCH 321/341] enable mixtral quantization using INC (#372) --- requirements-hpu.txt | 2 +- vllm/executor/ray_hpu_executor.py | 3 + vllm/model_executor/layers/fused_moe/layer.py | 97 +++++++------------ 3 files changed, 37 insertions(+), 65 deletions(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 8f7f0339b02e3..8495d63ce72fa 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -8,4 +8,4 @@ pandas tabulate setuptools>=61 setuptools-scm>=8 -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@36c7f9c +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@7531cc6 diff --git a/vllm/executor/ray_hpu_executor.py b/vllm/executor/ray_hpu_executor.py index 343fa43b0eda1..775c0a5d95899 100644 --- a/vllm/executor/ray_hpu_executor.py +++ b/vllm/executor/ray_hpu_executor.py @@ -78,6 +78,9 @@ def shutdown(self) -> None: ray.kill(worker) self.forward_dag = None + def finish_measurements(self): + self._run_workers("finish_measurements") + def _get_worker_module_and_class( self ) -> Tuple[str, str, Optional[Callable[[], diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 865f5c6aad1eb..457450cda2ce6 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -14,6 +14,8 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform +is_hpu = current_platform.is_hpu() + logger = init_logger(__name__) @@ -262,7 +264,7 @@ def _load_model_weight_or_group_weight_scale(self, shard_dim: int, expert_data: torch.Tensor, shard_id: str, loaded_weight: torch.tensor, - tp_rank: int): + tp_rank: int, expert_id: int): # Load grouped weight scales for group quantization # or model weights if shard_id == "w2": @@ -270,13 
+272,15 @@ def _load_model_weight_or_group_weight_scale(self, shard_dim: int, shard_dim=shard_dim, loaded_weight=loaded_weight, expert_data=expert_data, - tp_rank=tp_rank) + tp_rank=tp_rank, + expert_id=expert_id) elif shard_id in ("w1", "w3"): self._load_w13(shard_id=shard_id, shard_dim=shard_dim, loaded_weight=loaded_weight, expert_data=expert_data, - tp_rank=tp_rank) + tp_rank=tp_rank, + expert_id=expert_id) def _load_per_channel_weight_scale(self, expert_data: torch.Tensor, shard_dim: int, shard_id: str, @@ -292,9 +296,15 @@ def _load_per_channel_weight_scale(self, expert_data: torch.Tensor, expert_data=expert_data, tp_rank=tp_rank) - def _load_w13(self, expert_data: torch.Tensor, shard_dim: int, - shard_id: str, loaded_weight: torch.tensor, tp_rank: int): + def _load_w13(self, + expert_data: torch.Tensor, + shard_dim: int, + shard_id: str, + loaded_weight: torch.tensor, + tp_rank: int, + expert_id: Optional[int] = None): + orig_exp_data = expert_data.view(expert_data.size()) # Index the loaded weight for tp sharding. # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim shard_size = expert_data.shape[shard_dim] // 2 @@ -310,8 +320,17 @@ def _load_w13(self, expert_data: torch.Tensor, shard_dim: int, expert_data = expert_data.narrow(shard_dim, shard_size, shard_size) expert_data.copy_(loaded_weight) - def _load_w2(self, expert_data: torch.Tensor, shard_dim: int, - shard_id: str, loaded_weight: torch.tensor, tp_rank: int): + if is_hpu: + self.hpu_static_fused_moe.w13_list[expert_id].set_weight( + orig_exp_data) + + def _load_w2(self, + expert_data: torch.Tensor, + shard_dim: int, + shard_id: str, + loaded_weight: torch.tensor, + tp_rank: int, + expert_id: Optional[int] = None): # Index the loaded weight for tp sharding. # down_proj: "RowParallel" so tp sharding on input_dim @@ -321,6 +340,9 @@ def _load_w2(self, expert_data: torch.Tensor, shard_dim: int, shard_size) # w2, down_proj: Load into only logical weight of w2. expert_data.copy_(loaded_weight) + if is_hpu: + self.hpu_static_fused_moe.w2_list[expert_id].set_weight( + expert_data) def _load_single_value(self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, expert_id: int): @@ -423,7 +445,8 @@ def weight_loader(self, param: torch.nn.Parameter, shard_dim=shard_dim, loaded_weight=loaded_weight, expert_data=expert_data, - tp_rank=tp_rank) + tp_rank=tp_rank, + expert_id=expert_id) elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value: self._load_per_tensor_weight_scale(shard_id=shard_id, param=param, @@ -449,7 +472,8 @@ def weight_loader(self, param: torch.nn.Parameter, shard_dim=shard_dim, loaded_weight=loaded_weight, expert_data=expert_data, - tp_rank=tp_rank) + tp_rank=tp_rank, + expert_id=expert_id) return @staticmethod @@ -528,58 +552,3 @@ def make_expert_params_mapping( ("w3", ckpt_up_proj_name), ] ] - - def _load_fp8_scale(self, param: torch.nn.Parameter, - loaded_weight: torch.Tensor, weight_name: str, - shard_id: str, expert_id: int) -> None: - param_data = param.data - - # Input scales can be loaded directly and should be equal. - if "input_scale" in weight_name: - if param_data[expert_id] != 1 and (param_data[expert_id] - - loaded_weight).abs() > 1e-5: - raise ValueError( - "input_scales of w1 and w3 of a layer " - f"must be equal. But got {param_data[expert_id]} " - f"vs. 
{loaded_weight}") - param_data[expert_id] = loaded_weight - # Weight scales - elif "weight_scale" in weight_name: - # If we are in merged column case (gate_up_proj) - if shard_id in ("w1", "w3"): - # We have to keep the weight scales of w1 and w3 because - # we need to re-quantize w1/w3 weights after weight loading. - idx = 0 if shard_id == "w1" else 1 - param_data[expert_id][idx] = loaded_weight - # If we are in the row parallel case (down_proj) - else: - param_data[expert_id] = loaded_weight - # Weights - else: - tp_rank = get_tensor_model_parallel_rank() - shard_size = self.intermediate_size_per_partition - shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size) - - # w1, gate_proj case: Load into first shard of w13. - if shard_id == 0: - param_data[expert_id, - 0:shard_size, :] = loaded_weight[shard, :] - if current_platform.is_hpu(): - self.hpu_static_fused_moe.w13_list[expert_id].set_weight( - param_data[expert_id]) - # w3, up_proj case: Load into second shard of w13. - elif shard_id == 2: - param_data[expert_id, shard_size:2 * - shard_size, :] = loaded_weight[shard, :] - if current_platform.is_hpu(): - self.hpu_static_fused_moe.w13_list[expert_id].set_weight( - param_data[expert_id]) - # w2, down_proj case: Load into only shard of w2. - elif shard_id == 1: - param_data[expert_id, :, :] = loaded_weight[:, shard] - if current_platform.is_hpu(): - self.hpu_static_fused_moe.w2_list[expert_id].set_weight( - param_data[expert_id]) - else: - raise ValueError( - f"Shard id must be in [0,1,2] but got {shard_id}") From 401f5ae3d339b1b0402b7f276905ef28d4ba0b21 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 15 Oct 2024 12:05:57 +0200 Subject: [PATCH 322/341] [CI] Temporarily increase test tolerances (#392) This PR raises the allowed relative tolerance in GSM8K to 0.06, and moves Llama-70B test to 4xG2 from 2xG2 until memory usage is investigated (success run: vLLM-CI-Pipeline/206) --- .jenkins/lm-eval-harness/test_lm_eval_correctness.py | 2 +- .jenkins/test_config.yaml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py index 9c6d0ee48caf5..421a949ab72e5 100644 --- a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py +++ b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py @@ -19,7 +19,7 @@ import vllm -RTOL = 0.05 +RTOL = 0.06 TEST_DATA_FILE = os.environ.get( "LM_EVAL_TEST_DATA_FILE", ".jenkins/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml") diff --git a/.jenkins/test_config.yaml b/.jenkins/test_config.yaml index 99ff97df8cd34..f90cdb354d4f5 100644 --- a/.jenkins/test_config.yaml +++ b/.jenkins/test_config.yaml @@ -19,6 +19,6 @@ stages: - name: gsm8k_large_g3_tp2 flavor: g3.s command: cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-large.txt -t 2 - - name: gsm8k_large_g2_tp2 - flavor: g2.s - command: cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-large.txt -t 2 + - name: gsm8k_large_g2_tp4 + flavor: g2.m + command: cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-large.txt -t 4 From e598f3f125a50326e8f187ce59b096129aab40eb Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 15 Oct 2024 12:07:00 +0200 Subject: [PATCH 323/341] Add quickstart section to READMEs (#391) --- README_GAUDI.md | 21 ++++++++++++--- .../getting_started/gaudi-installation.rst | 27 ++++++++++++++++--- 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index 
555cd1738b909..b9c744bd9e23f 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -4,7 +4,7 @@ This README provides instructions on running vLLM with Intel Gaudi devices. # Requirements and Installation -Please follow the instructions provided in the [Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) to set up the environment. To achieve the best performance, please follow the methods outlined in the [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). +Please follow the instructions provided in the [Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) to set up the execution environment. To achieve the best performance, please follow the methods outlined in the [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). ## Requirements @@ -13,18 +13,31 @@ Please follow the instructions provided in the [Gaudi Installation Guide](https: - Intel Gaudi accelerator - Intel Gaudi software version 1.18.0 +## Quick start using Dockerfile +``` +$ docker build -f Dockerfile.hpu -t vllm-hpu-env . +$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env +``` + +> [!TIP] +> If you're facing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered correctly. + + +## Build from source + +### Environment verification To verify that the Intel Gaudi software was correctly installed, run: ```{.console} $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible -$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core and habanalabs-thunk are installed +$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed $ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed $ pip list | grep neural # verify that neural-compressor is installed ``` Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) for more details. -## Run Docker Image +### Run Docker Image It is highly recommended to use the latest Docker image from Intel Gaudi vault. Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) for more details. 
@@ -35,7 +48,7 @@ $ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest ``` -## Build and Install vLLM +### Build and Install vLLM-fork Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index 111bab2494990..c943625a65f29 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -8,8 +8,8 @@ Requirements and Installation Please follow the instructions provided in the `Gaudi Installation Guide `__ -to set up the environment. To achieve the best performance, please -follow the methods outlined in the `Optimizing Training Platform +to set up the execution environment. To achieve the best performance, +please follow the methods outlined in the `Optimizing Training Platform Guide `__. Requirements @@ -20,12 +20,31 @@ Requirements - Intel Gaudi accelerator - Intel Gaudi software version 1.18.0 + +Quick start using Dockerfile +============================ +.. code:: console + + $ docker build -f Dockerfile.hpu -t vllm-hpu-env . + $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env + + +.. 
tip:: + If you're observing the following error: ``docker: Error response from daemon: Unknown runtime specified habana.``, please refer to "Install Using Containers" section of `Intel Gaudi Software Stack and Driver Installation Date: Wed, 16 Oct 2024 09:40:20 +0200 Subject: [PATCH 324/341] Softmax: add weighted-sum normalization (#378) Supporting PR for https://github.com/HabanaAI/vllm-hpu-extension/pull/10 --- requirements-hpu.txt | 2 +- vllm/attention/backends/hpu_attn.py | 1 + vllm/attention/ops/hpu_paged_attn.py | 1 + vllm/worker/hpu_model_runner.py | 45 +++++++++++++++++++++------- 4 files changed, 38 insertions(+), 11 deletions(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 8495d63ce72fa..1a583974be151 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -8,4 +8,4 @@ pandas tabulate setuptools>=61 setuptools-scm>=8 -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@7531cc6 +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@fd7f2e6 diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 17201fe6e1cd6..a8f4b09b67274 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -222,6 +222,7 @@ def forward( block_list=attn_metadata.block_list, block_mapping=attn_metadata.block_mapping, block_bias=attn_metadata.attn_bias, + block_scales=attn_metadata.block_scales, scale=self.scale, matmul_qk_op=self.matmul_qk, matmul_av_op=self.matmul_av, diff --git a/vllm/attention/ops/hpu_paged_attn.py b/vllm/attention/ops/hpu_paged_attn.py index 7fbe26d83f320..4c0fb2a628361 100644 --- a/vllm/attention/ops/hpu_paged_attn.py +++ b/vllm/attention/ops/hpu_paged_attn.py @@ -20,6 +20,7 @@ class HPUPagedAttentionMetadata: block_usage: Optional[torch.Tensor] block_indices: Optional[torch.Tensor] block_offsets: Optional[torch.Tensor] + block_scales: Optional[torch.Tensor] class HPUPagedAttention: diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index f81e4aa59b289..d8150a56844a2 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -298,9 +298,19 @@ def _set_block_mapping(self, metadata, batch_size, device, dtype): mask = mask >= metadata.block_usage.unsqueeze(-1) attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_( mask, -math.inf)) - block_mapping = torch.nn.functional.one_hot( - metadata.block_mapping.to(torch.long), - num_classes=batch_size).to(dtype) + if is_fake_hpu(): + # Unfortunately one_hot on CPU doesn't handle + # out of bounds classes. 
We need to mask those + # values manually + oob_values = metadata.block_mapping.lt(0) + block_mapping = metadata.block_mapping.masked_fill(oob_values, 0) + block_mapping = torch.nn.functional.one_hot(block_mapping, + num_classes=batch_size) + block_mapping.masked_fill_(oob_values.unsqueeze(-1), 0) + else: + block_mapping = torch.nn.functional.one_hot(metadata.block_mapping, + num_classes=batch_size) + block_mapping = block_mapping.to(dtype) metadata = metadata._replace(block_mapping=block_mapping, attn_bias=attn_bias) return metadata @@ -873,6 +883,7 @@ def _prepare_prompt( block_usage=None, block_indices=block_indices, block_offsets=block_offsets, + block_scales=None, attn_bias=None, seq_lens_tensor=seq_lens_tensor, num_prefills=real_num_seqs, @@ -968,7 +979,15 @@ def _prepare_decode( num_decode_tokens = sum(seq_lens) blocks_used = [len(bt) for bt in block_tables if bt] - block_list = list(itertools.chain(*block_tables)) + block_list = [] + block_scales = [] + for i, bt in enumerate(block_tables): + block_list.extend(bt) + blocks_in_group = len(bt) + if blocks_in_group > 0: + scale = 1.0 / blocks_in_group + block_scales.extend([scale] * blocks_in_group) + block_mapping_nested: List[List[int]] = [ [i] * b_u for i, b_u in enumerate(blocks_used) ] @@ -984,18 +1003,19 @@ def _prepare_decode( block_bucket_size = find_bucket(len(block_list), self.decode_block_bucket_cfg) - block_list = pad_list(block_list, block_bucket_size, _PAD_SLOT_ID) - block_mapping = pad_list(block_mapping, block_bucket_size, 0) - block_usage = pad_list(block_usage, block_bucket_size, 0) + block_list = pad_list(block_list, block_bucket_size, _PAD_BLOCK_ID) + block_mapping = pad_list(block_mapping, block_bucket_size, -1) + block_usage = pad_list(block_usage, block_bucket_size, 1) + block_scales = pad_list(block_scales, block_bucket_size, 0.0) block_list = torch.tensor(block_list, dtype=torch.int, device=self.device) block_mapping = torch.tensor(block_mapping, - dtype=torch.int, + dtype=torch.long, device=self.device) block_usage = torch.tensor(block_usage, - dtype=torch.bfloat16, + dtype=self.model_config.dtype, device=self.device) slot_mapping = torch.tensor(slot_mapping, @@ -1004,6 +1024,10 @@ def _prepare_decode( block_indices, block_offsets = precompute_indices_and_offsets( self.block_size, slot_mapping, False) + block_scales = torch.tensor(block_scales, + dtype=self.model_config.dtype, + device=self.device) + attn_metadata = self.attn_backend.make_metadata( is_prompt=False, block_list=block_list, @@ -1011,6 +1035,7 @@ def _prepare_decode( block_usage=block_usage, block_indices=block_indices, block_offsets=block_offsets, + block_scales=block_scales, attn_bias=None, seq_lens_tensor=None, num_prefills=0, @@ -1222,7 +1247,7 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: attention_metadata = subtuple(metadata, 'TrimmedAttentionMetadata', [ 'attn_bias', 'seq_lens_tensor', 'block_list', 'block_mapping', 'block_usage', 'slot_mapping', 'is_prompt', 'block_indices', - 'block_offsets' + 'block_offsets', 'block_scales' ]) return attention_metadata From 2fa46cdeea611db7fa4cc465b8d5c17465807888 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 16 Oct 2024 19:04:17 +0300 Subject: [PATCH 325/341] remove jenkins files --- .github/workflows/reminder_comment.yml | 2 +- .../configs/Meta-Llama-3-70B-Instruct.yaml | 12 -- .../configs/Meta-Llama-3-8B-Instruct.yaml | 12 -- .../configs/Meta-Llama-3.1-8B-Instruct.yaml | 15 -- .../lm-eval-harness/configs/models-large.txt | 1 - 
.../lm-eval-harness/configs/models-small.txt | 2 - .../run-lm-eval-gsm-vllm-baseline.sh | 51 ----- .jenkins/lm-eval-harness/run-tests.sh | 69 ------- .../test_lm_eval_correctness.py | 183 ------------------ .jenkins/requirements-test-hpu.txt | 2 - .jenkins/test_config.yaml | 24 --- .jenkins/test_config_t_compile.yaml | 16 -- 12 files changed, 1 insertion(+), 388 deletions(-) delete mode 100644 .jenkins/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml delete mode 100644 .jenkins/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml delete mode 100644 .jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct.yaml delete mode 100644 .jenkins/lm-eval-harness/configs/models-large.txt delete mode 100644 .jenkins/lm-eval-harness/configs/models-small.txt delete mode 100644 .jenkins/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh delete mode 100644 .jenkins/lm-eval-harness/run-tests.sh delete mode 100644 .jenkins/lm-eval-harness/test_lm_eval_correctness.py delete mode 100644 .jenkins/requirements-test-hpu.txt delete mode 100644 .jenkins/test_config.yaml delete mode 100644 .jenkins/test_config_t_compile.yaml diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml index 99827756d2066..d1791c3bc865a 100644 --- a/.github/workflows/reminder_comment.yml +++ b/.github/workflows/reminder_comment.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Remind to run full CI on PR - uses: actions/github-script@v6 + uses: actions/github-script@v7 with: script: | github.rest.issues.createComment({ diff --git a/.jenkins/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml b/.jenkins/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml deleted file mode 100644 index 38965c6197c55..0000000000000 --- a/.jenkins/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5 -model_name: "/mnt/weka/data/pytorch/llama3/Meta-Llama-3-70B-Instruct" -tasks: -- name: "gsm8k" - metrics: - - name: "exact_match,strict-match" - value: 0.892 - - name: "exact_match,flexible-extract" - value: 0.892 -limit: 250 -num_fewshot: 5 -dtype: "bfloat16" diff --git a/.jenkins/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml b/.jenkins/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml deleted file mode 100644 index 9fe7d634b887b..0000000000000 --- a/.jenkins/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1 -model_name: "/mnt/weka/data/pytorch/llama3/Meta-Llama-3-8B-Instruct" -tasks: -- name: "gsm8k" - metrics: - - name: "exact_match,strict-match" - value: 0.756 - - name: "exact_match,flexible-extract" - value: 0.752 -limit: 250 -num_fewshot: 5 -dtype: "bfloat16" diff --git a/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct.yaml b/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct.yaml deleted file mode 100644 index e2458a8ea4f1c..0000000000000 --- a/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# FIXME(kzawora): these scores were generated using vLLM on HPU, we need to confirm them on HF -# VLLM_SKIP_WARMUP=true bash run-lm-eval-gsm-cot-llama-vllm-baseline.sh -m "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct" -b 128 -l 1319 -f 8 -t 1 -model_name: 
"/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct" -tasks: -- name: "gsm8k_cot_llama" - metrics: - - name: "exact_match,strict-match" - value: 0.8317 - - name: "exact_match,flexible-extract" - value: 0.8355 -limit: null -num_fewshot: 8 -dtype: "bfloat16" -fewshot_as_multiturn: true -apply_chat_template: true \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/configs/models-large.txt b/.jenkins/lm-eval-harness/configs/models-large.txt deleted file mode 100644 index ca2548d1234a8..0000000000000 --- a/.jenkins/lm-eval-harness/configs/models-large.txt +++ /dev/null @@ -1 +0,0 @@ -Meta-Llama-3-70B-Instruct.yaml \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/configs/models-small.txt b/.jenkins/lm-eval-harness/configs/models-small.txt deleted file mode 100644 index d8ae241e58ad3..0000000000000 --- a/.jenkins/lm-eval-harness/configs/models-small.txt +++ /dev/null @@ -1,2 +0,0 @@ -Meta-Llama-3-8B-Instruct.yaml -Meta-Llama-3.1-8B-Instruct.yaml \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.jenkins/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh deleted file mode 100644 index 65128d6b437e1..0000000000000 --- a/.jenkins/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/bash -# We can use this script to compute baseline accuracy on GSM for vllm. -# We use this for fp8, which HF does not support. -# -# Make sure you have lm-eval-harness installed: -# pip install lm-eval==0.4.3 - -usage() { - echo`` - echo "Runs lm eval harness on GSM8k using huggingface transformers." - echo "This pathway is intended to be used to create baselines for " - echo "our automated nm-test-accuracy workflow" - echo - echo "usage: ${0} " - echo - echo " -m - huggingface stub or local directory of the model" - echo " -b - batch size to run the evaluation at" - echo " -l - limit number of samples to run" - echo " -f - number of fewshot samples to use" - echo " -t - tensor parallel size to run at" - echo -} - -while getopts "m:b:l:f:t:" OPT; do - case ${OPT} in - m ) - MODEL="$OPTARG" - ;; - b ) - BATCH_SIZE="$OPTARG" - ;; - l ) - LIMIT="$OPTARG" - ;; - f ) - FEWSHOT="$OPTARG" - ;; - t ) - TP_SIZE="$OPTARG" - ;; - \? ) - usage - exit 1 - ;; - esac -done - -lm_eval --model vllm \ - --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096,dtype=bfloat16 \ - --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ - --batch_size $BATCH_SIZE diff --git a/.jenkins/lm-eval-harness/run-tests.sh b/.jenkins/lm-eval-harness/run-tests.sh deleted file mode 100644 index 09d507d404ede..0000000000000 --- a/.jenkins/lm-eval-harness/run-tests.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash - -usage() { - echo`` - echo "Runs lm eval harness on GSM8k using vllm and compares to " - echo "precomputed baseline (measured by HF transformers.)" - echo - echo "usage: ${0} " - echo - echo " -c - path to the test data config (e.g. configs/small-models.txt)" - echo " -t - tensor parallel size" - echo -} - -SUCCESS=0 - -while getopts "c:t:j:" OPT; do - case ${OPT} in - c ) - CONFIG="$OPTARG" - ;; - t ) - TP_SIZE="$OPTARG" - ;; - \? ) - usage - exit 1 - ;; - esac -done - -# Parse list of configs. 
-IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG - -for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" -do - LOCAL_SUCCESS=0 - - echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE===" - - export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG} - export LM_EVAL_TP_SIZE=$TP_SIZE - export PT_HPU_ENABLE_LAZY_COLLECTIVES=true - export VLLM_SKIP_WARMUP=true - RANDOM_SUFFIX=$(tr -dc A-Za-z0-9 Date: Wed, 16 Oct 2024 19:06:11 +0300 Subject: [PATCH 326/341] restore README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b0ecdc6b6057d..72c3273edc61d 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Easy, fast, and cheap LLM serving for everyone

-| Intel® Gaudi® README | Documentation | Blog | Paper | Discord | Twitter/X | Developer Slack | +| Documentation | Blog | Paper | Discord | Twitter/X | Developer Slack |

From 91af5dac6a957fe54052c525c7f021d33edb61cc Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 16 Oct 2024 19:08:46 +0300 Subject: [PATCH 327/341] remove fakehpu --- vllm/worker/hpu_model_runner.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 2634ba40ab122..b3bd6ab1a447a 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -296,18 +296,8 @@ def _set_block_mapping(self, metadata, batch_size, device, dtype): mask = mask >= metadata.block_usage.unsqueeze(-1) attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_( mask, -math.inf)) - if is_fake_hpu(): - # Unfortunately one_hot on CPU doesn't handle - # out of bounds classes. We need to mask those - # values manually - oob_values = metadata.block_mapping.lt(0) - block_mapping = metadata.block_mapping.masked_fill(oob_values, 0) - block_mapping = torch.nn.functional.one_hot(block_mapping, - num_classes=batch_size) - block_mapping.masked_fill_(oob_values.unsqueeze(-1), 0) - else: - block_mapping = torch.nn.functional.one_hot(metadata.block_mapping, - num_classes=batch_size) + block_mapping = torch.nn.functional.one_hot(metadata.block_mapping, + num_classes=batch_size) block_mapping = block_mapping.to(dtype) metadata = metadata._replace(block_mapping=block_mapping, attn_bias=attn_bias) From d2ce468445e8781f210d201f9a26b94df3630d22 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 16 Oct 2024 19:20:53 +0300 Subject: [PATCH 328/341] use sentinel in model runner base WA --- vllm/worker/model_runner_base.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index 27919fdb75cd0..434250ce65348 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -4,7 +4,7 @@ from datetime import datetime from functools import wraps from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, - Optional, Type, TypeVar, Union, get_args, get_origin) + Optional, Type, TypeVar) import torch from torch import is_tensor @@ -46,14 +46,14 @@ def _init_attn_metadata_from_tensor_dict( # Extract the fields used to create AttentionMetadata. valid_attn_kwargs = {} for field in dataclasses.fields(attn_backend.get_metadata_cls()): - val = tensor_dict.pop(field.name, None) - # NOTE(kzawora): None is a valid value if type is optional. If + # NOTE(kzawora): We use sentinel here, as None + # may be a valid value if type is optional. If # we don't check against it, we will crash by not assigning # Optional types without default value, even if they are # broadcasted properly. 
- is_field_optional = get_origin(field.type) is Union and \ - type(None) in get_args(field.type) - if val is not None or (val is None and is_field_optional): + sentinel = object() + val = tensor_dict.pop(field.name, sentinel) + if val == sentinel: valid_attn_kwargs[field.name] = val attn_metadata = attn_backend.make_metadata(**valid_attn_kwargs) tensor_dict["attn_metadata"] = attn_metadata From b6428cdfac630252f91e7239b5b5f85b2e489ebb Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 16 Oct 2024 20:07:22 +0300 Subject: [PATCH 329/341] remove leftovers from habana_main --- vllm/engine/multiprocessing/engine.py | 10 ++++------ .../layers/quantization/utils/w8a8_utils.py | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 2092fcedf2c3f..2bf0ce83c7607 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -23,7 +23,6 @@ # yapf: enable from vllm.envs import VLLM_RPC_TIMEOUT from vllm.executor.gpu_executor import GPUExecutor -from vllm.executor.hpu_executor import HPUExecutor from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.usage.usage_lib import UsageContext @@ -33,6 +32,7 @@ logger = init_logger(__name__) +POLLING_TIMEOUT_MS = 10000 HEALTHY_RESPONSE = (pickle.dumps(VLLM_RPC_SUCCESS_STR), ) @@ -209,7 +209,7 @@ def run_engine_loop(self): self._alive() if not self.engine.has_unfinished_requests(): # Poll until there is work to do. - while self.input_socket.poll(timeout=VLLM_RPC_TIMEOUT) == 0: + while self.input_socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: self._alive() self.engine.do_log_stats() logger.debug("Waiting for new requests in engine loop.") @@ -368,15 +368,13 @@ def _alive(self): self._last_alive_time = time.time() def start_profile(self) -> None: - if type(self.engine.model_executor) is GPUExecutor or \ - type(self.engine.model_executor) is HPUExecutor: + if type(self.engine.model_executor) is GPUExecutor: self.engine.model_executor.start_profile() else: self.engine.model_executor._run_workers("start_profile") def stop_profile(self) -> None: - if type(self.engine.model_executor) is GPUExecutor or \ - type(self.engine.model_executor) is HPUExecutor: + if type(self.engine.model_executor) is GPUExecutor: self.engine.model_executor.stop_profile() else: self.engine.model_executor._run_workers("stop_profile") diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index eb8dceec0acf4..411af922149fd 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -121,7 +121,7 @@ def apply_fp8_linear( qinput, x_scale = ops.scaled_fp8_quant( input, input_scale, - batch_dim_padding=17, + num_token_padding=17, use_per_token_if_dynamic=use_per_token_if_dynamic) per_tensor_weights = (weight_scale.numel() == 1) From 51492786fc8cb1cdf47c70f66fcb38f6c2ddcbaf Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 16 Oct 2024 20:08:19 +0300 Subject: [PATCH 330/341] remove leftovers from habana_main --- vllm/model_executor/model_loader/loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 055926b9a0626..813f58339da37 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -98,8 +98,8 @@ def 
_get_quantization_config( """Get the quantization config.""" if model_config.quantization is not None: quant_config = get_quant_config(model_config, load_config) - capability_tuple = current_platform.get_device_capability() \ - if current_platform.is_cuda_alike() else None + capability_tuple = current_platform.get_device_capability() + if capability_tuple is not None: capability = capability_tuple.to_int() if capability < quant_config.get_min_capability(): From f4b356f044929c637abd4adfb6788def0ead5c77 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 16 Oct 2024 20:20:56 +0300 Subject: [PATCH 331/341] remove HPUExecutorAsync import --- vllm/engine/async_llm_engine.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index eebe7e6cef8ac..5242ce23f8493 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -17,7 +17,6 @@ from vllm.engine.metrics_types import StatLoggerBase from vllm.executor.executor_base import ExecutorAsyncBase from vllm.executor.gpu_executor import GPUExecutorAsync -from vllm.executor.hpu_executor import HPUExecutorAsync from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import PromptType, TokensPrompt from vllm.logger import init_logger @@ -1303,8 +1302,7 @@ def remove_logger(self, logger_name: str) -> None: async def start_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes - if type(self.engine.model_executor) == GPUExecutorAsync or \ - type(self.engine.model_executor) == HPUExecutorAsync: # noqa: E721 + if type(self.engine.model_executor) == GPUExecutorAsync: # noqa: E721 self.engine.model_executor.start_profile() else: self.engine.model_executor._run_workers("start_profile") @@ -1312,8 +1310,7 @@ async def start_profile(self) -> None: async def stop_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes - if type(self.engine.model_executor) == GPUExecutorAsync or \ - type(self.engine.model_executor) == HPUExecutorAsync: # noqa: E721 + if type(self.engine.model_executor) == GPUExecutorAsync: # noqa: E721 self.engine.model_executor.stop_profile() else: self.engine.model_executor._run_workers("stop_profile") From 3eee00d640b14debd32b73823b8f631f5a5b603a Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 16 Oct 2024 20:24:39 +0300 Subject: [PATCH 332/341] remove hpu fused_moe --- vllm/model_executor/layers/fused_moe/layer.py | 93 +++++++------------ 1 file changed, 36 insertions(+), 57 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 457450cda2ce6..bce740d0db750 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -12,9 +12,6 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.utils import set_weight_attrs -from vllm.platforms import current_platform - -is_hpu = current_platform.is_hpu() logger = init_logger(__name__) @@ -121,25 +118,6 @@ def forward_cuda( topk_ids=topk_ids, inplace=True) - def forward_hpu(self, - layer: torch.nn.Module, - x: torch.Tensor, - use_grouped_topk: bool, - top_k: int, - router_logits: torch.Tensor, - renormalize: bool, - topk_group: Optional[int] = None, - num_expert_group: Optional[int] = None, - custom_routing_function: Optional[Callable] = None): - assert not 
use_grouped_topk, 'use_grouped_topk must be False on HPU' - assert num_expert_group is None, ('num_expert_group is ' - 'not supported on HPU') - assert topk_group is None, 'topk_group is not supported on HPU' - if layer is not None: - return layer.hpu_static_fused_moe(x, layer.w13_weight, - layer.w2_weight, router_logits, - top_k) - def forward_cpu(self, *args, **kwargs): raise NotImplementedError( "The CPU backend currently does not support MoE.") @@ -173,7 +151,7 @@ def forward_tpu( class FusedMoE(torch.nn.Module): """FusedMoE layer for MoE models. - This layer contains both MergedColumnParallel weights (gate_up_proj / + This layer contains both MergedColumnParallel weights (gate_up_proj / w13) and RowParallelLinear weights (down_proj/ w2). Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We @@ -226,9 +204,6 @@ def __init__( self.num_expert_group = num_expert_group self.topk_group = topk_group self.custom_routing_function = custom_routing_function - if current_platform.is_hpu(): - from vllm_hpu_extension.ops import StaticFusedMOE - self.hpu_static_fused_moe = StaticFusedMOE(self.num_experts) if quant_config is None: self.quant_method: Optional[QuantizeMethodBase] = ( @@ -264,7 +239,7 @@ def _load_model_weight_or_group_weight_scale(self, shard_dim: int, expert_data: torch.Tensor, shard_id: str, loaded_weight: torch.tensor, - tp_rank: int, expert_id: int): + tp_rank: int): # Load grouped weight scales for group quantization # or model weights if shard_id == "w2": @@ -272,15 +247,13 @@ def _load_model_weight_or_group_weight_scale(self, shard_dim: int, shard_dim=shard_dim, loaded_weight=loaded_weight, expert_data=expert_data, - tp_rank=tp_rank, - expert_id=expert_id) + tp_rank=tp_rank) elif shard_id in ("w1", "w3"): self._load_w13(shard_id=shard_id, shard_dim=shard_dim, loaded_weight=loaded_weight, expert_data=expert_data, - tp_rank=tp_rank, - expert_id=expert_id) + tp_rank=tp_rank) def _load_per_channel_weight_scale(self, expert_data: torch.Tensor, shard_dim: int, shard_id: str, @@ -296,15 +269,9 @@ def _load_per_channel_weight_scale(self, expert_data: torch.Tensor, expert_data=expert_data, tp_rank=tp_rank) - def _load_w13(self, - expert_data: torch.Tensor, - shard_dim: int, - shard_id: str, - loaded_weight: torch.tensor, - tp_rank: int, - expert_id: Optional[int] = None): + def _load_w13(self, expert_data: torch.Tensor, shard_dim: int, + shard_id: str, loaded_weight: torch.tensor, tp_rank: int): - orig_exp_data = expert_data.view(expert_data.size()) # Index the loaded weight for tp sharding. # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim shard_size = expert_data.shape[shard_dim] // 2 @@ -320,17 +287,8 @@ def _load_w13(self, expert_data = expert_data.narrow(shard_dim, shard_size, shard_size) expert_data.copy_(loaded_weight) - if is_hpu: - self.hpu_static_fused_moe.w13_list[expert_id].set_weight( - orig_exp_data) - - def _load_w2(self, - expert_data: torch.Tensor, - shard_dim: int, - shard_id: str, - loaded_weight: torch.tensor, - tp_rank: int, - expert_id: Optional[int] = None): + def _load_w2(self, expert_data: torch.Tensor, shard_dim: int, + shard_id: str, loaded_weight: torch.tensor, tp_rank: int): # Index the loaded weight for tp sharding. # down_proj: "RowParallel" so tp sharding on input_dim @@ -340,9 +298,6 @@ def _load_w2(self, shard_size) # w2, down_proj: Load into only logical weight of w2. 
expert_data.copy_(loaded_weight) - if is_hpu: - self.hpu_static_fused_moe.w2_list[expert_id].set_weight( - expert_data) def _load_single_value(self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, expert_id: int): @@ -445,8 +400,7 @@ def weight_loader(self, param: torch.nn.Parameter, shard_dim=shard_dim, loaded_weight=loaded_weight, expert_data=expert_data, - tp_rank=tp_rank, - expert_id=expert_id) + tp_rank=tp_rank) elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value: self._load_per_tensor_weight_scale(shard_id=shard_id, param=param, @@ -472,8 +426,7 @@ def weight_loader(self, param: torch.nn.Parameter, shard_dim=shard_dim, loaded_weight=loaded_weight, expert_data=expert_data, - tp_rank=tp_rank, - expert_id=expert_id) + tp_rank=tp_rank) return @staticmethod @@ -552,3 +505,29 @@ def make_expert_params_mapping( ("w3", ckpt_up_proj_name), ] ] + + def _load_fp8_scale(self, param: torch.nn.Parameter, + loaded_weight: torch.Tensor, weight_name: str, + shard_id: str, expert_id: int) -> None: + param_data = param.data + + # Input scales can be loaded directly and should be equal. + if "input_scale" in weight_name: + if param_data[expert_id] != 1 and (param_data[expert_id] - + loaded_weight).abs() > 1e-5: + raise ValueError( + "input_scales of w1 and w3 of a layer " + f"must be equal. But got {param_data[expert_id]} " + f"vs. {loaded_weight}") + param_data[expert_id] = loaded_weight + # Weight scales + elif "weight_scale" in weight_name: + # If we are in merged column case (gate_up_proj) + if shard_id in ("w1", "w3"): + # We have to keep the weight scales of w1 and w3 because + # we need to re-quantize w1/w3 weights after weight loading. + idx = 0 if shard_id == "w1" else 1 + param_data[expert_id][idx] = loaded_weight + # If we are in the row parallel case (down_proj) + else: + param_data[expert_id] = loaded_weight From a59fc7b481b1807f27de1165383b6e10476850d2 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 16 Oct 2024 19:30:58 +0200 Subject: [PATCH 333/341] Remove HPU changes from cache_engine.py (#400) We were asked on upstream PR to remove our changes from cache_engine.py. This PR does just that, and creates HPUCacheEngine inheriting from CacheEngine, just overriding _allocate_kv_cache method. --- vllm/worker/cache_engine.py | 30 +++++++++--------------------- vllm/worker/hpu_worker.py | 35 +++++++++++++++++++++++++++++------ 2 files changed, 38 insertions(+), 27 deletions(-) diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 9618585c8acb0..090f95e6e892c 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -6,7 +6,7 @@ from vllm.attention import get_attn_backend from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_fake_hpu, +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_pin_memory_available) logger = init_logger(__name__) @@ -75,26 +75,14 @@ def _allocate_kv_cache( pin_memory = is_pin_memory_available() if device == "cpu" else False kv_cache: List[torch.Tensor] = [] for _ in range(self.num_attention_layers): - if device == 'hpu' or is_fake_hpu(): - key_cache = torch.zeros(kv_cache_shape, - dtype=self.dtype, - device=device) - value_cache = torch.zeros(kv_cache_shape, - dtype=self.dtype, - device=device) - kv_layer = (key_cache, value_cache) - kv_cache.append(kv_layer) - else: - # null block in CpuGpuBlockAllocator requires at least that - # block to be zeroed-out. 
- # We zero-out everything for simplicity. - dtype = torch.uint8 if self.dtype == torch.float8_e4m3fn else \ - self.dtype - kv_cache.append( - torch.zeros(kv_cache_shape, - dtype=dtype, - pin_memory=pin_memory, - device=device)) + # null block in CpuGpuBlockAllocator requires at least that + # block to be zeroed-out. + # We zero-out everything for simplicity. + kv_cache.append( + torch.zeros(kv_cache_shape, + dtype=self.dtype, + pin_memory=pin_memory, + device=device)) return kv_cache def swap_in(self, src_to_dst: torch.Tensor) -> None: diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 59a5adf65ebc1..752388e0d632f 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -93,7 +93,7 @@ def __init__( observability_config=observability_config) # Uninitialized cache engine. Will be initialized by # initialize_cache. - self.cache_engine: List[CacheEngine] + self.cache_engine: List[HPUCacheEngine] # Initialize gpu_cache as embedding models don't initialize kv_caches self.hpu_cache: Optional[List[List[torch.tensor]]] = None # Torch profiler. Enabled and configured through env vars: @@ -242,8 +242,8 @@ def initialize_cache(self, num_gpu_blocks: int, def _init_cache_engine(self): assert self.cache_config.num_gpu_blocks is not None self.cache_engine = [ - CacheEngine(self.cache_config, self.model_config, - self.parallel_config, self.device_config) + HPUCacheEngine(self.cache_config, self.model_config, + self.parallel_config, self.device_config) for _ in range(self.parallel_config.pipeline_parallel_size) ] self.hpu_cache = [ @@ -358,9 +358,9 @@ def vocab_size(self) -> int: def get_cache_block_size_bytes(self) -> int: """Get the size of the KV cache block size in bytes. """ - return CacheEngine.get_cache_block_size(self.cache_config, - self.model_config, - self.parallel_config) + return HPUCacheEngine.get_cache_block_size(self.cache_config, + self.model_config, + self.parallel_config) def init_worker_distributed_environment( @@ -423,3 +423,26 @@ def raise_if_cache_size_invalid(num_gpu_blocks, block_size, f"stored in KV cache ({max_seq_len}). 
Try increasing " "`gpu_memory_utilization` or decreasing `max_model_len` when " "initializing the engine.") + + +class HPUCacheEngine(CacheEngine): + + def _allocate_kv_cache( + self, + num_blocks: int, + device: str, + ) -> List[Tuple[torch.Tensor, torch.Tensor]]: + """Allocates KV cache on the specified device.""" + kv_cache_shape = self.attn_backend.get_kv_cache_shape( + num_blocks, self.block_size, self.num_kv_heads, self.head_size) + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]] = [] + for _ in range(self.num_attention_layers): + key_cache = torch.zeros(kv_cache_shape, + dtype=self.dtype, + device=device) + value_cache = torch.zeros(kv_cache_shape, + dtype=self.dtype, + device=device) + kv_layer = (key_cache, value_cache) + kv_cache.append(kv_layer) + return kv_cache From 8b6e30d61d8745bf0d4befc423d9817b1853b0d7 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 16 Oct 2024 20:52:38 +0300 Subject: [PATCH 334/341] remove hpuexecutor import --- vllm/engine/llm_engine.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index eb760405e5f2a..380110c1c5986 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -28,7 +28,6 @@ from vllm.entrypoints.openai.logits_processors import get_logits_processors from vllm.executor.executor_base import ExecutorBase from vllm.executor.gpu_executor import GPUExecutor -from vllm.executor.hpu_executor import HPUExecutor from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import (INPUT_REGISTRY, EncoderDecoderLLMInputs, InputRegistry, LLMInputs, PromptType) @@ -1776,8 +1775,7 @@ def check_health(self) -> None: def start_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes (MultiprocessingGPUExecutor) - if type(self.model_executor) == GPUExecutor or \ - type(self.model_executor) == HPUExecutor: # noqa: E721 + if type(self.model_executor) == GPUExecutor: # noqa: E721 self.model_executor.start_profile() else: self.model_executor._run_workers("start_profile") @@ -1785,8 +1783,7 @@ def start_profile(self) -> None: def stop_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes (MultiprocessingGPUExecutor) - if type(self.model_executor) == GPUExecutor or \ - type(self.model_executor) == HPUExecutor: # noqa: E721 + if type(self.model_executor) == GPUExecutor: # noqa: E721 self.model_executor.stop_profile() else: self.model_executor._run_workers("stop_profile") From 05bcdf5e169be9d746ff4c9d6163fff9f4b310b9 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 17 Oct 2024 12:18:10 +0200 Subject: [PATCH 335/341] [bucketing overhaul 1/n] Add padding-aware scheduling and option to limit prefill batch size (#394) This PR adds following functionality that can be enabled via engine flags: - use_padding_aware_scheduling - vLLM scheduler will now calculate token cost considering padded prefill shape (similar to https://github.com/HabanaAI/vllm-fork/pull/109). - max_num_prefill_seqs - padding-aware scheduler will perform an additional check for prefill batch size and will effectively limit prefill batch size at maximum of `max_num_prefill_seqs`. If unset, max prefill batch size will be `max_num_seqs`. Both features are generic and do not require HPU, although they may be specialized for particular vendor's usage. 
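In short, the budget check added here rounds both the prefill batch size and the longest prompt in the batch up to their bucket boundaries before comparing the result against the token budget, and it can optionally cap the prefill batch size. A minimal, self-contained sketch of that idea (not the exact PaddingAwareSchedulingBudget class introduced by this patch; round_up_to_step is a hypothetical stand-in for the HPU bucket lookup):

from typing import Optional


def round_up_to_step(value: int, step: int) -> int:
    # Hypothetical stand-in for the bucket lookup: round up to the nearest step.
    return ((value + step - 1) // step) * step


def padded_prefill_token_cost(batch_size: int, max_seq_len: int,
                              bs_step: Optional[int] = None,
                              seq_step: Optional[int] = None) -> int:
    # Without bucket info this is just batch_size * max_seq_len; with bucket
    # steps, both dimensions are rounded up first, which is what makes the
    # scheduler "padding-aware".
    padded_bs = round_up_to_step(batch_size, bs_step) if bs_step else batch_size
    padded_seq = round_up_to_step(max_seq_len, seq_step) if seq_step else max_seq_len
    return padded_bs * padded_seq


def can_schedule_prefill(num_curr_prefill_seqs: int, curr_max_seq_len: int,
                         num_new_seqs: int, new_max_seq_len: int,
                         token_budget: int,
                         max_num_prefill_seqs: Optional[int] = None,
                         bs_step: Optional[int] = None,
                         seq_step: Optional[int] = None) -> bool:
    # Padding-aware admission check for one more prefill sequence group.
    batch_size = num_curr_prefill_seqs + num_new_seqs
    max_seq_len = max(curr_max_seq_len, new_max_seq_len, 1)
    cost = padded_prefill_token_cost(batch_size, max_seq_len, bs_step, seq_step)
    if cost > token_budget:
        return False
    # Optional cap on the prefill batch size (max_num_prefill_seqs).
    return max_num_prefill_seqs is None or batch_size <= max_num_prefill_seqs


if __name__ == "__main__":
    # Three prefills of up to 384 tokens are running; admitting a fourth one of
    # 500 tokens pads to 4 x 512 = 2048 tokens, which still fits a 2048 budget.
    print(can_schedule_prefill(3, 384, 1, 500, token_budget=2048,
                               max_num_prefill_seqs=4, bs_step=4, seq_step=128))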
Padding aware scheduling includes padding function selector which selects HPU padding function (considering currently used HPU buckets) if current device is HPU. Otherwise, it will take a product of batch_size x max_seq_len. --- vllm/config.py | 18 ++++- vllm/core/scheduler.py | 122 ++++++++++++++++++++++++++-- vllm/engine/arg_utils.py | 19 ++++- vllm/worker/hpu_model_runner.py | 137 ++++++++++++++++++++------------ 4 files changed, 238 insertions(+), 58 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 5499b349bcfc8..67a4ec0761cc3 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -940,6 +940,9 @@ class SchedulerConfig: a single iteration. max_num_seqs: Maximum number of sequences to be processed in a single iteration. + max_num_prefill_seqs: Maximum number of prefill sequences to be + processed in a single iteration. Used only with padding-aware + scheduling. max_model_len: Maximum length of a sequence (including prompt and generated text). use_v2_block_manager: Whether to use the BlockSpaceManagerV2 or not. @@ -963,11 +966,14 @@ class SchedulerConfig: when SPMD worker architecture is enabled. I.e., VLLM_USE_RAY_SPMD_WORKER=1 policy: The scheduling policy to use. "fcfs" (default) or "priority". + use_padding_aware_scheduling: If True, scheduler will consider padded + tokens in prefill. """ def __init__(self, max_num_batched_tokens: Optional[int], max_num_seqs: int, + max_num_prefill_seqs: Optional[int], max_model_len: int, use_v2_block_manager: bool = True, num_lookahead_slots: int = 0, @@ -979,7 +985,8 @@ def __init__(self, num_scheduler_steps: int = 1, multi_step_stream_outputs: bool = False, send_delta_data: bool = False, - policy: str = "fcfs") -> None: + policy: str = "fcfs", + use_padding_aware_scheduling=False) -> None: if max_num_batched_tokens is None: if enable_chunked_prefill: if num_scheduler_steps > 1: @@ -1018,6 +1025,7 @@ def __init__(self, self.max_num_batched_tokens) self.max_num_seqs = max_num_seqs + self.max_num_prefill_seqs = max_num_prefill_seqs self.max_model_len = max_model_len self.use_v2_block_manager = use_v2_block_manager self.num_lookahead_slots = num_lookahead_slots @@ -1029,6 +1037,7 @@ def __init__(self, self.multi_step_stream_outputs = multi_step_stream_outputs self.send_delta_data = send_delta_data self.policy = policy + self.use_padding_aware_scheduling = use_padding_aware_scheduling self._verify_args() def _verify_args(self) -> None: @@ -1059,6 +1068,13 @@ def _verify_args(self) -> None: "num_scheduler_steps " f"({self.num_scheduler_steps}) must be greater than or " "equal to 1.") + if self.max_num_prefill_seqs is not None \ + and not self.use_padding_aware_scheduling: + raise ValueError("max_num_prefill_seqs can be only " + "used with padding-aware-scheduling. 
") + if self.use_padding_aware_scheduling and self.chunked_prefill_enabled: + raise ValueError("Padding-aware scheduling currently " + "does not work with chunked prefill ") if (not self.use_v2_block_manager \ and not envs.VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1): diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 1f0a121711db5..1c69c72933b79 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -11,6 +11,7 @@ from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.platforms import current_platform from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import (Sequence, SequenceData, SequenceGroup, SequenceGroupMetadata, SequenceGroupMetadataDelta, @@ -101,6 +102,94 @@ def num_curr_seqs(self): return self._num_curr_seqs +@dataclass +class PaddingAwareSchedulingBudget(SchedulingBudget): + max_num_prefill_seqs: Optional[int] = None + _prefill_request_ids_max_seq_lens: Dict[str, + int] = field(default_factory=dict) + _max_seq_len: int = 0 + _num_curr_prefill_seqs: int = 0 + + def _generic_padding_fn(self, batch_size, max_seq_len) -> int: + return batch_size * max_seq_len + + def _hpu_padding_fn(self, batch_size, max_seq_len): + from vllm.worker.hpu_model_runner import (HPUBucketingGlobalState, + find_bucket) + padded_bs = batch_size + padded_seq = max_seq_len + + hpu_bucketing_global_state = HPUBucketingGlobalState() + + bs_cfg = hpu_bucketing_global_state.prompt_bs_bucket_cfg + if bs_cfg is not None: + padded_bs = find_bucket(batch_size, bs_cfg) + else: + logger.warning( + "prompt_bs_bucket_cfg was not set! Using unpadded batch size.") + seq_cfg = hpu_bucketing_global_state.prompt_seq_bucket_cfg + if seq_cfg is not None: + padded_seq = find_bucket(max_seq_len, seq_cfg) + else: + logger.warning("prompt_seq_bucket_cfg was not set! 
" + "Using unpadded sequence length.") + return padded_bs * padded_seq + + def _padding_fn_selector(self): + if current_platform.is_hpu(): + return self._hpu_padding_fn + return self._generic_padding_fn + + def _maybe_update_max_seq_len(self, + new_seq_max_seq_len: Optional[int] = None): + if new_seq_max_seq_len is not None \ + and new_seq_max_seq_len > self._max_seq_len: + self._max_seq_len = new_seq_max_seq_len + return + self._max_seq_len = max( + self._prefill_request_ids_max_seq_lens.values()) + + def add_prefill_seqs(self, req_id, num_curr_prefill_seqs, max_seq_len): + self._prefill_request_ids_max_seq_lens[req_id] = max_seq_len + self._num_curr_prefill_seqs += num_curr_prefill_seqs + self._maybe_update_max_seq_len(max_seq_len) + + def subtract_prefill_seqs(self, req_id, num_curr_prefill_seqs): + if req_id in self._prefill_request_ids_max_seq_lens: + popped_seq_len = self._prefill_request_ids_max_seq_lens.pop(req_id) + self._num_curr_prefill_seqs -= num_curr_prefill_seqs + if popped_seq_len == self._max_seq_len: + self._maybe_update_max_seq_len() + + def can_schedule(self, + *args, + num_new_tokens: int, + num_new_seqs: int, + is_prefill: bool = False, + max_seq_len: int = 0): + can_parent_schedule = super().can_schedule( + *args, num_new_tokens=num_new_tokens, num_new_seqs=num_new_seqs) + if not can_parent_schedule or not is_prefill: + return can_parent_schedule + new_batch_size = self._num_curr_prefill_seqs + num_new_seqs + new_max_seq_len = max(max(self._max_seq_len, max_seq_len), 1) + padding_fn = self._padding_fn_selector() + num_new_padded_tokens = padding_fn(new_batch_size, new_max_seq_len) + result = num_new_padded_tokens <= self.token_budget + if self.max_num_prefill_seqs is not None and result: + result = self._num_curr_prefill_seqs + num_new_seqs \ + <= self.max_num_prefill_seqs + return result + + @property + def max_seq_len(self): + return self._max_seq_len + + @property + def num_curr_prefill_seqs(self): + return self._num_curr_prefill_seqs + + @dataclass class ScheduledSequenceGroup: # A sequence group that's scheduled. @@ -938,9 +1027,18 @@ def _schedule_prefills( continue num_new_seqs = seq_group.get_max_num_running_seqs() + max_prefill_seq_len = None + can_schedule_kwargs = { + 'num_new_tokens': num_new_tokens, + 'num_new_seqs': num_new_seqs + } + if self.scheduler_config.use_padding_aware_scheduling: + max_prefill_seq_len = max( + [seq.get_num_new_tokens() for seq in seq_group.get_seqs()]) + can_schedule_kwargs['is_prefill'] = True + can_schedule_kwargs['max_seq_len'] = max_prefill_seq_len if (num_new_tokens == 0 - or not budget.can_schedule(num_new_tokens=num_new_tokens, - num_new_seqs=num_new_seqs)): + or not budget.can_schedule(**can_schedule_kwargs)): break # Can schedule this request. @@ -971,6 +1069,10 @@ def _schedule_prefills( token_chunk_size=num_new_tokens)) budget.add_num_batched_tokens(seq_group.request_id, num_new_tokens) budget.add_num_seqs(seq_group.request_id, num_new_seqs) + if self.scheduler_config.use_padding_aware_scheduling: + assert isinstance(budget, PaddingAwareSchedulingBudget) + budget.add_prefill_seqs(seq_group.request_id, num_new_seqs, + max_prefill_seq_len) # Queue requests that couldn't be scheduled. waiting_queue.extendleft(leftover_waiting_sequences) @@ -992,10 +1094,18 @@ def _schedule_default(self) -> SchedulerOutputs: be swapped or preempted. """ # Include running requests to the budget. 
- budget = SchedulingBudget( - token_budget=self.scheduler_config.max_num_batched_tokens, - max_num_seqs=self.scheduler_config.max_num_seqs, - ) + budget: SchedulingBudget + if self.scheduler_config.use_padding_aware_scheduling: + budget = PaddingAwareSchedulingBudget( + token_budget=self.scheduler_config.max_num_batched_tokens, + max_num_seqs=self.scheduler_config.max_num_seqs, + max_num_prefill_seqs=self.scheduler_config.max_num_prefill_seqs + ) + else: + budget = SchedulingBudget( + token_budget=self.scheduler_config.max_num_batched_tokens, + max_num_seqs=self.scheduler_config.max_num_seqs, + ) # Make sure we include num running seqs before scheduling prefill, # so that we don't schedule beyond max_num_seqs for prefill. for seq_group in self.running: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 3c9f3d4fe4ab3..cdf1401816800 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -113,11 +113,13 @@ class EngineArgs: enable_prefix_caching: bool = False disable_sliding_window: bool = False use_v2_block_manager: bool = True + use_padding_aware_scheduling: bool = False swap_space: float = 4 # GiB cpu_offload_gb: float = 0 # GiB gpu_memory_utilization: float = 0.90 max_num_batched_tokens: Optional[int] = None max_num_seqs: int = 256 + max_num_prefill_seqs: Optional[int] = None max_logprobs: int = 20 # Default value for OpenAI Chat Completions API disable_log_stats: bool = False revision: Optional[str] = None @@ -391,6 +393,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: action='store_true', help='Use BlockSpaceMangerV2. By default this is set to True. ' 'Set to False to use BlockSpaceManagerV1') + parser.add_argument( + '--use-padding-aware-scheduling', + default=EngineArgs.use_padding_aware_scheduling, + action='store_true', + help=('Use padding-aware scheduling. If True, the scheduler ' + 'will consider padded tokens in prefill. ' + 'By default this is set to False. ')) parser.add_argument( '--num-lookahead-slots', type=int, @@ -445,6 +454,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: type=int, default=EngineArgs.max_num_seqs, help='Maximum number of sequences per iteration.') + parser.add_argument( + '--max-num-prefill-seqs', + type=int, + default=EngineArgs.max_num_prefill_seqs, + help=('Maximum number of prefill sequences per ' + 'iteration. Can be used only with padding-aware ' + 'scheduling. 
Must be <= max_num_seqs.')) parser.add_argument( '--max-logprobs', type=int, @@ -1036,6 +1052,7 @@ def create_engine_config(self) -> EngineConfig: scheduler_config = SchedulerConfig( max_num_batched_tokens=self.max_num_batched_tokens, max_num_seqs=self.max_num_seqs, + max_num_prefill_seqs=self.max_num_prefill_seqs, max_model_len=model_config.max_model_len, use_v2_block_manager=self.use_v2_block_manager, num_lookahead_slots=num_lookahead_slots, @@ -1049,7 +1066,7 @@ def create_engine_config(self) -> EngineConfig: send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER and parallel_config.use_ray), policy=self.scheduling_policy, - ) + use_padding_aware_scheduling=self.use_padding_aware_scheduling) lora_config = LoRAConfig( max_lora_rank=self.max_lora_rank, max_loras=self.max_loras, diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index d8150a56844a2..785337478468f 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -13,6 +13,7 @@ import os import time from array import array +from dataclasses import dataclass, field from enum import IntEnum from typing import (TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, Optional, Set, Tuple, Type, TypeVar, Union) @@ -64,6 +65,26 @@ LORA_WARMUP_RANK = 8 +class Singleton(type): + _instances: Dict[type, object] = {} + + def __call__(cls, *args, **kwargs): + if cls not in cls._instances: + cls._instances[cls] = super(Singleton, + cls).__call__(*args, **kwargs) + return cls._instances[cls] + + +@dataclass +class HPUBucketingGlobalState(metaclass=Singleton): + prompt_bs_bucket_cfg: Tuple[int, int, int] = field(init=False) + decode_bs_bucket_cfg: Tuple[int, int, int] = field(init=False) + prompt_seq_bucket_cfg: Tuple[int, int, int] = field(init=False) + decode_block_bucket_cfg: Tuple[int, int, int] = field(init=False) + prompt_buckets: List[Tuple[int, int]] = field(init=False) + decode_buckets: List[Tuple[int, int]] = field(init=False) + + def subtuple(obj: object, typename: str, to_copy: List[str], @@ -542,6 +563,9 @@ def __init__( self.device = self.device_config.device self.enforce_eager = self.model_config.enforce_eager self.max_num_seqs = self.scheduler_config.max_num_seqs + self.max_num_prefill_seqs = self.scheduler_config.max_num_prefill_seqs \ + if self.scheduler_config.max_num_prefill_seqs is not None \ + else self.max_num_seqs self.max_model_len = self.scheduler_config.max_model_len self.max_num_batched_tokens = \ self.scheduler_config.max_num_batched_tokens @@ -569,6 +593,7 @@ def __init__( self.profiler_counter_helper = HabanaProfilerCounterHelper() self.seen_configs: set = set() self._mem_margin: Optional[int] = None + self.bucketing_global_state = HPUBucketingGlobalState() self._setup_buckets() self._set_gc_threshold() @@ -680,27 +705,26 @@ def _is_valid_bucket(self, bucket): def _setup_buckets(self) -> None: align_bs = lambda x: min(self.max_num_seqs, x) - max_bucket_cfg = 64 #FIXME: The default values should be max_model_len max_prompt_seq = 1024 max_decode_seq = 2048 - self.prompt_bs_bucket_cfg = read_bucket_settings( + self.bucketing_global_state.prompt_bs_bucket_cfg = read_bucket_settings( 'prompt', 'bs', min=1, step=align_bs(32), - max=align_bs(max_bucket_cfg)) - self.decode_bs_bucket_cfg = read_bucket_settings('decode', - 'bs', - min=1, - step=align_bs(32), - max=self.max_num_seqs) - self.prompt_seq_bucket_cfg = read_bucket_settings('prompt', - 'seq', - min=self.block_size, - step=self.block_size, - max=max_prompt_seq) - self.decode_block_bucket_cfg = read_bucket_settings( + 
max=self.max_num_prefill_seqs) + self.bucketing_global_state.decode_bs_bucket_cfg = read_bucket_settings( + 'decode', 'bs', min=1, step=align_bs(32), max=self.max_num_seqs) + self.bucketing_global_state.prompt_seq_bucket_cfg = \ + read_bucket_settings( + 'prompt', + 'seq', + min=self.block_size, + step=self.block_size, + max=max_prompt_seq) + self.bucketing_global_state.decode_block_bucket_cfg = \ + read_bucket_settings( 'decode', 'block', min=self.block_size, @@ -710,13 +734,13 @@ def _setup_buckets(self) -> None: self.graphed_buckets: Set[Any] = set() msg = ("Prompt bucket config (min, step, max_warmup) " - f"bs:{self.prompt_bs_bucket_cfg}, " - f"seq:{self.prompt_seq_bucket_cfg}") + f"bs:{self.bucketing_global_state.prompt_bs_bucket_cfg}, " + f"seq:{self.bucketing_global_state.prompt_seq_bucket_cfg}") logger.info(msg) msg = ("Decode bucket config (min, step, max_warmup) " - f"bs:{self.decode_bs_bucket_cfg}, " - f"block:{self.decode_block_bucket_cfg}") + f"bs:{self.bucketing_global_state.decode_bs_bucket_cfg}, " + f"block:{self.bucketing_global_state.decode_block_bucket_cfg}") logger.info(msg) def _prepare_prompt( @@ -834,7 +858,8 @@ def _prepare_prompt( assert max_query_len > 0 max_prompt_len = max( - find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), + find_bucket(max(seq_lens), + self.bucketing_global_state.prompt_seq_bucket_cfg), self.block_size) lora_ids: List[int] = [] @@ -1001,8 +1026,9 @@ def _prepare_decode( for b_u, lb in zip(blocks_used, last_block)] block_usage = list(itertools.chain(*block_usage)) - block_bucket_size = find_bucket(len(block_list), - self.decode_block_bucket_cfg) + block_bucket_size = find_bucket( + len(block_list), + self.bucketing_global_state.decode_block_bucket_cfg) block_list = pad_list(block_list, block_bucket_size, _PAD_BLOCK_ID) block_mapping = pad_list(block_mapping, block_bucket_size, -1) block_usage = pad_list(block_usage, block_bucket_size, 1) @@ -1076,8 +1102,8 @@ def prepare_input_tensors( self.profiler.start('internal', base_event_name) real_batch_size = len(seq_group_metadata_list) - bucket_cfg = self.prompt_bs_bucket_cfg if is_prompt else \ - self.decode_bs_bucket_cfg + bucket_cfg = self.bucketing_global_state.prompt_bs_bucket_cfg \ + if is_prompt else self.bucketing_global_state.decode_bs_bucket_cfg batch_size_padded = find_bucket(real_batch_size, bucket_cfg) batch_size_padding = batch_size_padded - real_batch_size seq_group_metadata_list = seq_group_metadata_list.copy() @@ -1282,9 +1308,10 @@ def create_dummy_seq_group_metadata(self, def profile_run(self) -> None: num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers - max_batch_size = self.prompt_bs_bucket_cfg[-1] - max_seq_len = min(self.prompt_seq_bucket_cfg[-1], - self.max_num_batched_tokens // max_batch_size) + max_batch_size = self.bucketing_global_state.prompt_bs_bucket_cfg[-1] + max_seq_len = min( + self.bucketing_global_state.prompt_seq_bucket_cfg[-1], + self.max_num_batched_tokens // max_batch_size) self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches, False, True) @@ -1498,13 +1525,15 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: self.profiler.start('internal', 'warmup') max_blocks = kv_caches[0][0].size(0) - self.prompt_buckets, prompt_omitted_buckets = generate_prompt_buckets( - self.prompt_bs_bucket_cfg, self.prompt_seq_bucket_cfg, + self.bucketing_global_state.prompt_buckets, prompt_omitted_buckets = \ + generate_prompt_buckets( + self.bucketing_global_state.prompt_bs_bucket_cfg, + 
self.bucketing_global_state.prompt_seq_bucket_cfg, self.max_num_batched_tokens) - msg = ( - f"Generated {len(self.prompt_buckets)} " - f"prompt buckets [bs, seq]: {list(sorted(self.prompt_buckets))}") + msg = (f"Generated {len(self.bucketing_global_state.prompt_buckets)} " + f"prompt buckets [bs, seq]: \ + {list(sorted(self.bucketing_global_state.prompt_buckets))}") logger.info(msg) msg = (f"Omitted {len(prompt_omitted_buckets)} " @@ -1515,16 +1544,17 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: msg = f"Omitted prompt buckets: {list(sorted(prompt_omitted_buckets))}" logger.debug(msg) - self.decode_buckets = generate_decode_buckets( - self.decode_bs_bucket_cfg, self.decode_block_bucket_cfg, - max_blocks) + self.bucketing_global_state.decode_buckets = generate_decode_buckets( + self.bucketing_global_state.decode_bs_bucket_cfg, + self.bucketing_global_state.decode_block_bucket_cfg, max_blocks) logger.info("Generated %d decode buckets [bs, total_blocks]: %s", - len(self.decode_buckets), - list(sorted(self.decode_buckets))) + len(self.bucketing_global_state.decode_buckets), + list(sorted(self.bucketing_global_state.decode_buckets))) if not htorch.utils.internal.is_lazy() and not self.enforce_eager: - cache_size_limit = len(self.prompt_buckets) + len( - self.decode_buckets) + 1 + cache_size_limit = len( + self.bucketing_global_state.prompt_buckets) + len( + self.bucketing_global_state.decode_buckets) + 1 torch._dynamo.config.cache_size_limit = max( cache_size_limit, torch._dynamo.config.cache_size_limit) # Multiply by 8 to follow the original default ratio between @@ -1551,8 +1581,10 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: 'Please update Gaudi Software Suite.') with compile_only_mode_context( ) if can_use_compile_only_mode else contextlib.nullcontext(): - self.warmup_all_buckets(self.prompt_buckets, True, kv_caches) - self.warmup_all_buckets(self.decode_buckets, False, kv_caches) + self.warmup_all_buckets(self.bucketing_global_state.prompt_buckets, + True, kv_caches) + self.warmup_all_buckets(self.bucketing_global_state.decode_buckets, + False, kv_caches) if not self.enforce_eager and htorch.utils.internal.is_lazy(): assert self.mem_margin is not None, \ @@ -1582,12 +1614,12 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: 'max_bs') mem_post_prompt, prompt_batch_seq, prompt_captured_all = \ self.warmup_graphs( - prompt_strategy, self.prompt_buckets, True, kv_caches, - prompt_available_memory) + prompt_strategy, self.bucketing_global_state.prompt_buckets, + True, kv_caches, prompt_available_memory) mem_post_decode, decode_batch_seq, decode_captured_all = \ self.warmup_graphs( - decode_strategy, self.decode_buckets, False, kv_caches, - decode_available_memory) + decode_strategy, self.bucketing_global_state.decode_buckets, + False, kv_caches, decode_available_memory) # Not all prompt buckets were captured, but all decode buckets # were captured and we have some free graph-allocated space @@ -1596,7 +1628,8 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: and not prompt_captured_all and decode_captured_all): mem_post_prompt, _, prompt_captured_all = ( self.warmup_graphs( - prompt_strategy, self.prompt_buckets, True, + prompt_strategy, + self.bucketing_global_state.prompt_buckets, True, kv_caches, graph_free_mem - mem_post_prompt - mem_post_decode, mem_post_prompt, prompt_batch_seq)) @@ -1608,14 +1641,18 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: and not decode_captured_all \ and 
prompt_captured_all: mem_post_decode, _, _ = self.warmup_graphs( - decode_strategy, self.decode_buckets, False, kv_caches, + decode_strategy, + self.bucketing_global_state.decode_buckets, False, + kv_caches, graph_free_mem - mem_post_prompt - mem_post_decode, mem_post_decode, decode_batch_seq) - self.log_graph_warmup_summary(self.prompt_buckets, True, - mem_post_prompt) - self.log_graph_warmup_summary(self.decode_buckets, False, - mem_post_decode) + self.log_graph_warmup_summary( + self.bucketing_global_state.prompt_buckets, True, + mem_post_prompt) + self.log_graph_warmup_summary( + self.bucketing_global_state.decode_buckets, False, + mem_post_decode) end_time = time.perf_counter() end_mem = HabanaMemoryProfiler.current_device_memory_usage() From c11f23ab5dcfc2180039bd336db0fc38635303ad Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 17 Oct 2024 15:54:38 +0300 Subject: [PATCH 336/341] Add forward_hpu to RotaryEmbedding, remove custom module --- tests/kernels/test_pos_encoding.py | 9 +- .../model_executor/layers/rotary_embedding.py | 94 +++++++++++++------ 2 files changed, 71 insertions(+), 32 deletions(-) diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index ba9d2d4389b21..a9cb17b6fd6b3 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -5,6 +5,7 @@ import torch from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.platforms import current_platform from vllm.utils import seed_everything from .allclose_default import get_default_atol, get_default_rtol @@ -20,7 +21,9 @@ CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] - +if current_platform.is_hpu(): + import habana_frameworks.torch as htorch + CUDA_DEVICES = ['hpu'] @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @pytest.mark.parametrize("batch_size", BATCH_SIZES) @@ -65,6 +68,8 @@ def test_rotary_embedding( # NOTE(woosuk): The reference implementation should be executed first # because the custom kernel is in-place. ref_query, ref_key = rope.forward_native(positions, query, key) + if current_platform.is_hpu(): + htorch.core.mark_step() out_query, out_key = rope.forward(positions, query, key) # Compare the results. torch.testing.assert_close(out_query, @@ -193,6 +198,8 @@ def test_batched_rotary_embedding_multi_lora( # because the custom kernel is in-place. ref_query, ref_key = rope.forward_native(positions, query, key, query_offsets) + if current_platform.is_hpu(): + htorch.core.mark_step() out_query, out_key = rope.forward(positions, query, key, query_offsets.flatten()) # Compare the results. 
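As context for the forward_hpu implementation added below: the HPU RoPE kernel consumes cos/sin tensors expanded to the full rotary dimension, and the expansion differs between the GPT-NeoX (BLOCKWISE) and GPT-J (PAIRWISE) layouts. A toy sketch in plain torch showing the two layouts (illustrative only, no Habana imports, not the patch's code):

import torch

rotary_dim = 8
# The cache stores cos/sin at half the rotary dimension: [c0, c1, c2, c3].
cos = torch.arange(rotary_dim // 2, dtype=torch.float32)

# GPT-NeoX style (BLOCKWISE mode): concatenate the two halves.
neox_cos = torch.cat((cos, cos), dim=-1)            # [c0, c1, c2, c3, c0, c1, c2, c3]

# GPT-J style (PAIRWISE mode): repeat each element pairwise.
gptj_cos = torch.repeat_interleave(cos, 2, dim=-1)  # [c0, c0, c1, c1, c2, c2, c3, c3]

print(neox_cos.tolist())
print(gptj_cos.tolist())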
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 85cd700c978ea..10626d53338e3 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -28,7 +28,6 @@ import torch.nn as nn from vllm.model_executor.custom_op import CustomOp -from vllm.platforms import current_platform def _rotate_neox(x: torch.Tensor) -> torch.Tensor: @@ -195,6 +194,61 @@ def forward_xpu( self.cos_sin_cache, self.is_neox_style) return query, key + def forward_hpu( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + from habana_frameworks.torch.hpex.kernels import ( + RotaryPosEmbeddingMode, apply_rotary_pos_emb) + positions = positions.flatten() + if offsets is not None: + positions = positions + offsets + num_tokens = positions.shape[0] + cos_sin = self.cos_sin_cache.index_select(0, positions).view( + num_tokens, 1, -1) + cos, sin = cos_sin.chunk(2, dim=-1) + # HPU RoPE kernel requires hidden dimension for cos and sin to be equal + # to query hidden dimension, so the original tensors need to be + # expanded + # GPT-NeoX kernel requires position_ids = None, offset, mode = BLOCKWISE + # and expansion of cos/sin tensors via concatenation + # GPT-J kernel requires position_ids = None, offset = 0, mode = PAIRWISE + # and expansion of cos/sin tensors via repeat_interleave + rope_mode: RotaryPosEmbeddingMode + if self.is_neox_style: + rope_mode = RotaryPosEmbeddingMode.BLOCKWISE + cos = torch.cat((cos, cos), dim=-1) + sin = torch.cat((sin, sin), dim=-1) + else: + rope_mode = RotaryPosEmbeddingMode.PAIRWISE + sin = torch.repeat_interleave(sin, + 2, + dim=-1, + output_size=cos_sin.shape[-1]) + cos = torch.repeat_interleave(cos, + 2, + dim=-1, + output_size=cos_sin.shape[-1]) + + query_shape = query.shape + query = query.view(num_tokens, -1, self.head_size) + query_rot = query[..., :self.rotary_dim] + query_pass = query[..., self.rotary_dim:] + query_rot = apply_rotary_pos_emb(query_rot, cos, sin, None, 0, + rope_mode) + query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + + key_shape = key.shape + key = key.view(num_tokens, -1, self.head_size) + key_rot = key[..., :self.rotary_dim] + key_pass = key[..., self.rotary_dim:] + key_rot = apply_rotary_pos_emb(key_rot, cos, sin, None, 0, rope_mode) + key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + return query, key + def extra_repr(self) -> str: s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}" s += f", max_position_embeddings={self.max_position_embeddings}" @@ -918,17 +972,8 @@ def get_rope( return _ROPE_DICT[key] if rope_scaling is None: - if current_platform.is_hpu(): - from vllm_hpu_extension.rotary_embed import HpuRotaryEmbedding - rotary_emb = HpuRotaryEmbedding(head_size, - rotary_dim, - max_position, - base, - is_neox_style, - RoPEFallback=RotaryEmbedding) - else: - rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, - base, is_neox_style, dtype) + rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base, + is_neox_style, dtype) else: scaling_type = rope_scaling[ "type"] if "type" in rope_scaling else rope_scaling["rope_type"] @@ -941,25 +986,12 @@ def get_rope( high_freq_factor = rope_scaling["high_freq_factor"] original_max_position = rope_scaling[ "original_max_position_embeddings"] - if current_platform.is_hpu(): - from vllm_hpu_extension.rotary_embed import ( - 
HpuLlama3RotaryEmbedding) - rotary_emb = HpuLlama3RotaryEmbedding( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - scaling_factor, - low_freq_factor, - high_freq_factor, - original_max_position, - RoPEFallback=Llama3RotaryEmbedding) - else: - rotary_emb = Llama3RotaryEmbedding( - head_size, rotary_dim, max_position, base, is_neox_style, - dtype, scaling_factor, low_freq_factor, high_freq_factor, - original_max_position) + rotary_emb = Llama3RotaryEmbedding(head_size, rotary_dim, + max_position, base, + is_neox_style, dtype, + scaling_factor, low_freq_factor, + high_freq_factor, + original_max_position) elif scaling_type == "linear": rotary_emb = LinearScalingRotaryEmbedding(head_size, rotary_dim, max_position, base, From 78a816cbe395fa5e1303ce4d1ed5d975d3b4ec67 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 17 Oct 2024 15:57:54 +0300 Subject: [PATCH 337/341] add missing mark step in test --- tests/kernels/test_pos_encoding.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index a9cb17b6fd6b3..6ca3a645c7771 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -22,9 +22,10 @@ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] if current_platform.is_hpu(): - import habana_frameworks.torch as htorch + import habana_frameworks.torch as htorch CUDA_DEVICES = ['hpu'] + @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @pytest.mark.parametrize("batch_size", BATCH_SIZES) @pytest.mark.parametrize("seq_len", SEQ_LENS) @@ -125,6 +126,8 @@ def test_batched_rotary_embedding( # NOTE(woosuk): The reference implementation should be executed first # because the custom kernel is in-place. ref_query, ref_key = rope.forward_native(positions, query, key) + if current_platform.is_hpu(): + htorch.core.mark_step() out_query, out_key = rope.forward(positions, query, key, From 5bc39853a066541248f0ab883c926c5f5d7b1bdd Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 17 Oct 2024 16:59:19 +0300 Subject: [PATCH 338/341] cleanup --- tests/kernels/test_pos_encoding.py | 10 -- vllm/config.py | 18 +-- vllm/core/scheduler.py | 129 ++---------------- vllm/engine/arg_utils.py | 20 +-- .../model_executor/layers/rotary_embedding.py | 55 ++++++++ 5 files changed, 67 insertions(+), 165 deletions(-) diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 8479361a47346..94da00915d40e 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -5,7 +5,6 @@ import torch from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.platforms import current_platform from vllm.utils import seed_everything from .allclose_default import get_default_atol, get_default_rtol @@ -21,9 +20,6 @@ CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] -if current_platform.is_hpu(): - import habana_frameworks.torch as htorch - CUDA_DEVICES = ['hpu'] @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @@ -69,8 +65,6 @@ def test_rotary_embedding( # NOTE(woosuk): The reference implementation should be executed first # because the custom kernel is in-place. ref_query, ref_key = rope.forward_native(positions, query, key) - if current_platform.is_hpu(): - htorch.core.mark_step() out_query, out_key = rope.forward(positions, query, key) # Compare the results. 
torch.testing.assert_close(out_query, @@ -126,8 +120,6 @@ def test_batched_rotary_embedding( # NOTE(woosuk): The reference implementation should be executed first # because the custom kernel is in-place. ref_query, ref_key = rope.forward_native(positions, query, key) - if current_platform.is_hpu(): - htorch.core.mark_step() out_query, out_key = rope.forward(positions, query, key, @@ -201,8 +193,6 @@ def test_batched_rotary_embedding_multi_lora( # because the custom kernel is in-place. ref_query, ref_key = rope.forward_native(positions, query, key, query_offsets) - if current_platform.is_hpu(): - htorch.core.mark_step() out_query, out_key = rope.forward(positions, query, key, query_offsets.flatten()) # Compare the results. diff --git a/vllm/config.py b/vllm/config.py index f2f0e089beb64..0de33f3ec7b2e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -952,9 +952,6 @@ class SchedulerConfig: a single iteration. max_num_seqs: Maximum number of sequences to be processed in a single iteration. - max_num_prefill_seqs: Maximum number of prefill sequences to be - processed in a single iteration. Used only with padding-aware - scheduling. max_model_len: Maximum length of a sequence (including prompt and generated text). use_v2_block_manager: Whether to use the BlockSpaceManagerV2 or not. @@ -978,14 +975,11 @@ class SchedulerConfig: when SPMD worker architecture is enabled. I.e., VLLM_USE_RAY_SPMD_WORKER=1 policy: The scheduling policy to use. "fcfs" (default) or "priority". - use_padding_aware_scheduling: If True, scheduler will consider padded - tokens in prefill. """ def __init__(self, max_num_batched_tokens: Optional[int], max_num_seqs: int, - max_num_prefill_seqs: Optional[int], max_model_len: int, use_v2_block_manager: bool = True, num_lookahead_slots: int = 0, @@ -997,8 +991,7 @@ def __init__(self, num_scheduler_steps: int = 1, multi_step_stream_outputs: bool = False, send_delta_data: bool = False, - policy: str = "fcfs", - use_padding_aware_scheduling=False) -> None: + policy: str = "fcfs") -> None: if max_num_batched_tokens is None: if enable_chunked_prefill: if num_scheduler_steps > 1: @@ -1037,7 +1030,6 @@ def __init__(self, self.max_num_batched_tokens) self.max_num_seqs = max_num_seqs - self.max_num_prefill_seqs = max_num_prefill_seqs self.max_model_len = max_model_len self.use_v2_block_manager = use_v2_block_manager self.num_lookahead_slots = num_lookahead_slots @@ -1049,7 +1041,6 @@ def __init__(self, self.multi_step_stream_outputs = multi_step_stream_outputs self.send_delta_data = send_delta_data self.policy = policy - self.use_padding_aware_scheduling = use_padding_aware_scheduling self._verify_args() def _verify_args(self) -> None: @@ -1080,13 +1071,6 @@ def _verify_args(self) -> None: "num_scheduler_steps " f"({self.num_scheduler_steps}) must be greater than or " "equal to 1.") - if self.max_num_prefill_seqs is not None \ - and not self.use_padding_aware_scheduling: - raise ValueError("max_num_prefill_seqs can be only " - "used with padding-aware-scheduling. 
") - if self.use_padding_aware_scheduling and self.chunked_prefill_enabled: - raise ValueError("Padding-aware scheduling currently " - "does not work with chunked prefill ") if (not self.use_v2_block_manager \ and not envs.VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1): diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 1c69c72933b79..e7eaaf12272d6 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -4,14 +4,14 @@ import time from collections import deque from dataclasses import dataclass, field -from typing import (Callable, Deque, Dict, Iterable, List, Optional, Set, - Tuple, Union) +from typing import Callable, Deque, Dict, Iterable, List, Optional +from typing import Sequence as GenericSequence +from typing import Set, Tuple, Union from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.platforms import current_platform from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import (Sequence, SequenceData, SequenceGroup, SequenceGroupMetadata, SequenceGroupMetadataDelta, @@ -102,94 +102,6 @@ def num_curr_seqs(self): return self._num_curr_seqs -@dataclass -class PaddingAwareSchedulingBudget(SchedulingBudget): - max_num_prefill_seqs: Optional[int] = None - _prefill_request_ids_max_seq_lens: Dict[str, - int] = field(default_factory=dict) - _max_seq_len: int = 0 - _num_curr_prefill_seqs: int = 0 - - def _generic_padding_fn(self, batch_size, max_seq_len) -> int: - return batch_size * max_seq_len - - def _hpu_padding_fn(self, batch_size, max_seq_len): - from vllm.worker.hpu_model_runner import (HPUBucketingGlobalState, - find_bucket) - padded_bs = batch_size - padded_seq = max_seq_len - - hpu_bucketing_global_state = HPUBucketingGlobalState() - - bs_cfg = hpu_bucketing_global_state.prompt_bs_bucket_cfg - if bs_cfg is not None: - padded_bs = find_bucket(batch_size, bs_cfg) - else: - logger.warning( - "prompt_bs_bucket_cfg was not set! Using unpadded batch size.") - seq_cfg = hpu_bucketing_global_state.prompt_seq_bucket_cfg - if seq_cfg is not None: - padded_seq = find_bucket(max_seq_len, seq_cfg) - else: - logger.warning("prompt_seq_bucket_cfg was not set! 
" - "Using unpadded sequence length.") - return padded_bs * padded_seq - - def _padding_fn_selector(self): - if current_platform.is_hpu(): - return self._hpu_padding_fn - return self._generic_padding_fn - - def _maybe_update_max_seq_len(self, - new_seq_max_seq_len: Optional[int] = None): - if new_seq_max_seq_len is not None \ - and new_seq_max_seq_len > self._max_seq_len: - self._max_seq_len = new_seq_max_seq_len - return - self._max_seq_len = max( - self._prefill_request_ids_max_seq_lens.values()) - - def add_prefill_seqs(self, req_id, num_curr_prefill_seqs, max_seq_len): - self._prefill_request_ids_max_seq_lens[req_id] = max_seq_len - self._num_curr_prefill_seqs += num_curr_prefill_seqs - self._maybe_update_max_seq_len(max_seq_len) - - def subtract_prefill_seqs(self, req_id, num_curr_prefill_seqs): - if req_id in self._prefill_request_ids_max_seq_lens: - popped_seq_len = self._prefill_request_ids_max_seq_lens.pop(req_id) - self._num_curr_prefill_seqs -= num_curr_prefill_seqs - if popped_seq_len == self._max_seq_len: - self._maybe_update_max_seq_len() - - def can_schedule(self, - *args, - num_new_tokens: int, - num_new_seqs: int, - is_prefill: bool = False, - max_seq_len: int = 0): - can_parent_schedule = super().can_schedule( - *args, num_new_tokens=num_new_tokens, num_new_seqs=num_new_seqs) - if not can_parent_schedule or not is_prefill: - return can_parent_schedule - new_batch_size = self._num_curr_prefill_seqs + num_new_seqs - new_max_seq_len = max(max(self._max_seq_len, max_seq_len), 1) - padding_fn = self._padding_fn_selector() - num_new_padded_tokens = padding_fn(new_batch_size, new_max_seq_len) - result = num_new_padded_tokens <= self.token_budget - if self.max_num_prefill_seqs is not None and result: - result = self._num_curr_prefill_seqs + num_new_seqs \ - <= self.max_num_prefill_seqs - return result - - @property - def max_seq_len(self): - return self._max_seq_len - - @property - def num_curr_prefill_seqs(self): - return self._num_curr_prefill_seqs - - @dataclass class ScheduledSequenceGroup: # A sequence group that's scheduled. @@ -204,7 +116,7 @@ class ScheduledSequenceGroup: class SchedulerOutputs: """The scheduling decision made from a scheduler.""" # Scheduled sequence groups. - scheduled_seq_groups: Iterable[ScheduledSequenceGroup] + scheduled_seq_groups: GenericSequence[ScheduledSequenceGroup] # Number of prefill groups scheduled. num_prefill_groups: int # Total number of batched tokens. @@ -1027,18 +939,9 @@ def _schedule_prefills( continue num_new_seqs = seq_group.get_max_num_running_seqs() - max_prefill_seq_len = None - can_schedule_kwargs = { - 'num_new_tokens': num_new_tokens, - 'num_new_seqs': num_new_seqs - } - if self.scheduler_config.use_padding_aware_scheduling: - max_prefill_seq_len = max( - [seq.get_num_new_tokens() for seq in seq_group.get_seqs()]) - can_schedule_kwargs['is_prefill'] = True - can_schedule_kwargs['max_seq_len'] = max_prefill_seq_len if (num_new_tokens == 0 - or not budget.can_schedule(**can_schedule_kwargs)): + or not budget.can_schedule(num_new_tokens=num_new_tokens, + num_new_seqs=num_new_seqs)): break # Can schedule this request. 
@@ -1069,10 +972,6 @@ def _schedule_prefills( token_chunk_size=num_new_tokens)) budget.add_num_batched_tokens(seq_group.request_id, num_new_tokens) budget.add_num_seqs(seq_group.request_id, num_new_seqs) - if self.scheduler_config.use_padding_aware_scheduling: - assert isinstance(budget, PaddingAwareSchedulingBudget) - budget.add_prefill_seqs(seq_group.request_id, num_new_seqs, - max_prefill_seq_len) # Queue requests that couldn't be scheduled. waiting_queue.extendleft(leftover_waiting_sequences) @@ -1094,18 +993,10 @@ def _schedule_default(self) -> SchedulerOutputs: be swapped or preempted. """ # Include running requests to the budget. - budget: SchedulingBudget - if self.scheduler_config.use_padding_aware_scheduling: - budget = PaddingAwareSchedulingBudget( - token_budget=self.scheduler_config.max_num_batched_tokens, - max_num_seqs=self.scheduler_config.max_num_seqs, - max_num_prefill_seqs=self.scheduler_config.max_num_prefill_seqs - ) - else: - budget = SchedulingBudget( - token_budget=self.scheduler_config.max_num_batched_tokens, - max_num_seqs=self.scheduler_config.max_num_seqs, - ) + budget = SchedulingBudget( + token_budget=self.scheduler_config.max_num_batched_tokens, + max_num_seqs=self.scheduler_config.max_num_seqs, + ) # Make sure we include num running seqs before scheduling prefill, # so that we don't schedule beyond max_num_seqs for prefill. for seq_group in self.running: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 96aef9ff49946..bc97bb00e8525 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -112,13 +112,11 @@ class EngineArgs: enable_prefix_caching: bool = False disable_sliding_window: bool = False use_v2_block_manager: bool = True - use_padding_aware_scheduling: bool = False swap_space: float = 4 # GiB cpu_offload_gb: float = 0 # GiB gpu_memory_utilization: float = 0.90 max_num_batched_tokens: Optional[int] = None max_num_seqs: int = 256 - max_num_prefill_seqs: Optional[int] = None max_logprobs: int = 20 # Default value for OpenAI Chat Completions API disable_log_stats: bool = False revision: Optional[str] = None @@ -385,13 +383,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: action='store_true', help='Use BlockSpaceMangerV2. By default this is set to True. ' 'Set to False to use BlockSpaceManagerV1') - parser.add_argument( - '--use-padding-aware-scheduling', - default=EngineArgs.use_padding_aware_scheduling, - action='store_true', - help=('Use padding-aware scheduling. If True, the scheduler ' - 'will consider padded tokens in prefill. ' - 'By default this is set to False. ')) parser.add_argument( '--num-lookahead-slots', type=int, @@ -446,13 +437,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: type=int, default=EngineArgs.max_num_seqs, help='Maximum number of sequences per iteration.') - parser.add_argument( - '--max-num-prefill-seqs', - type=int, - default=EngineArgs.max_num_prefill_seqs, - help=('Maximum number of prefill sequences per ' - 'iteration. Can be used only with padding-aware ' - 'scheduling. 
Must be <= max_num_seqs.')) parser.add_argument( '--max-logprobs', type=int, @@ -1042,7 +1026,6 @@ def create_engine_config(self) -> EngineConfig: scheduler_config = SchedulerConfig( max_num_batched_tokens=self.max_num_batched_tokens, max_num_seqs=self.max_num_seqs, - max_num_prefill_seqs=self.max_num_prefill_seqs, max_model_len=model_config.max_model_len, use_v2_block_manager=self.use_v2_block_manager, num_lookahead_slots=num_lookahead_slots, @@ -1055,8 +1038,7 @@ def create_engine_config(self) -> EngineConfig: multi_step_stream_outputs=self.multi_step_stream_outputs, send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER and parallel_config.use_ray), - policy=self.scheduling_policy, - use_padding_aware_scheduling=self.use_padding_aware_scheduling) + policy=self.scheduling_policy) lora_config = LoRAConfig( max_lora_rank=self.max_lora_rank, max_loras=self.max_loras, diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 2ed44e2093bbe..ca790d966c6a0 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -194,6 +194,61 @@ def forward_xpu( self.cos_sin_cache, self.is_neox_style) return query, key + def forward_hpu( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + from habana_frameworks.torch.hpex.kernels import ( + RotaryPosEmbeddingMode, apply_rotary_pos_emb) + positions = positions.flatten() + if offsets is not None: + positions = positions + offsets + num_tokens = positions.shape[0] + cos_sin = self.cos_sin_cache.index_select(0, positions).view( + num_tokens, 1, -1) + cos, sin = cos_sin.chunk(2, dim=-1) + # HPU RoPE kernel requires hidden dimension for cos and sin to be equal + # to query hidden dimension, so the original tensors need to be + # expanded + # GPT-NeoX kernel requires position_ids = None, offset, mode = BLOCKWISE + # and expansion of cos/sin tensors via concatenation + # GPT-J kernel requires position_ids = None, offset = 0, mode = PAIRWISE + # and expansion of cos/sin tensors via repeat_interleave + rope_mode: RotaryPosEmbeddingMode + if self.is_neox_style: + rope_mode = RotaryPosEmbeddingMode.BLOCKWISE + cos = torch.cat((cos, cos), dim=-1) + sin = torch.cat((sin, sin), dim=-1) + else: + rope_mode = RotaryPosEmbeddingMode.PAIRWISE + sin = torch.repeat_interleave(sin, + 2, + dim=-1, + output_size=cos_sin.shape[-1]) + cos = torch.repeat_interleave(cos, + 2, + dim=-1, + output_size=cos_sin.shape[-1]) + + query_shape = query.shape + query = query.view(num_tokens, -1, self.head_size) + query_rot = query[..., :self.rotary_dim] + query_pass = query[..., self.rotary_dim:] + query_rot = apply_rotary_pos_emb(query_rot, cos, sin, None, 0, + rope_mode) + query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + + key_shape = key.shape + key = key.view(num_tokens, -1, self.head_size) + key_rot = key[..., :self.rotary_dim] + key_pass = key[..., self.rotary_dim:] + key_rot = apply_rotary_pos_emb(key_rot, cos, sin, None, 0, rope_mode) + key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + return query, key + def extra_repr(self) -> str: s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}" s += f", max_position_embeddings={self.max_position_embeddings}" From 14f8af43b979e767359e61575153ec013bf4bbb8 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 17 Oct 2024 17:03:35 +0300 Subject: [PATCH 339/341] padding-aware scheduler 
cleanup --- vllm/worker/hpu_model_runner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 0255f792db356..ed71ba8f853a7 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -549,9 +549,9 @@ def __init__( self.device = self.device_config.device self.enforce_eager = self.model_config.enforce_eager self.max_num_seqs = self.scheduler_config.max_num_seqs - self.max_num_prefill_seqs = self.scheduler_config.max_num_prefill_seqs \ - if self.scheduler_config.max_num_prefill_seqs is not None \ - else self.max_num_seqs + # NOTE(kzawora): Change that to scheduler_config.max_num_prefill_seqs + # once padding-aware scheduling gets merged + self.max_num_prefill_seqs = 64 self.max_model_len = self.scheduler_config.max_model_len self.max_num_batched_tokens = \ self.scheduler_config.max_num_batched_tokens From 65e34f63e5c8d49e470be5241f83d761e1c4530b Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 17 Oct 2024 17:23:57 +0300 Subject: [PATCH 340/341] fix sentinel usage in model runner base --- vllm/worker/model_runner_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index 434250ce65348..84c1c8ed42a90 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -53,7 +53,7 @@ def _init_attn_metadata_from_tensor_dict( # broadcasted properly. sentinel = object() val = tensor_dict.pop(field.name, sentinel) - if val == sentinel: + if val != sentinel: valid_attn_kwargs[field.name] = val attn_metadata = attn_backend.make_metadata(**valid_attn_kwargs) tensor_dict["attn_metadata"] = attn_metadata From 4757350a14a88822e00d37e1769f9da504fccc59 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 17 Oct 2024 17:27:22 +0300 Subject: [PATCH 341/341] doc fixes --- .../getting_started/gaudi-installation.rst | 23 ++++++++----------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index cb8c36e783585..68c1a56660fa4 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -1,5 +1,5 @@ -vLLM with Intel® Gaudi® AI Accelerators -========================================= +Installation with Intel® Gaudi® AI Accelerators +=============================================== This README provides instructions on running vLLM with Intel Gaudi devices. @@ -22,7 +22,7 @@ Requirements Quick start using Dockerfile -============================ +---------------------------- .. code:: console $ docker build -f Dockerfile.hpu -t vllm-hpu-env . @@ -30,14 +30,14 @@ Quick start using Dockerfile .. tip:: - If you're observing the following error: ``docker: Error response from daemon: Unknown runtime specified habana.``, please refer to "Install Using Containers" section of `Intel Gaudi Software Stack and Driver Installation `__. Make sure you have ``habana-container-runtime`` package installed and that ``habana`` container runtime is registered. 
Build from source -================= +----------------- Environment verification ------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~ To verify that the Intel Gaudi software was correctly installed, run: @@ -53,7 +53,7 @@ Verification `__ for accelerating low-batch latency and throughput - Attention with Linear Biases (ALiBi) -- INC quantization Unsupported Features ==================== - Beam search - LoRA adapters -- AWQ quantization +- Quantization - Prefill chunking (mixed-batch inferencing) Supported Configurations @@ -151,10 +150,6 @@ Gaudi2 devices. Configurations that are not listed may or may not work. with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling - `meta-llama/Meta-Llama-3.1-70B-Instruct `__ with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -- `mistralai/Mistral-7B-Instruct-v0.3 `__ - on single HPU or with tensor parallelism on 2x HPU, BF16 datatype with random or greedy sampling -- `mistralai/Mixtral-8x7B-Instruct-v0.1 `__ - with tensor parallelism on 2x HPU, BF16 datatype with random or greedy sampling Performance Tuning ==================