diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index d263839705690..646d6ce6b9c9e 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -159,6 +159,10 @@ def _cached_get_attn_backend( logger.info("Using HPUAttention backend.") from vllm.attention.backends.hpu_attn import HPUAttentionBackend return HPUAttentionBackend + elif backend == _Backend.HPU_ATTN_V1: + logger.info("Using HPUAttentionV1 backend.") + from vllm.v1.attention.backends.hpu_attn import HPUAttentionBackendV1 + return HPUAttentionBackendV1 elif backend == _Backend.PALLAS: logger.info("Using Pallas backend.") from vllm.attention.backends.pallas import PallasAttentionBackend diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index f002a8ff905b1..60201d3398cf6 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -11,6 +11,7 @@ import vllm.envs as envs from vllm.config import CompilationConfig from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.utils import weak_ref_tensors from .counter import compilation_counter @@ -230,7 +231,7 @@ def __init__( compilation_configs: CompilationConfig, ): global global_graph_pool - if global_graph_pool is None: + if global_graph_pool is None and current_platform.is_cuda_alike(): global_graph_pool = torch.cuda.graph_pool_handle() # TODO: in the future, if we want to use multiple diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 34815d7f0aa78..7f1586e36fb97 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -959,6 +959,7 @@ def init_distributed_environment( "world_size=%d rank=%d local_rank=%d " "distributed_init_method=%s backend=%s", world_size, rank, local_rank, distributed_init_method, backend) + print(distributed_init_method) if not torch.distributed.is_initialized(): assert distributed_init_method is not None, ( "distributed_init_method must be provided when initializing " diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index ee83187fff797..c86b23bd76279 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -2,6 +2,8 @@ import torch +from vllm import envs + from .interface import Platform, PlatformEnum, _Backend if TYPE_CHECKING: @@ -36,12 +38,21 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: parallel_config = vllm_config.parallel_config if parallel_config.worker_cls == "auto": if scheduler_config.is_multi_step: + if envs.VLLM_USE_V1: + raise NotImplementedError parallel_config.worker_cls = \ "vllm.worker.multi_step_hpu_worker.MultiStepHPUWorker" elif vllm_config.speculative_config: + if envs.VLLM_USE_V1: + raise NotImplementedError parallel_config.worker_cls = \ "vllm.spec_decode.spec_decode_worker.create_spec_worker" parallel_config.sd_worker_cls = \ "vllm.worker.hpu_worker.HPUWorker" else: - parallel_config.worker_cls = "vllm.worker.hpu_worker.HPUWorker" + if envs.VLLM_USE_V1: + parallel_config.worker_cls = \ + "vllm.v1.worker.hpu_worker.HPUWorker" + else: + parallel_config.worker_cls = \ + "vllm.worker.hpu_worker.HPUWorker" diff --git a/vllm/v1/attention/backends/hpu_attn.py b/vllm/v1/attention/backends/hpu_attn.py new file mode 100644 index 0000000000000..8b154f0964e2f --- /dev/null +++ b/vllm/v1/attention/backends/hpu_attn.py @@ -0,0 +1,362 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +############################################################################### + +import os +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Type + +import torch +import vllm_hpu_extension.ops as ops +from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache + +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata, AttentionType) +from vllm.attention.backends.utils import CommonAttentionState +from vllm.attention.ops.hpu_paged_attn import (HPUPagedAttention, + HPUPagedAttentionMetadata) +from vllm.logger import init_logger +from vllm.utils import is_fake_hpu + +logger = init_logger(__name__) + + +class HPUAttentionBackendV1(AttentionBackend): + + @staticmethod + def get_name() -> str: + return "hpu-attn" + + @staticmethod + def get_impl_cls() -> Type["HPUAttentionImpl"]: + return HPUAttentionImpl + + @staticmethod + def get_metadata_cls() -> Type["AttentionMetadata"]: + return HPUAttentionMetadata + + @staticmethod + def get_state_cls() -> Type["CommonAttentionState"]: + return CommonAttentionState + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return HPUPagedAttention.get_kv_cache_shape(num_blocks, block_size, + num_kv_heads, head_size) + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dsts: torch.Tensor, + ) -> None: + HPUPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dsts) + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dsts: torch.Tensor, + ) -> None: + HPUPagedAttention.copy_blocks(kv_caches, src_to_dsts) + + +@dataclass +class HPUAttentionMetadata(HPUPagedAttentionMetadata, AttentionMetadata): + """Metadata for HPUAttentionbackend.""" + # Currently, input sequences can only contain all prompts + # or all decoding. True if all sequences are prompts. + is_prompt: bool + attn_bias: Optional[torch.Tensor] + + # NOTE(sang): Definition of context_len, query_len, and seq_len. + # |---------- N-1 iteration --------| + # |---------------- N iteration ---------------------| + # |- tokenA -|......................|-- newTokens ---| + # |---------- context_len ----------| + # |-------------------- seq_len ---------------------| + # |-- query_len ---| + seq_lens_tensor: Optional[torch.Tensor] + context_lens_tensor: Optional[torch.Tensor] + + @classmethod + def make_prefill_metadata(cls, seq_lens_tensor, num_prefills, + num_prefill_tokens, slot_mapping): + return cls(is_prompt=True, + block_list=None, + block_mapping=None, + block_usage=None, + block_indices=None, + block_offsets=None, + block_scales=None, + block_groups=None, + attn_bias=None, + num_decode_tokens=0, + context_lens_tensor=None, + multi_modal_placeholder_index_maps=None, + seq_lens_tensor=seq_lens_tensor, + num_prefills=num_prefills, + num_prefill_tokens=num_prefill_tokens, + slot_mapping=slot_mapping) + + @classmethod + def make_cached_prefill_metadata(cls, seq_lens_tensor, context_lens_tensor, + num_prefills, num_prefill_tokens, + slot_mapping, block_list): + return cls(is_prompt=True, + block_list=block_list, + block_mapping=None, + block_usage=None, + block_indices=None, + block_offsets=None, + block_scales=None, + block_groups=None, + attn_bias=None, + num_decode_tokens=0, + context_lens_tensor=context_lens_tensor, + multi_modal_placeholder_index_maps=None, + seq_lens_tensor=seq_lens_tensor, + num_prefills=num_prefills, + num_prefill_tokens=num_prefill_tokens, + slot_mapping=slot_mapping) + + @classmethod + def make_decode_metadata(cls, block_list, block_usage, block_groups, + num_decode_tokens, slot_mapping): + return cls(is_prompt=False, + block_mapping=None, + block_indices=None, + block_offsets=None, + block_scales=None, + attn_bias=None, + seq_lens_tensor=None, + context_lens_tensor=None, + num_prefills=0, + num_prefill_tokens=0, + multi_modal_placeholder_index_maps=None, + block_list=block_list, + block_usage=block_usage, + block_groups=block_groups, + num_decode_tokens=num_decode_tokens, + slot_mapping=slot_mapping) + + +class HPUAttentionImpl(AttentionImpl, torch.nn.Module): + """ + If the input tensors contain prompt tokens, the layout is as follows: + |<--------------- num_prefill_tokens ----------------->| + |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->| + + Otherwise, the layout is as follows: + |<----------------- num_decode_tokens ------------------>| + |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->| + + Generation tokens can contain padding when cuda-graph is used. + Currently, prompt tokens don't contain any padding. + + The prompts might have different lengths, while the generation tokens + always have length 1. + """ + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[List[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + blocksparse_params: Optional[Dict[str, Any]] = None, + max_seq_len: int = 4096, + ) -> None: + super(AttentionImpl, self).__init__() + self.kv_cache_dtype = kv_cache_dtype + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.matmul_qk = Matmul() + self.softmax = Softmax() + self.matmul_av = Matmul() + self.batch2block_matmul = Matmul() + self.block2batch_matmul = Matmul() + self.k_cache = VLLMKVCache() + self.v_cache = VLLMKVCache() + self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + self.sliding_window = sliding_window + self.alibi_slopes = alibi_slopes + if alibi_slopes is not None: + alibi_slopes_tensor = torch.tensor(alibi_slopes, + dtype=torch.bfloat16) + self.alibi_slopes = alibi_slopes_tensor + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + + self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', + '1').lower() in ['1', 'true'] \ + and not is_fake_hpu() + if self.prefill_use_fusedsdpa: + assert alibi_slopes is None, \ + 'Prefill with FusedSDPA not supported with alibi slopes!' + + suppored_head_sizes = HPUPagedAttention.get_supported_head_sizes() + if head_size not in suppored_head_sizes: + raise ValueError( + f"Head size {head_size} is not supported by PagedAttention. " + f"Supported head sizes are: {suppored_head_sizes}.") + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: HPUAttentionMetadata, + k_scale: float = 1.0, + v_scale: float = 1.0, + attn_type: AttentionType = AttentionType.DECODER, + ) -> torch.Tensor: + """Forward pass with xFormers and PagedAttention. + + Args: + query: shape = [num_tokens, num_heads * head_size] + key: shape = [num_tokens, num_kv_heads * head_size] + value: shape = [num_tokens, num_kv_heads * head_size] + kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + attn_metadata: Metadata for attention. + Returns: + shape = [num_tokens, num_heads * head_size] + """ + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "HPUAttentionImpl") + batch_size, seq_len, hidden_size = query.shape + _, seq_len_kv, _ = key.shape + + query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + block_indices = attn_metadata.block_indices + block_offsets = attn_metadata.block_offsets + if attn_metadata.is_prompt: + key = key.unflatten(0, (block_indices.size(0), -1)) + value = value.unflatten(0, (block_indices.size(0), -1)) + if kv_cache is not None: + key_cache, value_cache = HPUPagedAttention.split_kv_cache( + kv_cache, self.num_kv_heads, self.head_size) + + # Reshape the input keys and values and store them in the cache. + # If kv_cache is not provided, the new key and value tensors are + # not cached. This happens during the initial memory profiling run. + key_cache = self.k_cache(key, key_cache, block_indices, + block_offsets) + value_cache = self.v_cache(value, value_cache, block_indices, + block_offsets) + + if attn_metadata.is_prompt: + # Prompt run. + query_shape = (batch_size, seq_len, self.num_heads, self.head_size) + kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, + self.head_size) + if attn_metadata is None or attn_metadata.block_list is None: + if not self.prefill_use_fusedsdpa: + # TODO: move this outside of model + assert attn_metadata.attn_bias is not None, \ + 'attn_bias must be set before calling model.forward' + attn_bias = attn_metadata.attn_bias + if self.alibi_slopes is not None: + position_bias = _make_alibi_bias( + self.alibi_slopes, self.num_kv_heads, + attn_bias.dtype, attn_bias.shape[-1]) + attn_bias = attn_bias.tile( + (1, self.num_kv_heads, 1, 1)) + attn_bias.add_(position_bias) + else: + attn_bias = None + + out = ops.prompt_attention( + query.view(query_shape), + key.view(kv_shape), + value.view(kv_shape), + attn_bias=attn_bias, + p=0.0, + scale=self.scale, + matmul_qk_op=self.matmul_qk, + softmax_op=self.softmax, + matmul_av_op=self.matmul_av, + ) + else: + # TODO: enable FusedSDPA + out = HPUPagedAttention.forward_prefix( + query=query.view(query_shape), + key=key.view(kv_shape), + value=value.view(kv_shape), + key_cache=key_cache, + value_cache=value_cache, + block_list=attn_metadata.block_list, + attn_bias=attn_metadata.attn_bias, + scale=self.scale, + matmul_qk_op=self.matmul_qk, + matmul_av_op=self.matmul_av, + softmax_op=self.softmax, + keys_fetch_func=self.k_cache.fetch_from_cache, + values_fetch_func=self.v_cache.fetch_from_cache) + output = out.reshape(batch_size, seq_len, hidden_size) + else: + # Decoding run. + output = HPUPagedAttention.forward_decode( + query=query, + key_cache=key_cache, + value_cache=value_cache, + block_list=attn_metadata.block_list, + block_mapping=attn_metadata.block_mapping, + block_bias=attn_metadata.attn_bias, + block_scales=attn_metadata.block_scales, + block_groups=attn_metadata.block_groups, + scale=self.scale, + matmul_qk_op=self.matmul_qk, + matmul_av_op=self.matmul_av, + batch2block_matmul_op=self.batch2block_matmul, + block2batch_matmul_op=self.block2batch_matmul, + keys_fetch_func=self.k_cache.fetch_from_cache, + values_fetch_func=self.v_cache.fetch_from_cache) + # Reshape the output tensor. + return output.view(batch_size, seq_len, hidden_size) + + +def _make_alibi_bias( + alibi_slopes: torch.Tensor, + num_kv_heads: int, + dtype: torch.dtype, + seq_len: int, +) -> torch.Tensor: + bias = torch.arange(seq_len, dtype=dtype) + # NOTE(zhuohan): HF uses + # `bias = bias[None, :].repeat(seq_len, 1)` + # here. We find that both biases give the same results, but + # the bias below more accurately follows the original ALiBi + # paper. + # Calculate a matrix where each element represents ith element- jth + # element. + bias = bias[None, :] - bias[:, None] + + padded_len = (seq_len + 7) // 8 * 8 + num_heads = alibi_slopes.shape[0] + bias = torch.empty( + 1, # batch size + num_heads, + seq_len, + padded_len, + device=alibi_slopes.device, + dtype=dtype, + )[:, :, :, :seq_len].copy_(bias) + bias.mul_(alibi_slopes[:, None, None]) + if num_heads != num_kv_heads: + bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads)) + return bias diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index b492a755e6dd5..e40ec4ab90197 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -1,10 +1,13 @@ +import os from collections import defaultdict from typing import Dict, List, Optional from vllm.logger import init_logger from vllm.utils import cdiv -from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue, - KVCacheBlock, hash_block_tokens, +from vllm.v1.core.kv_cache_utils import (BlockHashType, + FreeKVCacheBlockHeapQueue, + FreeKVCacheBlockQueue, KVCacheBlock, + hash_block_tokens, hash_request_tokens) from vllm.v1.request import Request @@ -27,7 +30,8 @@ def __init__( self.max_model_len = max_model_len self.max_num_blocks_per_req = cdiv(max_model_len, block_size) self.sliding_window = sliding_window - self.enable_caching = enable_caching + self.enable_caching = os.environ.get('VLLM_ENABLE_PREFIX_CACHING', + 'true') in ['true', '1'] # NOTE(woosuk): To avoid frequent block allocation, we preallocate some # blocks for each request. For example, when a request reaches the end # of its block table, we preallocate N blocks in advance. This way, we @@ -43,12 +47,14 @@ def __init__( # A Block pool of all kv-cache blocks. self.block_pool: List[KVCacheBlock] = [ - KVCacheBlock(idx) for idx in range(num_gpu_blocks) + KVCacheBlock(idx) for idx in range(1, num_gpu_blocks) ] # Free block queue that constructs and manipulates a doubly linked # list of free blocks (including eviction candidates when caching is # enabled). - self.free_block_queue = FreeKVCacheBlockQueue(self.block_pool) + block_queue_impl = FreeKVCacheBlockHeapQueue if os.environ.get( + 'VLLM_USE_HEAPQ') in ['1', 'true'] else FreeKVCacheBlockQueue + self.free_block_queue = block_queue_impl(self.block_pool) # {block_hash: {block ID: block}}. A cached block is # a full block with a block hash that can be used for prefix caching. diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index fb666c364bfb2..b8b771a4234d5 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -1,6 +1,7 @@ """KV-Cache Utilities.""" +import heapq from dataclasses import dataclass -from typing import List, Optional, Tuple +from typing import Dict, List, Optional, Tuple from vllm.logger import init_logger @@ -46,6 +47,63 @@ def reset_hash(self): self._block_hash = None +class FreeKVCacheBlockHeapQueue: + """TODO(kzawora): document this + + Args: + blocks: A list of KVCacheBlock objects. + """ + + def __init__(self, blocks: List[KVCacheBlock]) -> None: + self.num_free_blocks = len(blocks) + self._free_block_indices: List[KVCacheBlock] = blocks[:] + self.tombstone: Dict[KVCacheBlock, int] = {} + heapq.heapify(self._free_block_indices) + assert len(self._free_block_indices) == self.num_free_blocks + + def popleft(self) -> KVCacheBlock: + """Pop the first free block and reduce num_free_blocks by 1. + + Returns: + The first free block. + """ + block: KVCacheBlock = heapq.heappop(self._free_block_indices) + #logger.info(f'[HEAPQ] Popped block {block.block_id}') + return block + + def remove(self, block: KVCacheBlock) -> None: + """Remove a block in the free list and reduce num_free_blocks by 1. + + Args: + block: The block to remove. + """ + self.tombstone[block] = self.tombstone.get(block, 0) + 1 + while len(self._free_block_indices) > 0 and self._free_block_indices[ + 0] == block and self.tombstone[block] > 0: + heapq.heappop(self._free_block_indices) + self.tombstone[block] -= 1 + + self.num_free_blocks -= 1 + + def append(self, block: KVCacheBlock) -> None: + """Put a block back into the free list and increase + num_free_blocks by 1. + + Args: + block: The block to append. + """ + heapq.heappush(self._free_block_indices, block) + self.num_free_blocks += 1 + + def get_all_free_blocks(self) -> List[KVCacheBlock]: + """Get all free blocks in the free list. Mainly used for testing. + + Returns: + A list of free blocks. + """ + return list(item for item in self._free_block_indices) + + class FreeKVCacheBlockQueue: """This class organizes a list of KVCacheBlock objects to a doubly linked list of free blocks. We implement this class instead of using Python @@ -91,6 +149,7 @@ def popleft(self) -> KVCacheBlock: block = self.free_list_head self.remove(block) + #logger.info(f'[LL] Popped block {block.block_id}') return block def remove(self, block: KVCacheBlock) -> None: diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index a3e85c20cc664..961cdbcbd3c56 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -29,9 +29,14 @@ def __init__( cache_config: CacheConfig, lora_config: Optional[LoRAConfig], ) -> None: + # TODO: properly handle for HPU. + # cache_config.enable_prefix_caching = False + # scheduler_config.chunked_prefill_enabled = False + self.scheduler_config = scheduler_config self.cache_config = cache_config self.lora_config = lora_config + self.disable_prefill_chunking = True # TODO: Support LoRA. assert lora_config is None, "V1 does not support LoRA yet." @@ -203,6 +208,12 @@ def schedule(self) -> "SchedulerOutput": num_computed_tokens -= 1 num_new_tokens = 1 computed_blocks.pop() + + # If chunked prefill is not enabled, breakout of the loop. + if (not self.scheduler_config.chunked_prefill_enabled + and num_new_tokens > token_budget): + break + num_new_tokens = min(num_new_tokens, token_budget) assert num_new_tokens > 0 @@ -524,16 +535,14 @@ def from_request( block_ids: List[int], num_computed_tokens: int, ) -> "NewRequestData": - return cls( - req_id=request.request_id, - prompt_token_ids=request.prompt_token_ids, - prompt=request.prompt, - mm_inputs=request.mm_inputs, - mm_positions=request.mm_positions, - sampling_params=request.sampling_params, - block_ids=block_ids, - num_computed_tokens=num_computed_tokens, - ) + return cls(req_id=request.request_id, + prompt_token_ids=request.prompt_token_ids, + prompt=request.prompt, + mm_inputs=request.mm_inputs, + mm_positions=request.mm_positions, + sampling_params=request.sampling_params, + block_ids=block_ids, + num_computed_tokens=num_computed_tokens) @dataclass @@ -550,11 +559,9 @@ def from_request( block_ids: List[int], num_computed_tokens: int, ) -> "ResumedRequestData": - return cls( - req_id=request.request_id, - block_ids=block_ids, - num_computed_tokens=num_computed_tokens, - ) + return cls(req_id=request.request_id, + block_ids=block_ids, + num_computed_tokens=num_computed_tokens) @dataclass @@ -571,11 +578,9 @@ def from_request( new_block_ids: List[int], num_computed_tokens: int, ) -> "RunningRequestData": - return cls( - req_id=request.request_id, - new_block_ids=new_block_ids, - num_computed_tokens=num_computed_tokens, - ) + return cls(req_id=request.request_id, + new_block_ids=new_block_ids, + num_computed_tokens=num_computed_tokens) @dataclass diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 26fd650aee4b7..4266444192024 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -10,6 +10,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.outputs import PoolingRequestOutput, RequestOutput +from vllm.platforms import current_platform from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams @@ -129,13 +130,27 @@ def shutdown(self): def _get_executor_cls(cls, vllm_config: VllmConfig): distributed_executor_backend = ( vllm_config.parallel_config.distributed_executor_backend) - if distributed_executor_backend == "mp": - from vllm.v1.executor.multiproc_executor import MultiprocExecutor - executor_class = MultiprocExecutor - else: - assert (distributed_executor_backend is None) - from vllm.v1.executor.uniproc_executor import UniprocExecutor - executor_class = UniprocExecutor + executor_class: Type[Executor] + if current_platform.is_cuda_alike(): + if distributed_executor_backend == "mp": + from vllm.v1.executor.multiproc_executor import ( + MultiprocExecutor) + executor_class = MultiprocExecutor + else: + assert (distributed_executor_backend is None) + from vllm.v1.executor.uniproc_executor import UniprocExecutor + executor_class = UniprocExecutor + elif current_platform.is_hpu(): + if distributed_executor_backend == "mp": + from vllm.v1.executor.multiproc_hpu_executor import ( + MultiprocHPUExecutor) + executor_class = MultiprocHPUExecutor + else: + assert (distributed_executor_backend is None) + from vllm.v1.executor.uniproc_hpu_executor import ( + UniprocHPUExecutor) + executor_class = UniprocHPUExecutor + return executor_class async def add_request( diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 1b3a9f12d009e..16a66848e9bb3 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -11,6 +11,7 @@ from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.outputs import RequestOutput +from vllm.platforms import current_platform from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams @@ -106,13 +107,26 @@ def from_engine_args( def _get_executor_cls(cls, vllm_config: VllmConfig): distributed_executor_backend = ( vllm_config.parallel_config.distributed_executor_backend) - if distributed_executor_backend == "mp": - from vllm.v1.executor.multiproc_executor import MultiprocExecutor - executor_class = MultiprocExecutor - else: - assert (distributed_executor_backend is None) - from vllm.v1.executor.uniproc_executor import UniprocExecutor - executor_class = UniprocExecutor + executor_class: Type[Executor] + if current_platform.is_cuda_alike(): + if distributed_executor_backend == "mp": + from vllm.v1.executor.multiproc_executor import ( + MultiprocExecutor) + executor_class = MultiprocExecutor + else: + assert (distributed_executor_backend is None) + from vllm.v1.executor.uniproc_executor import UniprocExecutor + executor_class = UniprocExecutor + elif current_platform.is_hpu(): + if distributed_executor_backend == "mp": + from vllm.v1.executor.multiproc_hpu_executor import ( + MultiprocHPUExecutor) + executor_class = MultiprocHPUExecutor + else: + assert (distributed_executor_backend is None) + from vllm.v1.executor.uniproc_hpu_executor import ( + UniprocHPUExecutor) + executor_class = UniprocHPUExecutor return executor_class diff --git a/vllm/v1/executor/multiproc_hpu_executor.py b/vllm/v1/executor/multiproc_hpu_executor.py new file mode 100644 index 0000000000000..d8b40be7c9442 --- /dev/null +++ b/vllm/v1/executor/multiproc_hpu_executor.py @@ -0,0 +1,11 @@ +from vllm.v1.executor.multiproc_executor import MultiprocExecutor + + +class MultiprocHPUExecutor(MultiprocExecutor): + + def initialize(self, num_hpu_blocks: int) -> None: + """ + Initialize the KV caches and begin the model execution loop of the + underlying workers. + """ + self.collective_rpc("initialize_cache", args=(num_hpu_blocks, )) \ No newline at end of file diff --git a/vllm/v1/executor/uniproc_hpu_executor.py b/vllm/v1/executor/uniproc_hpu_executor.py new file mode 100644 index 0000000000000..aeea725b1a8f4 --- /dev/null +++ b/vllm/v1/executor/uniproc_hpu_executor.py @@ -0,0 +1,78 @@ +from typing import Optional, Tuple + +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.utils import get_distributed_init_method, get_ip, get_open_port +from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.worker.hpu_worker import HPUWorker + +logger = init_logger(__name__) + + +class UniprocHPUExecutor: + + def __init__(self, vllm_config: VllmConfig) -> None: + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.lora_config = vllm_config.lora_config + self.load_config = vllm_config.load_config + self.parallel_config = vllm_config.parallel_config + self.scheduler_config = vllm_config.scheduler_config + self.device_config = vllm_config.device_config + self.speculative_config = vllm_config.speculative_config + self.prompt_adapter_config = vllm_config.prompt_adapter_config + self.observability_config = vllm_config.observability_config + + self.worker = self._create_worker() + self.worker.initialize() + self.worker.load_model() + + def _create_worker( + self, + local_rank: int = 0, + rank: int = 0, + distributed_init_method: Optional[str] = None) -> HPUWorker: + """Return worker init args for a given rank.""" + + if distributed_init_method is None: + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + + return HPUWorker( + vllm_config=self.vllm_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, + ) + + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Determine the number of available KV blocks by invoking the + underlying worker. + """ + return self.worker.determine_num_available_blocks() + + def initialize_cache(self, num_hpu_blocks: int) -> None: + """Initialize the KV cache by invoking the underlying worker. + """ + # NOTE: This is logged in the executor because there can be >1 worker + # with other executors. We could log in the engine level, but work + # remains to abstract away the device for non-GPU configurations. + logger.info("# HPU blocks: %d", num_hpu_blocks) + from vllm_hpu_extension.profiler import HabanaMemoryProfiler + with HabanaMemoryProfiler() as cache_init_m: + self.worker.initialize_cache(num_hpu_blocks) + msg = f"init_cache_engine took {cache_init_m.get_summary_string()}" + logger.info(msg) + + def execute_model( + self, + scheduler_output, + ) -> ModelRunnerOutput: + output = self.worker.execute_model(scheduler_output) + return output + + def check_health(self) -> None: + # HPUExecutor will always be healthy as long as + # it's running. + return \ No newline at end of file diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index d1a755be01ff7..f0ace51483bf0 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -20,10 +20,11 @@ def forward( logits = self.apply_temperature(logits, sampling_metadata.temperature) logits = self.apply_top_k_top_p(logits, sampling_metadata) - probs = self.get_probs(logits) + probs = self.get_probs( + logits) if not sampling_metadata.all_greedy else logits sampled = self.sample(probs, sampling_metadata) # Use int32 to reduce the tensor size. - sampled = sampled.to(torch.int32) + sampled = sampled # .to(torch.int32) NOTE(kzawora): WHY DO WE HAVE AN UNDEFINED BEHAVIOR HERE?! IN WHICH WORLD DOES 75696 INT64 CAST TO -828218624 INT32?!? HOW CAN ARGMAX EVEN RETURN -828218624?! >_< # NOQA if sampling_metadata.max_num_logprobs > 0: logprobs = self.get_logprobs(logits) diff --git a/vllm/v1/worker/hpu_model_runner.py b/vllm/v1/worker/hpu_model_runner.py new file mode 100755 index 0000000000000..c7e610ad7c8c1 --- /dev/null +++ b/vllm/v1/worker/hpu_model_runner.py @@ -0,0 +1,2031 @@ +import collections +import contextlib +import functools +import itertools +import math +import os +import time +from dataclasses import dataclass +from enum import Enum +from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, + Tuple, Union) + +import habana_frameworks.torch as htorch +import habana_frameworks.torch.internal.bridge_config as bc +import numpy as np +import torch +import torch.distributed +from vllm_hpu_extension.bucketing import HPUBucketingContext +from vllm_hpu_extension.ops import batch2block, block2batch +from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes + +from vllm.config import VllmConfig +from vllm.forward_context import set_forward_context +from vllm.logger import init_logger +from vllm.model_executor.model_loader import get_model +from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange +from vllm.sampling_params import SamplingParams, SamplingType +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cdiv, is_fake_hpu, + is_pin_memory_available) +from vllm.v1.attention.backends.hpu_attn import (HPUAttentionBackendV1, + HPUAttentionMetadata) +from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.sample.metadata import SamplingMetadata + +if TYPE_CHECKING: + from vllm.v1.core.scheduler import SchedulerOutput + +from vllm.v1.engine.detokenizer import Detokenizer + +logger = init_logger(__name__) + +_TYPE_CACHE = {} +# These values are assumed to be zero in several places. +# Use caution when updating them! +_PAD_SLOT_ID = 0 +_PAD_BLOCK_ID = 0 + + +class PhaseType(Enum): + UNCACHED_PREFILL = 'uncached_prefill' + CACHED_PREFILL = 'cached_prefill' + DECODE = 'decode' + + +@dataclass +class PrefillInputData: + + request_ids: List + prompt_lens: List + token_ids: List + position_ids: List + attn_metadata: List + logits_indices: List + + def zipped(self): + return zip(self.request_ids, self.prompt_lens, self.token_ids, + self.position_ids, self.attn_metadata, self.logits_indices) + + +@dataclass +class DecodeInputData: + + num_decodes: int + token_ids: Optional[torch.Tensor] = None + position_ids: Optional[torch.Tensor] = None + attn_metadata: HPUAttentionMetadata = None + logits_indices: Optional[torch.Tensor] = None + + +def flatten(in_list): + return list(itertools.chain(*in_list)) + + +def gather_list(input, indices, v): + return [input[i] if i is not None else v for i in indices] + + +def _async_h2d_tensor_copy(source, device='hpu'): + assert source.device.type == 'cpu', \ + "Source tensor is not present in host memory!" + target = torch.empty(source.shape, dtype=source.dtype, device=device) + target.copy_(source, non_blocking=True) + return target + + +class HpuModelAdapter: + + def __init__(self, model, block_size, dtype, enforce_eager): + self.model = model + self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', + '1').lower() in ['1', 'true'] \ + and not is_fake_hpu() + self.block_size = block_size + self.dtype = dtype + if not is_fake_hpu() and not htorch.utils.internal.is_lazy( + ) and not enforce_eager: + self.model = torch.compile(self.model, + backend='hpu_backend', + dynamic=False) + + def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, + dtype): + prefix_caching_enabled = attn_metadata.is_prompt and \ + attn_metadata.block_list is not None + if (attn_metadata is None + or (self.prefill_use_fusedsdpa and not prefix_caching_enabled) + or not attn_metadata.is_prompt): + return attn_metadata + + prefill_metadata = attn_metadata + + context_lens_t = prefill_metadata.context_lens_tensor + query_lens_t = prefill_metadata.seq_lens_tensor + + block_list = attn_metadata.block_list + max_context_len = (block_list.size(-1) // + batch_size if block_list is not None else 0) + max_context_len = max_context_len * self.block_size + past_mask = torch.arange(0, + max_context_len, + dtype=torch.int32, + device=device) + past_mask = (past_mask.view(1, -1).expand(batch_size, -1).ge( + context_lens_t.view(-1, 1)).view(batch_size, 1, -1).expand( + batch_size, seq_len, -1).view(batch_size, 1, seq_len, -1)) + + len_mask = (torch.arange(0, seq_len, device=device, + dtype=torch.int32).view(1, seq_len).ge( + query_lens_t.unsqueeze(-1)).view( + batch_size, 1, 1, seq_len)) + causal_mask = torch.triu(torch.ones((batch_size, 1, seq_len, seq_len), + device=device, + dtype=torch.bool), + diagonal=1) + mask = causal_mask.logical_or(len_mask) + mask = torch.concat((past_mask, mask), dim=-1) + attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_( + mask, -math.inf)) + attn_metadata = prefill_metadata._replace(attn_bias=attn_bias) + return attn_metadata + + def _set_block_mapping(self, metadata, batch_size, device, dtype): + mask = torch.arange(0, + self.block_size, + device=device, + dtype=torch.int32).unsqueeze(0) + mask = mask >= metadata.block_usage.unsqueeze(-1) + attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_( + mask, -math.inf)) + + if not is_fake_hpu() and htorch.utils.internal.is_lazy(): + block_mapping = torch.nn.functional.one_hot(metadata.block_groups, + num_classes=batch_size) + else: + # Unfortunately one_hot on CPU/torch.compile mode/eager mode + # doesn't handle out of bounds classes so we need to convert + # all negative values to 0 (block_mapping) or bs (block_groups) + block_groups = metadata.block_groups.to(torch.long) + block_mapping = torch.nn.functional.relu(block_groups) + block_mapping = torch.nn.functional.one_hot(block_mapping, + num_classes=batch_size) + oob_values = block_groups.lt(0) + block_mapping.masked_fill_(oob_values.unsqueeze(-1), 0) + block_groups.masked_fill_(oob_values, batch_size) + metadata = metadata._replace(block_groups=block_groups) + block_mapping = block_mapping.to(dtype) + metadata = metadata._replace(block_mapping=block_mapping, + attn_bias=attn_bias) + return metadata + + def _set_block_scales(self, metadata, device): + block_mapping = metadata.block_mapping + ones = torch.ones((block_mapping.size(0), ), + device=device, + dtype=block_mapping.dtype) + sums = batch2block(block2batch(ones, block_mapping), block_mapping) + block_scales = torch.reciprocal(torch.maximum(ones, sums)) + metadata = metadata._replace(block_scales=block_scales) + return metadata + + def _set_indices_and_offsets(self, metadata, block_size, is_prompt): + slot_mapping = metadata.slot_mapping.flatten() + indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + if is_prompt: + indices = indices.unflatten(0, (-1, block_size))[:, 0] + offsets = None + else: + offsets = torch.fmod(slot_mapping, block_size) + metadata = metadata._replace(block_offsets=offsets, + block_indices=indices) + return metadata + + def _update_metadata(self, attn_metadata, batch_size, seq_len, device, + dtype): + if attn_metadata.is_prompt: + attn_metadata = self._set_attn_bias(attn_metadata, batch_size, + seq_len, device, dtype) + else: + attn_metadata = self._set_block_mapping(attn_metadata, batch_size, + device, dtype) + attn_metadata = self._set_block_scales(attn_metadata, device) + attn_metadata = self._set_indices_and_offsets(attn_metadata, + self.block_size, + attn_metadata.is_prompt) + return attn_metadata + + def forward(self, *args, **kwargs): + kwargs = kwargs.copy() + input_ids = kwargs['input_ids'] + kwargs['attn_metadata'] = self._update_metadata( + kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), + input_ids.device, self.dtype) + hidden_states = self.model(*args, **kwargs) + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + return hidden_states + + def compute_logits(self, *args, **kwargs): + return self.model.compute_logits(*args, **kwargs) + + def sample(self, *args, **kwargs): + return self.model.sample(*args, **kwargs) + + def generate_proposals(self, *args, **kwargs): + return self.model.generate_proposals(*args, **kwargs) + + # sampler property will be used by spec_decode_worker + # don't rename + @property + def sampler(self): + return self.model.sampler + + +def _maybe_wrap_in_hpu_graph(*args, **kwargs): + return htorch.hpu.wrap_in_hpu_graph( + HpuModelAdapter(*args, **kwargs), disable_tensor_cache=True + ) if htorch.utils.internal.is_lazy() else HpuModelAdapter(*args, **kwargs) + + +def subtuple(obj: object, + typename: str, + to_copy: List[str], + to_override: Optional[Dict[str, object]] = None): + if obj is None: + return None + if to_override is None: + to_override = {} + fields = set(to_copy) | set(to_override.keys()) + if type(obj) is dict: + values = {key: obj[key] for key in fields if key in obj} + else: + values = {f: to_override.get(f, getattr(obj, f)) for f in fields} + if typename not in _TYPE_CACHE: + _TYPE_CACHE[typename] = collections.namedtuple(typename, + ' '.join(fields)) + return _TYPE_CACHE[typename](**values) + + +def trim_attn_metadata(metadata: HPUAttentionMetadata) -> object: + # NOTE(kzawora): To anyone working on this in the future: + # Trimming metadata is required when using HPUGraphs. + # Attention metadata is going to be hashed by PT bridge, and + # appropriate HPUGraphs will be matched based on all inputs' hash. + + # Before you put more keys in here, make sure you know their + # value type and make sure you know how it's going to be hashed. + # You can find that information in input_hash function + # in habana_frameworks/torch/hpu/graphs.py. You can also hash + # it manually with torch.hpu.graphs.input_hash(attention_metadata) + + # If you use primitive types here - they will get hashed based + # on their value. You *will* get lots of excessive graph captures + # (and an OOM eventually) if you decide to put something like + # seq_len int here. + # If you absolutely need a scalar, put it in a tensor. Tensors + # get hashed using their metadata, not their values: + # input_hash(torch.tensor(123)) == input_hash(torch.tensor(321)) + # input_hash(123) != input_hash(321) + # input_hash("abc") != input_hash("cba") + attention_metadata = subtuple(metadata, 'TrimmedAttentionMetadata', [ + 'attn_bias', 'seq_lens_tensor', 'context_lens_tensor', 'block_list', + 'block_mapping', 'block_usage', 'slot_mapping', 'is_prompt', + 'block_indices', 'block_offsets', 'block_scales', 'block_groups' + ]) + return attention_metadata + + +def next_pow2(value: int, base: int): + res = base + while value > 1: + value = (value + 1) // 2 + res *= 2 + return res + + +def round_up(value: int, k: int): + return (value + k - 1) // k * k + + +def pad_list(list, k, v): + target_len = round_up(len(list), k) + padding = target_len - len(list) + return list + [v] * padding + + +def precompute_indices_and_offsets(block_size, slot_mapping, is_prompt): + slot_mapping = slot_mapping.flatten() + indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + if is_prompt: + indices = indices.unflatten(0, (-1, block_size))[:, 0] + offsets = None + else: + offsets = torch.fmod(slot_mapping, block_size) + return indices, offsets + + +class HPUModelRunner: + + def __init__( + self, + vllm_config: VllmConfig, + ): + #TODO(kzawora): remove this, this is ugly and only used for diagnostics + self._ENGINE_ITER = 0 + # TODO: use ModelRunnerBase.__init__(self, vllm_config=vllm_config) + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.lora_config = vllm_config.lora_config + self.load_config = vllm_config.load_config + self.parallel_config = vllm_config.parallel_config + self.scheduler_config = vllm_config.scheduler_config + self.device_config = vllm_config.device_config + self.speculative_config = vllm_config.speculative_config + self.prompt_adapter_config = vllm_config.prompt_adapter_config + self.observability_config = vllm_config.observability_config + #TODO(kzawora): remove this, this is for debug purposes only + self._tokenizer = Detokenizer( + vllm_config.model_config.tokenizer).tokenizer + model_config = self.model_config + cache_config = self.cache_config + scheduler_config = self.scheduler_config + parallel_config = self.parallel_config + self.device = self.device_config.device + self.pin_memory = is_pin_memory_available() + self.dtype = self.model_config.dtype + if cache_config.cache_dtype == "auto": + self.kv_cache_dtype = self.dtype + else: + self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ + cache_config.cache_dtype] + + self.sliding_window = model_config.get_sliding_window() + self.block_size = cache_config.block_size + self.max_model_len = model_config.max_model_len + self.max_num_blocks_per_req = cdiv(self.max_model_len, self.block_size) + self.max_num_tokens = scheduler_config.max_num_batched_tokens + + # Model-related. + self.num_attn_layers = model_config.get_num_attention_layers( + parallel_config) + self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) + self.head_size = model_config.get_head_size() + + # Lazy initialization + # self.model: nn.Module # Set after load_model + self.kv_caches: List[torch.Tensor] = [] + + # Request states. + self.requests: Dict[str, CachedRequestState] = {} + # Persistent batch. + self.input_batch = InputBatch( + max_num_reqs=self.scheduler_config.max_num_seqs, + max_model_len=self.max_model_len, + max_num_blocks_per_req=self.max_num_blocks_per_req, + device=self.device, + pin_memory=self.pin_memory, + ) + + self.use_hpu_graph = not self.model_config.enforce_eager + # TODO(woosuk): Provide an option to tune the max cudagraph batch size. + self.cudagraph_batch_sizes = [1, 2, 4] + [i for i in range(8, 513, 8)] + self.max_batch_size = 256 # TODO(kzawora): fix this garbage + self.input_ids = torch.zeros( + (self.max_batch_size, self.max_num_tokens), + dtype=torch.int32, + device=self.device) + self.positions = torch.zeros( + (self.max_batch_size, self.max_num_tokens), + dtype=torch.int64, + device=self.device) + self.prefill_positions = torch.tensor( + range(self.max_model_len), + device="cpu", + ).to(torch.int32).reshape(1, -1) + + self.max_num_seqs = self.scheduler_config.max_num_seqs + self.max_prefill_batch_size = 16 # TODO(kzawora): add knob for that + self.padding_aware_scheduling = True # TODO(kzawora): add knob for that + self.padding_ratio_threshold = 0.9 # TODO(kzawora): add knob for that + self.use_contiguous_pa = os.environ.get('VLLM_CONTIGUOUS_PA', + 'true').lower() == 'true' + self.seen_configs: set = set() + self.enable_bucketing = os.environ.get( + 'VLLM_DISABLE_BUCKETING', 'false').lower() not in ['true', '1'] + if self.enable_bucketing: + logger.info("Bucketing is ON.") + self.bucketing_ctx = HPUBucketingContext( + self.max_num_seqs, self.max_prefill_batch_size, + self.block_size, self.scheduler_config.max_num_batched_tokens) + self.graphed_buckets: Set[Any] = set() + else: + logger.info("Bucketing is OFF.") + self.skip_warmup = os.environ.get('VLLM_SKIP_WARMUP', + 'false').lower() == 'true' + + def _update_states(self, scheduler_output: "SchedulerOutput") -> None: + # Remove stopped requests from the cached states. + # Keep the states of the pre-empted requests. + for req_id in scheduler_output.finished_req_ids: + self.requests.pop(req_id, None) + + # Remove the requests from the persistent batch. + stopped_req_ids = set().union( + scheduler_output.preempted_req_ids, + scheduler_output.finished_req_ids, + ) + removed_req_indices: List[int] = [] + for req_id in stopped_req_ids: + req_index = self.input_batch.remove_request(req_id) + if req_index is not None: + removed_req_indices.append(req_index) + + # Update the states of the running requests. + for req_data in scheduler_output.scheduled_running_reqs: + req_id = req_data.req_id + req_state = self.requests[req_id] + req_index = self.input_batch.req_id_to_index[req_id] + + # Update the num_computed_tokens. + req_state.num_computed_tokens = req_data.num_computed_tokens + self.input_batch.num_computed_tokens_cpu[req_index] = ( + req_data.num_computed_tokens) + + # Update the block table. + num_new_blocks = len(req_data.new_block_ids) + if num_new_blocks == 0: + continue + start_index = len(req_state.block_ids) + end_index = start_index + num_new_blocks + req_state.block_ids.extend(req_data.new_block_ids) + self.input_batch.block_table_cpu[ + req_index, start_index:end_index] = req_data.new_block_ids + + req_ids_to_add: List[str] = [] + # Add new requests to the cached states. + for req_data in scheduler_output.scheduled_new_reqs: + req_id = req_data.req_id + sampling_params = req_data.sampling_params + if sampling_params.sampling_type == SamplingType.RANDOM_SEED: + generator = torch.Generator(device=self.device) + generator.manual_seed(sampling_params.seed) + else: + generator = None + + self.requests[req_id] = CachedRequestState( + req_id=req_id, + prompt_token_ids=req_data.prompt_token_ids, + prompt=req_data.prompt, + mm_inputs=req_data.mm_inputs, + mm_positions=req_data.mm_positions, + sampling_params=sampling_params, + generator=generator, + block_ids=req_data.block_ids, + num_computed_tokens=req_data.num_computed_tokens, + output_token_ids=[], + ) + req_ids_to_add.append(req_id) + + # Update the cached states of the resumed requests. + for req_data in scheduler_output.scheduled_resumed_reqs: + req_id = req_data.req_id + req_state = self.requests[req_id] + + req_state.block_ids = req_data.block_ids + req_state.num_computed_tokens = req_data.num_computed_tokens + req_ids_to_add.append(req_id) + + # THIS MOVES ALL THE DECODES TO THE FIRST N IN BATCH. + # Condense the batched states if there are empty indices. + removed_req_indices = sorted(removed_req_indices, reverse=True) + if removed_req_indices: + self.input_batch.condense(removed_req_indices) + + # ALL THE PREFILLS ARE THE LAST M IN THE BATCH. + # These are added at the end after the bacth is condensed. + self.input_batch.num_prefills = len(req_ids_to_add) + for req_id in req_ids_to_add: + req_state = self.requests[req_id] + self.input_batch.add_request(req_state, None) + + def _prepare_sampling(self, + scheduler_output: "SchedulerOutput", + start_idx: Optional[int] = None, + end_idx: Optional[int] = None, + pad_to: Optional[int] = None) -> SamplingMetadata: + skip_copy = True + if start_idx is None and end_idx is None: + if (scheduler_output.finished_req_ids + or scheduler_output.preempted_req_ids): + skip_copy = False + if (scheduler_output.scheduled_new_reqs + or scheduler_output.scheduled_resumed_reqs): + skip_copy = False + else: + #TODO(kzawora): something smells... kinda fishy in here + req_ids = self.input_batch.req_ids[start_idx:end_idx] + finished_req_ids = any([ + req_id in scheduler_output.finished_req_ids + for req_id in req_ids + ]) + preempted_req_ids = any([ + req_id in scheduler_output.preempted_req_ids + for req_id in req_ids + ]) + scheduled_new_reqs = any([ + req_id in scheduler_output.scheduled_new_reqs + for req_id in req_ids + ]) + scheduled_resumed_reqs = any([ + req_id in scheduler_output.scheduled_resumed_reqs + for req_id in req_ids + ]) + + if (finished_req_ids or preempted_req_ids): + skip_copy = False + if (scheduled_new_reqs or scheduled_resumed_reqs): + skip_copy = False + + # Create the sampling metadata. + sampling_metadata = self.input_batch.make_sampling_metadata( + skip_copy=skip_copy, + start_idx=start_idx, + end_idx=end_idx, + pad_to=pad_to) + return sampling_metadata + + def get_habana_paged_attn_buffers(self, + block_tables, + slot_mapping, + bucketing=True): + + last_block_usage = [ + slot[0] % self.block_size + 1 for slot in slot_mapping + ] + block_groups = [[i] * len(bt) for i, bt in enumerate(block_tables)] + block_usage = [[self.block_size] * (len(bt) - 1) + [lbu] + for bt, lbu in zip(block_tables, last_block_usage) + if bt] + + block_list = flatten(block_tables) + block_groups = flatten(block_groups) + block_usage = flatten(block_usage) + assert len(block_list) == len(block_groups) + assert len(block_list) == len(block_usage) + + padding_fn = None + if self.use_contiguous_pa: + block_bucket_size = max(max(block_list) + 1, len(block_list)) + if bucketing: + block_bucket_size = \ + self.bucketing_ctx.get_padded_decode_num_blocks( + block_bucket_size) + indices: List[Any] + indices = [None] * block_bucket_size + for i, bid in enumerate(block_list): + indices[bid] = i + padding_fn = lambda tensor, pad_value: gather_list( + tensor, indices, pad_value) + else: + block_bucket_size: int + if bucketing: + block_bucket_size = \ + self.bucketing_ctx.get_padded_decode_num_blocks( + len(block_list)) + else: + block_bucket_size = len(block_list) + padding_fn = lambda tensor, pad_value: pad_list( + tensor, block_bucket_size, pad_value) + + block_list = padding_fn(block_list, _PAD_BLOCK_ID) + block_groups = padding_fn(block_groups, -1) + block_usage = padding_fn(block_usage, 1) + + block_list = torch.tensor(block_list, dtype=torch.long, device='cpu') + block_groups = torch.tensor(block_groups, + dtype=torch.long, + device='cpu') + block_usage = torch.tensor(block_usage, + dtype=self.model_config.dtype, + device='cpu') + + return block_list, block_groups, block_usage + + def _get_padded_prefill_dims(self, num_prefills, max_prompt_len, + bucketing): + if bucketing: + padded_batch_size = self.bucketing_ctx.get_padded_batch_size( + num_prefills, True) + padded_prompt_len = self.bucketing_ctx.get_padded_prompt_seq_len( + max_prompt_len) + else: + #NOTE(kzawora): On HPU prompt length needs to be block_size + # aligned, so we're padding to that, even if bucketing + # is disabled. + padded_batch_size = num_prefills + padded_prompt_len = math.ceil( + max_prompt_len / self.block_size) * self.block_size + assert padded_prompt_len <= self.max_model_len + return padded_batch_size, padded_prompt_len + + def _prepare_prefill_inputs(self, + num_scheduled_tokens: List[int], + bucketing=True) -> PrefillInputData: + # Each prefill run separately with shape [1, padded_prompt_len]. + # So we create lists that will be used in execute_model(). + + prefill_request_ids = [] + prefill_prompt_lens = [] + prefill_token_ids = [] + prefill_position_ids = [] + prefill_attn_metadata = [] + prefill_logits_indices = [] + + # DECODES are the first num_decodes REQUESTS. + # PREFILLS are the next num_reqs - num_decodes REQUESTS. + num_reqs = self.input_batch.num_reqs + num_decodes = self.input_batch.num_decodes + # NOTE(kzawora): This loop was initially implemented as + # for batch_idx in range(num_decodes, num_reqs, max_prefill_batch_size) + # but was changed to accommodate variable loop step size for + # padding-aware scheduling + batch_idx = num_decodes + while batch_idx < num_reqs: + # Find the largest batch size in range [1, max_prefill_batch_size] + # that can fit within specified token budget + num_prefills: int + padded_batch_size: int + padded_prompt_len: int + padded_num_tokens: int + padding_ratio: float + batch_req_ids: List[int] + prompt_lens: List[int] + for possible_batch_size in reversed( + range(1, self.max_prefill_batch_size + 1)): + if batch_idx + possible_batch_size > num_reqs: + continue + num_prefills = possible_batch_size + batch_req_ids = self.input_batch.req_ids[batch_idx:batch_idx + + num_prefills] + prompt_lens = num_scheduled_tokens[batch_idx:batch_idx + + num_prefills] + max_prompt_len = max(prompt_lens) + num_tokens = sum(prompt_lens) + padded_batch_size, padded_prompt_len = \ + self._get_padded_prefill_dims(num_prefills, + max_prompt_len, bucketing) + padded_num_tokens = padded_batch_size * padded_prompt_len + padding_ratio = 1 - (num_tokens / padded_num_tokens) + is_within_token_budget = padded_batch_size * padded_prompt_len \ + < self.scheduler_config.max_num_batched_tokens + is_within_padding_ratio_threshold = padding_ratio < \ + self.padding_ratio_threshold + can_schedule = is_within_token_budget and \ + is_within_padding_ratio_threshold + # If padding aware scheduling is off, we'll break on the first + # loop iteration (==max_prefill_batch_size). + # Else, we'll break on first batch size that fits token budget. + if not self.padding_aware_scheduling or can_schedule: + break + context_lens = self.input_batch.num_computed_tokens_cpu[ + batch_idx:batch_idx + num_prefills] + use_prefix_caching = any(context_lens) + # TODO(kzawora): this is an ugly hack for prefix caching, remove + # that once batch padding works properly (idk why it doesn't) + if use_prefix_caching: + padded_batch_size = num_prefills + + padded_prompt_lens = [ + padded_prompt_len for _ in range(padded_batch_size) + ] + + # TOKEN_IDS. + token_ids = torch.zeros((padded_batch_size, padded_prompt_len), + dtype=torch.int32, + device='cpu') + # POSITIONS. + positions = torch.zeros((padded_batch_size, padded_prompt_len), + dtype=torch.int32, + device='cpu') + # SLOT_MAPPING. + # The "slot" is the "physical index" of a token in the KV cache. + # Look up the block_idx in the block table (logical<>physical map) + # to compute this. + slot_mapping = torch.zeros((padded_batch_size, padded_prompt_len), + dtype=torch.int32, + device='cpu') + slot_mapping.fill_(_PAD_SLOT_ID) + + for i, (prompt_len, + context_len) in enumerate(zip(prompt_lens, context_lens)): + # Prepare and sanitize token ids (cpu) + batch_offset = batch_idx + i + token_ids[i, :prompt_len] = torch.from_numpy( + self.input_batch.token_ids_cpu[batch_offset, + context_len:context_len + + prompt_len]) + #token_ids[i, prompt_len:] = 0 # no need to sanitize - buffer + # is pre-filled with 0s + + # Prepare and sanitize positions ids (cpu) + positions[ + i, :prompt_len] = self.prefill_positions[:, context_len: + context_len + + prompt_len] + #positions[i, prompt_len:] = 0 # no need to sanitize - buffer + # is pre-filled with 0s + + # Prepare and sanitize slot_mapping (cpu) + flat_prefill_positions = positions[i, :prompt_len].flatten() + block_numbers = self.input_batch.block_table_cpu_tensor[ + batch_offset, flat_prefill_positions // self.block_size] + block_offsets = flat_prefill_positions % self.block_size + slot_mapping[ + i, : + prompt_len] = block_numbers * self.block_size + \ + block_offsets + #slot_mapping[i, prompt_len:] = _PAD_SLOT_ID # no need to + # sanitize - buffer is pre-filled with _PAD_SLOT_IDs + slot_mapping = slot_mapping.long() + + logits_indices = torch.zeros(padded_batch_size, + dtype=torch.int32, + device='cpu') + query_start_loc = torch.empty((num_prefills + 1, ), + dtype=torch.int32, + device="cpu") + query_start_loc_np = query_start_loc.numpy() + query_start_loc_np[0] = 0 + + # logits indices in prefill must account for padding: last + # token logits will be emitted at index + # (idx - 1) * padded_seq_len + seq_len[idx] - 1 + np.cumsum(padded_prompt_lens[:num_prefills], + out=query_start_loc_np[1:]) + query_start_loc_np[:num_prefills] += num_scheduled_tokens[ + batch_idx:batch_idx + num_prefills] + logits_indices[:num_prefills] = query_start_loc[:num_prefills] - 1 + + # HPU should *not* sync here with CPU + seq_lens_tensor = torch.zeros((padded_batch_size), + dtype=torch.int32, + device='cpu') + seq_lens_tensor[:num_prefills] = torch.tensor(prompt_lens, + device='cpu') + token_ids_device = _async_h2d_tensor_copy(token_ids, self.device) + positions_device = _async_h2d_tensor_copy(positions, self.device) + seq_lens_tensor_device = _async_h2d_tensor_copy( + seq_lens_tensor, self.device) + slot_mapping_device = _async_h2d_tensor_copy( + slot_mapping, self.device) + logits_indices_device = _async_h2d_tensor_copy( + logits_indices, self.device) + + prefill_request_ids.append(batch_req_ids) + prefill_prompt_lens.append(prompt_lens) + prefill_token_ids.append(token_ids_device) + prefill_position_ids.append(positions_device) + prefill_logits_indices.append(logits_indices_device) + attn_metadata = None + if use_prefix_caching: + # Prefix caching + num_blocks = np.ceil(context_lens / self.block_size).astype( + np.int32).tolist() + max_num_blocks = max(num_blocks) + #if bucketing: + # max_num_blocks = self.bucketing_ctx.get_padded_decode_num_blocks(max_num_blocks) # noqa + prefix_block_tables = torch.zeros( + (padded_batch_size, max_num_blocks), + dtype=torch.int32, + device='cpu') + for i, n in enumerate(num_blocks): + prefix_block_tables[ + i, :n] = self.input_batch.block_table_cpu_tensor[i, :n] + context_lens_tensor = torch.zeros((padded_batch_size), + dtype=torch.int32, + device='cpu') + context_lens_tensor[:num_prefills] = torch.tensor(context_lens, + device='cpu') + + block_list_device = _async_h2d_tensor_copy( + prefix_block_tables.flatten(), self.device) + context_lens_tensor_device = _async_h2d_tensor_copy( + context_lens_tensor, self.device) + attn_metadata = \ + HPUAttentionMetadata.make_cached_prefill_metadata( + seq_lens_tensor=seq_lens_tensor_device, + context_lens_tensor=context_lens_tensor_device, + num_prefills=num_prefills, + num_prefill_tokens=sum(prompt_lens), + slot_mapping=slot_mapping_device, + block_list=block_list_device) + else: + attn_metadata = HPUAttentionMetadata.make_prefill_metadata( + seq_lens_tensor=seq_lens_tensor_device, + num_prefills=num_prefills, + num_prefill_tokens=sum(prompt_lens), + slot_mapping=slot_mapping_device, + ) + #import pdb; pdb.set_trace() + # ATTN_METADATA. + prefill_attn_metadata.append(attn_metadata) + batch_idx += num_prefills + + return PrefillInputData(request_ids=prefill_request_ids, + prompt_lens=prefill_prompt_lens, + token_ids=prefill_token_ids, + position_ids=prefill_position_ids, + attn_metadata=prefill_attn_metadata, + logits_indices=prefill_logits_indices) + + def _prepare_decode_inputs(self, + num_scheduled_tokens, + bucketing=True) -> DecodeInputData: + # Decodes run as one single padded batch with shape [batch, 1] + # + # We need to set _PAD_SLOT_ID for the padding tokens in the + # slot_mapping, such that the attention KV cache insertion + # logic knows to ignore those indicies. Otherwise, the + # padding data can be dummy since we have a causal mask. + + # num_reqs = self.input_batch.num_reqs + num_decodes = self.input_batch.num_decodes + if num_decodes == 0: + return DecodeInputData(num_decodes=0) + + # PAD FOR STATIC SHAPES. + padded_batch_size: int + if bucketing: + padded_batch_size = self.bucketing_ctx.get_padded_batch_size( + num_decodes, False) + else: + padded_batch_size = num_decodes + + # POSITIONS. [batch, 1] + # We slice at the end, since we use the positions for gathering. + positions = torch.from_numpy( + self.input_batch.num_computed_tokens_cpu.reshape(-1, 1)) + index = positions.to(torch.int64) + positions = positions[:padded_batch_size] + + # TOKEN_IDS. [batch, 1] + token_ids = torch.zeros((padded_batch_size, 1), dtype=torch.int32) + token_ids[:num_decodes] = torch.gather( + input=torch.from_numpy(self.input_batch.token_ids_cpu), + dim=1, + index=index, + )[:num_decodes] + + # SLOT_MAPPING [batch, 1] + # The "slot" is the "physical index" of a token in the KV cache. + # Look up the block_idx in the block table (logical<>physical map) + # to compute this. + block_number = torch.gather( + input=self.input_batch.block_table_cpu_tensor, + dim=1, + index=(index // self.block_size)) + # NOTE(kzawora): the "-1" is what causes this entire thing to work + # properly and have good accuracy - why? beats me... + block_offsets = (index - 1) % self.block_size + slot_mapping = block_number * self.block_size + block_offsets + # Set an out of range value for the padding tokens so that they + # are ignored when inserting into the KV cache. + slot_mapping[num_decodes:] = _PAD_SLOT_ID + slot_mapping = slot_mapping[:padded_batch_size] + + # BLOCK_TABLE [batch, max_num_blocks_per_req] + context_lens = self.input_batch.num_computed_tokens_cpu[:num_decodes] + num_blocks = np.ceil(context_lens / self.block_size).astype( + np.int32).tolist() + block_tables_list = [] + for i, n in enumerate(num_blocks): + block_tables_list.append( + self.input_batch.block_table_cpu_tensor[i, :n].tolist()) + + # CONTEXT_LENS [batch_size] + #context_lens = (positions.reshape(-1) + 1) + + block_list, block_groups, block_usage = \ + self.get_habana_paged_attn_buffers( + block_tables_list, slot_mapping.tolist(), bucketing) + + logits_indices = torch.zeros(padded_batch_size, + dtype=torch.int32, + device='cpu') + query_start_loc = torch.empty((num_decodes + 1, ), + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) + query_start_loc_np = query_start_loc.numpy() + query_start_loc_np[0] = 0 + np.cumsum(num_scheduled_tokens[:num_decodes], + out=query_start_loc_np[1:]) + logits_indices[:num_decodes] = query_start_loc[1:] - 1 + num_decode_tokens = torch.tensor(np.sum(context_lens), device='cpu') + + # CPU<>HPU sync *should not* happen here. + token_ids_device = _async_h2d_tensor_copy(token_ids, self.device) + positions_device = _async_h2d_tensor_copy(positions, self.device) + logits_indices_device = _async_h2d_tensor_copy(logits_indices, + self.device) + block_list_device = _async_h2d_tensor_copy(block_list, self.device) + block_usage_device = _async_h2d_tensor_copy(block_usage, self.device) + block_groups_device = _async_h2d_tensor_copy(block_groups, self.device) + num_decode_tokens_device = _async_h2d_tensor_copy( + num_decode_tokens, self.device) + slot_mapping_device = _async_h2d_tensor_copy(slot_mapping, self.device) + + return DecodeInputData( + num_decodes=num_decodes, + token_ids=token_ids_device, + position_ids=positions_device, + logits_indices=logits_indices_device, + attn_metadata=HPUAttentionMetadata.make_decode_metadata( + block_list=block_list_device, + block_usage=block_usage_device, + block_groups=block_groups_device, + num_decode_tokens=num_decode_tokens_device, + slot_mapping=slot_mapping_device, + )) + + def _prepare_inputs( + self, + scheduler_output: "SchedulerOutput", + bucketing=True + ) -> Tuple[PrefillInputData, Optional[DecodeInputData]]: + + total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + assert total_num_scheduled_tokens > 0 + + num_reqs = self.input_batch.num_reqs + num_decodes = self.input_batch.num_decodes + + # Get the number of scheduled tokens for each request. + # TODO: The Python loop can be slow. Optimize. + num_scheduled_tokens = [] + for idx, req_id in enumerate(self.input_batch.req_ids[:num_reqs]): + num_tokens = scheduler_output.num_scheduled_tokens[req_id] + num_scheduled_tokens.append(num_tokens) + + # NOTE: assert that all the decodes are "decodes". + if idx < num_decodes: + assert num_tokens == 1 + + return ( + self._prepare_prefill_inputs(num_scheduled_tokens, bucketing), + self._prepare_decode_inputs(num_scheduled_tokens, bucketing), + ) + + def _seq_len(self, attn_metadata): + return attn_metadata.slot_mapping.size(1) + + def _num_blocks(self, attn_metadata): + if attn_metadata.block_list is None: + return 0 + return attn_metadata.block_list.numel() + + def _phase(self, attn_metadata): + phase_type: PhaseType + is_prompt = attn_metadata.is_prompt + is_prefix_cached = is_prompt and attn_metadata.block_list is not None + if is_prompt and is_prefix_cached: + phase_type = PhaseType.CACHED_PREFILL + elif is_prompt and not is_prefix_cached: + phase_type = PhaseType.UNCACHED_PREFILL + elif not is_prompt: + phase_type = PhaseType.DECODE + else: + raise ValueError("Unrecognized pass type, likely due to malformed " + "attention metadata") + return phase_type + + def _check_config(self, batch_size, seq_len, num_blocks, attn_metadata, + warmup_mode): + phase = self._phase(attn_metadata) + cfg = (batch_size, seq_len, num_blocks, phase) + seen = cfg in self.seen_configs + self.seen_configs.add(cfg) + if not seen and not warmup_mode: + phase = phase.value + logger.warning( + "Configuration: (%s, %s, %s, %s) was not warmed-up!", phase, + batch_size, seq_len, num_blocks) + + def _execute_model_generic(self, + token_ids, + position_ids, + attn_metadata, + logits_indices, + kv_caches, + warmup_mode=False): + # FORWARD. + batch_size = token_ids.size(0) + seq_len = self._seq_len(attn_metadata) + num_blocks = self._num_blocks(attn_metadata) + + is_prompt = attn_metadata.is_prompt + self._check_config(batch_size, seq_len, num_blocks, attn_metadata, + warmup_mode) + additional_kwargs = {} + if htorch.utils.internal.is_lazy( + ) and not self.model_config.enforce_eager: + use_graphs = self._use_graphs(batch_size, seq_len, num_blocks, + is_prompt) + additional_kwargs.update({"bypass_hpu_graphs": not use_graphs}) + #import pdb; pdb.set_trace() + trimmed_attn_metadata = trim_attn_metadata(attn_metadata) + hidden_states = self.model.forward(input_ids=token_ids, + positions=position_ids, + attn_metadata=trimmed_attn_metadata, + kv_caches=kv_caches) + #hidden_states = hidden_states[:num_scheduled_tokens] + hidden_states = hidden_states[logits_indices] + logits = self.model.compute_logits(hidden_states, None) + return logits + + @torch.inference_mode() + def execute_model( + self, + scheduler_output: "SchedulerOutput", + ) -> ModelRunnerOutput: + # NOTE(kzawora): Since scheduler doesn't differentiate between prefills + # and decodes, we must handle mixed batches. In _update_states we make + # sure that first self.input_batch.num_decodes requests are decodes, + # and remaining ones until the end are prefills. _update_states also + # handles changes in request cache based on scheduler outputs and + # previous iterations (e.g. keeping block tables and context lengths up + # to date, creating, pruning and updating request caches, + # and some more stuff) + + # If num_decodes == self.input_batch.num_reqs, then batch is all decode, and only a single decode forward pass will be executed in this method. # noqa + # If num_decodes == 0, then batch is all prefill, and only prefill forward passes will be executed in this method. # noqa + # If neither apply, then batch is mixed, and both prefill and decode forward passes will be executed in this method. # noqa + + # First, we will execute all decodes (if any) in a single batch, + # then we'll execute prefills in batches of up to max_prefill_batch_size elements. # noqa + # All shapes used in forward passes are bucketed appropriately to mitigate risk of graph recompilations. # noqa + + # We can do sampling directly after executing each forward pass (split_sampler=True), # noqa + # or execute all forward passes, join the results and execute it once (split_sampler=False). # noqa + # Everything is done asynchronously - the only sync point is the place + # where we copy the generated tokens back to the host. + + # Example: If a batch has 6 requests, 3 prefills and 3 decodes, the unprocessed sequences in batch will be laid as follows: # noqa + # [D0, D1, D2, P0, P1, P2] + # If we assume max_prefill_batch_size=2, and split_sampler=True the flow of this method will look as follows: # noqa + # prepare_inputs: bucket [D0, D1, D2] -> [D0, D1, D2, 0] (BS=4 bucket, 1 seq padding) # noqa + # prepare_inputs: bucket [P0, P1, P2] -> [P0, P1], [P2] (BS=2 + BS=1 bucket, no seqs padding) # noqa + # decode forward pass BS4 [D0, D1, D2, 0] + # decode compute_logits BS4 [D0, D1, D2, 0] + # decode sampler BS4 [D0, D1, D2, 0] -> [tokD0, tokD1, tokD2, 0] + # prefill[iter 0] forward pass BS2 [P0, P1] + # prefill[iter 0] compute_logits BS2 [P0, P1] + # prefill[iter 0] sampler BS2 [P0, P1] -> [tokP0, tokP1] + # prefill[iter 1] forward pass BS1 [P0, P1] + # prefill[iter 1] compute_logits BS1 [P0, P1] + # prefill[iter 1] sampler BS1 [P0, P1] -> [tokP2] + # prefill concat sampler results [tokP0, tokP1], [tokP2] -> [tokP0, tokP1, tokP2] # noqa + # Join the prefill and decode on device into [tokD0, tokD1, tokD2, 0, tokP0, tokP1, tokP2] # noqa + # Transfer [tokD0, tokD1, tokD2, 0, tokP0, tokP1, tokP2] to CPU + # On CPU, sanitize [tokD0, tokD1, tokD2, 0, tokP0, tokP1, tokP2] -> [tokD0, tokD1, tokD2, tokP0, tokP1, tokP2] # noqa + # Return [tokD0, tokD1, tokD2, tokP0, tokP1, tokP2] + + # Example2: Same thing, but with max_prefill_batch_size=4: + # prepare_inputs: bucket [D0, D1, D2] -> [D0, D1, D2, 0] (BS=4 bucket, 1 seq padding) # noqa + # prepare_inputs: bucket [P0, P1, P2] -> [P0, P1, P2, 0] (BS=4 bucket, 1 seq padding) # noqa + # decode forward pass BS4 [D0, D1, D2, 0] + # decode compute_logits BS4 [D0, D1, D2, 0] + # decode sampler BS4 [D0, D1, D2, 0] -> [tokD0, tokD1, tokD2, 0] + # prefill[iter 0] forward pass BS4 [P0, P1, P2, 0] + # prefill[iter 0] compute_logits BS4 [P0, P1, P2, 0] + # prefill[iter 0] sampler BS4 [P0, P1, P2, 0] -> [tokP0, tokP1, tokP2, 0] # noqa + # Join the prefill and decode on device into [tokD0, tokD1, tokD2, 0, tokP0, tokP1, tokP2, 0] # noqa + # Transfer [tokD0, tokD1, tokD2, 0, tokP0, tokP1, tokP2, 0] to CPU + # On CPU, sanitize [tokD0, tokD1, tokD2, 0, tokP0, tokP1, tokP2, 0] -> [tokD0, tokD1, tokD2, tokP0, tokP1, tokP2] # noqa + # Return [tokD0, tokD1, tokD2, tokP0, tokP1, tokP2] + + # Example2: Same thing, but max_prefill_batch_size=4 and split_sampler=False: # noqa + # prepare_inputs: bucket [D0, D1, D2] -> [D0, D1, D2, 0] (BS=4 bucket, 1 seq padding) # noqa + # prepare_inputs: bucket [P0, P1, P2] -> [P0, P1, P2, 0] (BS=4 bucket, 1 seq padding) # noqa + # decode forward pass BS4 [D0, D1, D2, 0] + # decode compute_logits BS4 [D0, D1, D2, 0] + # prefill[iter 0] forward pass BS4 [P0, P1, P2, 0] + # prefill[iter 0] compute_logits BS4 [P0, P1, P2, 0] + # Join the prefill and decode on device into [D0, D1, D2, 0, P0, P1, P2, 0] # noqa + # joint sampler BS8 [D0, D1, D2, 0, P0, P1, P2, 0] -> [tokD0, tokD1, tokD2, 0, tokP0, tokP1, tokP2, 0] # noqa + # Transfer [tokD0, tokD1, tokD2, 0, tokP0, tokP1, tokP2, 0] to CPU + # On CPU, sanitize [tokD0, tokD1, tokD2, 0, tokP0, tokP1, tokP2, 0] -> [tokD0, tokD1, tokD2, tokP0, tokP1, tokP2] # noqa + # Return [tokD0, tokD1, tokD2, tokP0, tokP1, tokP2] + + self._update_states(scheduler_output) + prefill_data, decode_data = self._prepare_inputs( + scheduler_output, bucketing=self.enable_bucketing) + num_reqs = self.input_batch.num_reqs + num_decodes = decode_data.num_decodes + num_prefills = num_reqs - num_decodes + num_padded_decodes = decode_data.token_ids.shape[ + 0] if num_decodes > 0 else 0 + + #FIXME(kzawora): Currently there's no handling of logprobs. Fix that + # later. + logprob_token_ids = None + logprobs = None + prefill_output_tokens = [] + decode_output_tokens = [] + + ######################### DECODES ######################### + # Decodes run as one single batch with [padded_decode_bs, 1] + if num_decodes > 0: + htorch.core.mark_step() + logits_device = self._execute_model_generic( + decode_data.token_ids, decode_data.position_ids, + decode_data.attn_metadata, decode_data.logits_indices, + self.kv_caches) + htorch.core.mark_step() + sampling_metadata = self._prepare_sampling( + scheduler_output, + start_idx=0, + end_idx=num_decodes, + pad_to=num_padded_decodes) + htorch.core.mark_step() + sampler_output = self.model.sample( + logits=logits_device, sampling_metadata=sampling_metadata) + # sampler now returns cpu list instead of device tensor - + # and i don't like it + decode_output_tokens = sampler_output.sampled_token_ids + htorch.core.mark_step() + + ######################### PREFILLS ######################### + # Prefills run with shape [padded_prefill_bs, padded_prefill_len] + if num_prefills > 0: + prefill_seq_offset_start = num_decodes + htorch.core.mark_step() + for idx, (req_id, prompt_len, token_ids, position_ids, + attn_metadata, + logits_indices) in enumerate(prefill_data.zipped()): + htorch.core.mark_step() + logits_device = self._execute_model_generic( + token_ids, position_ids, attn_metadata, logits_indices, + self.kv_caches) + htorch.core.mark_step() + num_curr_prefills = token_ids.shape[0] + prefill_seq_offset_end = prefill_seq_offset_start + \ + num_curr_prefills + if prefill_seq_offset_start == prefill_seq_offset_end: + import pdb + pdb.set_trace() + sampling_metadata = self._prepare_sampling( + scheduler_output, + start_idx=prefill_seq_offset_start, + end_idx=prefill_seq_offset_end, + pad_to=num_curr_prefills) + htorch.core.mark_step() + sampler_output = self.model.sample( + logits=logits_device, sampling_metadata=sampling_metadata) + sampled_token_ids_device = sampler_output.sampled_token_ids + htorch.core.mark_step() + prefill_seq_offset_end = prefill_seq_offset_start + prefill_output_tokens.extend(sampled_token_ids_device) + # sampler now returns cpu list instead of device tensor - + # and i don't like it + # prefill_output_device = torch.cat(prefill_output_list, dim=0) + htorch.core.mark_step() + + sampled_token_ids_list = [ + *decode_output_tokens, *prefill_output_tokens + ] + ######### UPDATE REQUEST STATE WITH GENERATED TOKENS ######### + for i, req_id in enumerate(self.input_batch.req_ids[:num_reqs]): + req_state = self.requests[req_id] + + seq_len = (req_state.num_computed_tokens + + scheduler_output.num_scheduled_tokens[req_id]) + token_id = sampled_token_ids_list[i] + self.input_batch.token_ids_cpu[i, seq_len] = token_id + req_state.output_token_ids.append(token_id) + + ################## RETURN ################## + model_runner_output = ModelRunnerOutput( + req_ids=self.input_batch.req_ids[:num_reqs], + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=sampled_token_ids_list, + logprob_token_ids_cpu=logprob_token_ids, + logprobs_cpu=logprobs, + ) + + if False: + for req_id in self.input_batch.req_ids[:num_reqs]: + req_idx = self.input_batch.req_id_to_index[req_id] + token_ids = self.input_batch.token_ids_cpu[req_idx] + #prompt = self._tokenizer.decode( + # token_ids[:self.input_batch. + # num_prompt_tokens_cpu[req_idx]]) + + seq_len = (req_state.num_computed_tokens + + scheduler_output.num_scheduled_tokens[req_id]) + req_state = self.requests[req_id] + generated = self._tokenizer.decode(req_state.output_token_ids) + phase = 'prefill' if req_idx >= decode_data.num_decodes \ + else 'decode' + logger.info( + f'[ENGINE_ITER {self._ENGINE_ITER}] REQ:{req_id} IDX:{req_idx} {phase} generated token: {self._tokenizer.decode(sampled_token_ids_cpu[req_idx])!r}, all generated so far: {generated!r}' # noqa + ) + self._ENGINE_ITER += 1 + #import pdb; pdb.set_trace() + return model_runner_output + + def load_model(self) -> None: + logger.info("Starting to load model %s...", self.model_config.model) + with HabanaMemoryProfiler() as m: # noqa: SIM117 + self.model = get_model(vllm_config=self.vllm_config) + self.model = _maybe_wrap_in_hpu_graph( + self.model, + self.block_size, + dtype=self.model_config.dtype, + enforce_eager=self.model_config.enforce_eager) + self.model_memory_usage = m.consumed_device_memory + logger.info("Loading model weights took %.4f GB", + self.model_memory_usage / float(2**30)) + + def _use_graphs(self, batch_size, seq_len, num_blocks, phase): + if self.model_config.enforce_eager: + return False + if self.skip_warmup: + return True + return (batch_size, seq_len, num_blocks, phase) in self.graphed_buckets + + def log_graph_warmup_summary(self, buckets, is_prompt, total_mem): + num_candidates = len(buckets) + phase = f'Graph/{"Prompt" if is_prompt else "Decode"}' + graphed = list(c[:2] for c in self.graphed_buckets + if c[2] == is_prompt) + if num_candidates == 0: + num_candidates = 1 + msg = (f'{phase} captured:{len(graphed)} ' + f'({100 * len(graphed) / num_candidates:.1f}%) ' + f'used_mem:{format_bytes(total_mem)} ' + f'buckets:{sorted(list(graphed))}') + logger.info(msg) + + def warmup_scenario(self, batch_size, seq_or_block, is_prompt, + kv_caches) -> None: + """Dummy warmup run for memory usage and graph compilation.""" + + query_seq_len = seq_or_block if is_prompt else 1 + input_ids = torch.zeros((batch_size, query_seq_len), + dtype=torch.int32, + device='cpu') + position_ids = torch.zeros((batch_size, query_seq_len), + dtype=torch.int32, + device='cpu') + slot_mapping = torch.zeros((batch_size, query_seq_len), + dtype=torch.int64, + device='cpu') + + input_ids_device = _async_h2d_tensor_copy(input_ids, self.device) + position_ids_device = _async_h2d_tensor_copy(position_ids, self.device) + slot_mapping_device = _async_h2d_tensor_copy(slot_mapping, self.device) + + if is_prompt: + seq_lens = torch.zeros((batch_size), + dtype=torch.int32, + device='cpu') + seq_lens.fill_(seq_or_block) + seq_lens_device = _async_h2d_tensor_copy(seq_lens, self.device) + attn_metadata = HPUAttentionMetadata.make_prefill_metadata( + seq_lens_tensor=seq_lens_device, + num_prefills=batch_size, + num_prefill_tokens=batch_size * seq_or_block, + slot_mapping=slot_mapping_device) + else: + block_tables = [ + x.tolist() + for x in np.array_split(np.arange(seq_or_block), batch_size) + ] + block_list, block_groups, block_usage = \ + self.get_habana_paged_attn_buffers( + block_tables=block_tables, + slot_mapping=slot_mapping, + bucketing=True) + block_list_device = _async_h2d_tensor_copy(block_list, self.device) + block_usage_device = _async_h2d_tensor_copy( + block_usage, self.device) + block_groups_device = _async_h2d_tensor_copy( + block_groups, self.device) + attn_metadata = HPUAttentionMetadata.make_decode_metadata( + block_list=block_list_device, + block_usage=block_usage_device, + block_groups=block_groups_device, + num_decode_tokens=batch_size, + slot_mapping=slot_mapping_device) + + logits_indices = torch.arange(0, batch_size, device='cpu') + logits_indices_device = _async_h2d_tensor_copy(logits_indices, + self.device) + # Dummy run. + htorch.core.mark_step() + logits = self._execute_model_generic(input_ids_device, + position_ids_device, + attn_metadata, + logits_indices_device, kv_caches, + True) + # TODO: do sampling on logits, warmup sampler and prefill joiner + htorch.core.mark_step() + temperature = torch.ones(batch_size, dtype=torch.float32, device='cpu') + top_p = torch.ones(batch_size, dtype=torch.float32, device='cpu') + top_k = torch.ones(batch_size, dtype=torch.float32, device='cpu') + temperature_device = _async_h2d_tensor_copy(temperature, self.device) + top_p_device = _async_h2d_tensor_copy(top_p, self.device) + top_k_device = _async_h2d_tensor_copy(top_k, self.device) + generators = {i: None + for i in range(batch_size) + } # NOTE(kzawora): idk what to set here + max_num_logprobs = 0 # NOTE(kzawora): idk what to set here + # NOTE(kzawora: do this in a smarter way) + return None + htorch.core.mark_step() + sampling_metadata = SamplingMetadata( + temperature=temperature_device, + all_greedy=False, # hacky + all_random=True, # hacky + top_p=top_p_device, + top_k=top_k_device, + no_top_p=True, + no_top_k=True, + generators=generators, + max_num_logprobs=max_num_logprobs, + ) + tokens_all_random = self.model.sample(logits, sampling_metadata) + htorch.core.mark_step() + sampling_metadata = SamplingMetadata( + temperature=temperature_device, + all_greedy=True, # hacky + all_random=False, # hacky + top_p=top_p_device, + top_k=top_k_device, + no_top_p=True, + no_top_k=True, + generators=generators, + max_num_logprobs=max_num_logprobs, + ) + tokens_all_greedy = self.model.sample(logits, sampling_metadata) + htorch.core.mark_step() + sampling_metadata = SamplingMetadata( + temperature=temperature_device, + all_greedy=False, # hacky + all_random=False, # hacky + top_p=top_p_device, + top_k=top_k_device, + no_top_p=True, + no_top_k=True, + generators=generators, + max_num_logprobs=max_num_logprobs, + ) + tokens_mixed = self.model.sample(logits, sampling_metadata) + htorch.core.mark_step() + return tokens_all_random, tokens_all_greedy, tokens_mixed + + def log_warmup(self, phase, i, max_i, batch_size, seq_len): + free_mem = format_bytes( + HabanaMemoryProfiler.current_free_device_memory()) + dim = "num_blocks" + if phase == "Prompt": + dim = "seq_len" + msg = (f"[Warmup][{phase}][{i+1}/{max_i}] " + f"batch_size:{batch_size} " + f"{dim}:{seq_len} " + f"free_mem:{free_mem}") + logger.info(msg) + + def warmup_all_buckets(self, buckets, is_prompt, kv_caches): + for i, (batch_size, seq_len) in enumerate(reversed(buckets)): + self.log_warmup('Prompt' if is_prompt else 'Decode', i, + len(buckets), batch_size, seq_len) + self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) + torch.hpu.synchronize() + + def warmup_graphs(self, + strategy, + buckets, + is_prompt, + kv_caches, + available_mem, + starting_mem=0, + total_batch_seq=0.001): + total_mem = starting_mem + idx = 0 + phase = f'Graph/{"Prompt" if is_prompt else "Decode"}' + num_candidates = len(buckets) + ordering : Union[Callable[[Any], Tuple[Any, Any]], \ + Callable[[Any], Tuple[Any, Any, Any]]] + if strategy == 'min_tokens': + ordering = lambda b: (b[0] * b[1], b[1], b[0]) + elif strategy == 'max_bs': + ordering = lambda b: (-b[0], b[1]) + else: + raise NotImplementedError( + f'Unsupported graph allocation strategy: {strategy}') + buckets = list(sorted(buckets, key=ordering)) + captured_all = True + for idx, (batch_size, seq_len) in enumerate(buckets): + # Graph memory usage is proportional to seq dimension in a batch + batch_seq = batch_size * seq_len if is_prompt else batch_size + mem_estimate = batch_seq / total_batch_seq * total_mem + if mem_estimate >= available_mem: + captured_all = False + continue + graphed_bucket = (batch_size, seq_len, is_prompt) + if graphed_bucket in self.graphed_buckets: + continue + self.graphed_buckets.add(graphed_bucket) + self.log_warmup(phase, idx, num_candidates, batch_size, seq_len) + with HabanaMemoryProfiler() as mem_prof: + self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) + #TODO(kzawora): align_workers + used_mem = mem_prof.consumed_device_memory + available_mem -= used_mem + total_mem += used_mem + total_batch_seq += batch_seq + + return total_mem, total_batch_seq, captured_all + + @torch.inference_mode() + def warmup_model(self) -> None: + kv_caches = self.kv_caches + if profile := os.environ.get('VLLM_PT_PROFILE', None): + phase, bs, seq_len, graph = profile.split('_') + is_prompt = phase == 'prompt' + graphs = graph == 't' + if graphs: + self.graphed_buckets.add((int(bs), int(seq_len), is_prompt)) + self.warmup_scenario(int(bs), int(seq_len), is_prompt, kv_caches, + True) + raise AssertionError("Finished profiling") + if self.skip_warmup: + logger.info("Skipping warmup...") + return + #self.profiler.start('internal', 'warmup') + max_blocks = kv_caches[0][0].size(0) + self.bucketing_ctx.generate_prompt_buckets() + self.bucketing_ctx.generate_decode_buckets(max_blocks) + + if not htorch.utils.internal.is_lazy() and not self.enforce_eager: + cache_size_limit = len(self.bucketing_ctx.prompt_buckets) + len( + self.bucketing_ctx.decode_buckets) + 1 + torch._dynamo.config.cache_size_limit = max( + cache_size_limit, torch._dynamo.config.cache_size_limit) + # Multiply by 8 to follow the original default ratio between + # the cache_size_limit and accumulated_cache_size_limit + torch._dynamo.config.accumulated_cache_size_limit = max( + cache_size_limit * 8, + torch._dynamo.config.accumulated_cache_size_limit) + + start_mem = HabanaMemoryProfiler.current_device_memory_usage() + start_time = time.perf_counter() + + compile_only_mode_context = functools.partial(bc.env_setting, + "PT_COMPILE_ONLY_MODE", + True) + can_use_compile_only_mode = True + try: + with compile_only_mode_context(): + pass + logger.debug("Using PT_COMPILE_ONLY_MODE.") + except KeyError: + can_use_compile_only_mode = False + logger.warning('Cannot use PT_COMPILE_ONLY_MODE. ' + 'Warmup time will be negatively impacted. ' + 'Please update Gaudi Software Suite.') + with compile_only_mode_context( + ) if can_use_compile_only_mode else contextlib.nullcontext(): + self.warmup_all_buckets(self.bucketing_ctx.prompt_buckets, True, + kv_caches) + self.warmup_all_buckets(self.bucketing_ctx.decode_buckets, False, + kv_caches) + + if (not self.model_config.enforce_eager + and htorch.utils.internal.is_lazy()): + assert self.mem_margin is not None, \ + ("HabanaWorker.determine_num_available_blocks needs " + "to be called before warming up the model.") + free_mem = HabanaMemoryProfiler.current_free_device_memory() + graph_free_mem = free_mem - self.mem_margin + #TODO(kzawora): align_workers + graph_free_mem = graph_free_mem + prompt_graph_mem_ratio = float( + os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.3')) + prompt_available_memory = (prompt_graph_mem_ratio * + graph_free_mem) + decode_available_memory = (graph_free_mem - + prompt_available_memory) + msg = ( + f"Using {format_bytes(graph_free_mem)}" + f"/{format_bytes(free_mem)} " + "of free device memory for HPUGraphs, " + f"{format_bytes(prompt_available_memory)} for prompt and " + f"{format_bytes(decode_available_memory)} for decode " + f"(VLLM_GRAPH_PROMPT_RATIO={prompt_graph_mem_ratio})") + logger.info(msg) + prompt_strategy = os.environ.get('VLLM_GRAPH_PROMPT_STRATEGY', + 'min_tokens') + decode_strategy = os.environ.get('VLLM_GRAPH_DECODE_STRATEGY', + 'max_bs') + mem_post_prompt, prompt_batch_seq, prompt_captured_all = \ + self.warmup_graphs( + prompt_strategy, self.bucketing_ctx.prompt_buckets, + True, kv_caches, prompt_available_memory) + mem_post_decode, decode_batch_seq, decode_captured_all = \ + self.warmup_graphs( + decode_strategy, self.bucketing_ctx.decode_buckets, + False, kv_caches, decode_available_memory) + + # Not all prompt buckets were captured, but all decode buckets + # were captured and we have some free graph-allocated space + # left. Let's try to use it for capturing more prompt buckets. + if (mem_post_decode + mem_post_prompt < graph_free_mem + and not prompt_captured_all and decode_captured_all): + mem_post_prompt, _, prompt_captured_all = ( + self.warmup_graphs( + prompt_strategy, self.bucketing_ctx.prompt_buckets, + True, kv_caches, + graph_free_mem - mem_post_prompt - mem_post_decode, + mem_post_prompt, prompt_batch_seq)) + + # Not all decode buckets were captured, but all prompt buckets + # were captured and we have some free graph-allocated space + # left. Let's try to use it for capturing more decode buckets. + if mem_post_decode + mem_post_prompt < graph_free_mem \ + and not decode_captured_all \ + and prompt_captured_all: + mem_post_decode, _, _ = self.warmup_graphs( + decode_strategy, self.bucketing_ctx.decode_buckets, + False, kv_caches, + graph_free_mem - mem_post_prompt - mem_post_decode, + mem_post_decode, decode_batch_seq) + + self.log_graph_warmup_summary( + self.bucketing_ctx.prompt_buckets, True, mem_post_prompt) + self.log_graph_warmup_summary( + self.bucketing_ctx.decode_buckets, False, mem_post_decode) + + end_time = time.perf_counter() + end_mem = HabanaMemoryProfiler.current_device_memory_usage() + elapsed_time = end_time - start_time + msg = ( + f"Warmup finished in {elapsed_time:.0f} secs, " + f"allocated {format_bytes(end_mem - start_mem)} of device memory") + logger.info(msg) + #self.profiler.end() + + @torch.inference_mode() + def profile_run(self) -> None: + return + """Profile to measure peak memory during forward pass.""" + + # use an empty tensor instead of `None`` to force Dynamo to pass + # it by reference, rather by specializing on the value `None`. + # the `dtype` argument does not matter, and we use `float32` as + # a placeholder (it has wide hardware support). + # it is important to create tensors inside the loop, rather than + # multiplying the list, to avoid Dynamo from treating them as + # tensor aliasing. + num_layers = self.model_config.get_num_layers(self.parallel_config) + kv_caches = [None] * num_layers + + # Run empty prefill forwards - prefill max batch and prefill max seq + self.warmup_scenario(batch_size=1, + seq_or_block=self.max_model_len, + is_prompt=True, + kv_caches=kv_caches) + max_seq_len = math.ceil( + (self.max_num_tokens // self.max_prefill_batch_size) / + self.block_size) * self.block_size + self.warmup_scenario(batch_size=self.max_prefill_batch_size, + seq_or_block=max_seq_len, + is_prompt=True, + kv_caches=kv_caches) + + @torch.inference_mode() + def capture_model(self) -> None: + start_time = time.perf_counter() + start_free_gpu_memory = torch.cuda.mem_get_info()[0] + + with set_forward_context(None): + # Trigger CUDA graph capture for specific shapes. + # Capture the large shapes first so that the smaller shapes + # can reuse the memory pool allocated for the large shapes. + for num_tokens in reversed(self.cudagraph_batch_sizes): + self.model( + self.input_ids[:num_tokens], + self.positions[:num_tokens], + kv_caches=self.kv_caches, + attn_metadata=None, + ) + + end_time = time.perf_counter() + end_free_gpu_memory = torch.cuda.mem_get_info()[0] + elapsed_time = end_time - start_time + cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory + # This usually takes 5~20 seconds. + logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", + elapsed_time, cuda_graph_size / (1 << 30)) + + def initialize_kv_cache(self, num_blocks: int) -> None: + assert len(self.kv_caches) == 0 + kv_cache_shape = HPUAttentionBackendV1.get_kv_cache_shape( + num_blocks, self.block_size, self.num_kv_heads, self.head_size) + self.kv_caches: List[Tuple[torch.Tensor, torch.Tensor]] = [] + dtype = self.dtype + if self.device != 'hpu' and not is_fake_hpu() \ + and self.dtype == torch.float8_e4m3fn: + dtype = torch.uint8 + for _ in range(self.num_attn_layers): + key_cache = torch.zeros(kv_cache_shape, + dtype=dtype, + device=self.device) + value_cache = torch.zeros(kv_cache_shape, + dtype=dtype, + device=self.device) + kv_layer = (key_cache, value_cache) + self.kv_caches.append(kv_layer) + if self.enable_bucketing: + self.bucketing_ctx.num_hpu_blocks = num_blocks + htorch.hpu.synchronize() + + +@dataclass +class CachedRequestState: + + req_id: str + prompt_token_ids: List[int] + prompt: Optional[str] + mm_inputs: List[MultiModalKwargs] + mm_positions: List["PlaceholderRange"] + sampling_params: SamplingParams + generator: Optional[torch.Generator] + + block_ids: List[int] + num_computed_tokens: int + output_token_ids: List[int] + + @property + def num_tokens(self) -> int: + return len(self.prompt_token_ids) + len(self.output_token_ids) + + +class InputBatch: + + def __init__( + self, + max_num_reqs: int, + max_model_len: int, + max_num_blocks_per_req: int, + device: torch.device, + pin_memory: bool, + ): + self.max_num_reqs = max_num_reqs + self.max_model_len = max_model_len + self.max_num_blocks_per_req = max_num_blocks_per_req + self.device = device + self.pin_memory = pin_memory + + self.req_ids: List[Optional[str]] = [None] * max_num_reqs + self.req_id_to_index: Dict[str, int] = {} + + self.token_ids_cpu = np.empty((max_num_reqs, max_model_len), + dtype=np.int32) + self.num_computed_tokens_cpu = np.zeros(max_num_reqs, dtype=np.int32) + self.num_output_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32) + self.num_prompt_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32) + + # Attention-related. + + # NOTE(kzawora): "+1" here prevents us from going OoB in block table + # when max model length is reached. + # Sometimes scheduler allocates two blocks ahead which can go out of + # valid seq len bounds, so e.g. if we have 16 blocks available, and + # we've just filled entirety of 15th block, sometimes scheduler assigns + # 16th and 17th block to the sequence, even though it can never + # reach block 17. I have no idea why that happens, but + # it smells like a bug. + self.block_table = torch.zeros( + (max_num_reqs, max_num_blocks_per_req + 1), + device=self.device, + dtype=torch.int32) + self.block_table_cpu_tensor = torch.zeros( + (max_num_reqs, max_num_blocks_per_req + 1), + device="cpu", + dtype=torch.int32, + pin_memory=pin_memory, + ) + self.block_table_cpu = self.block_table_cpu_tensor.numpy() + + # Sampling-related. + self.temperature = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device=device) + self.temperature_cpu_tensor = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device="cpu", + pin_memory=pin_memory) + self.temperature_cpu = self.temperature_cpu_tensor.numpy() + self.greedy_reqs: Set[str] = set() + self.random_reqs: Set[str] = set() + + self.top_p = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device=device) + self.top_p_cpu_tensor = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device="cpu", + pin_memory=pin_memory) + self.top_p_cpu = self.top_p_cpu_tensor.numpy() + self.top_p_reqs: Set[str] = set() + + self.top_k = torch.empty((max_num_reqs, ), + dtype=torch.int32, + device=device) + self.top_k_cpu_tensor = torch.empty((max_num_reqs, ), + dtype=torch.int32, + device="cpu", + pin_memory=pin_memory) + self.top_k_cpu = self.top_k_cpu_tensor.numpy() + self.top_k_reqs: Set[str] = set() + + # req_index -> generator + self.generators: Dict[int, torch.Generator] = {} + + self.num_logprobs: Dict[str, int] = {} + self.prompt_logprob_reqs: Set[str] = set() + + self.num_prefills = 0 + + def add_request( + self, + request: "CachedRequestState", + req_index: Optional[int] = None, + ) -> None: + if req_index is None: + req_index = self.num_reqs + assert req_index < self.max_num_reqs + + req_id = request.req_id + self.req_ids[req_index] = req_id + self.req_id_to_index[req_id] = req_index + + # Copy the prompt token ids and output token ids. + num_prompt_tokens = len(request.prompt_token_ids) + self.token_ids_cpu[ + req_index, :num_prompt_tokens] = request.prompt_token_ids + start_idx = num_prompt_tokens + end_idx = start_idx + len(request.output_token_ids) + self.token_ids_cpu[req_index, + start_idx:end_idx] = request.output_token_ids + + self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens + #self.num_output_tokens_cpu[req_index] = request.num_output_tokens + self.num_prompt_tokens_cpu[req_index] = len(request.prompt_token_ids) + num_blocks = len(request.block_ids) + self.block_table_cpu[req_index, :num_blocks] = request.block_ids + + sampling_params = request.sampling_params + self.temperature_cpu[req_index] = sampling_params.temperature + if sampling_params.sampling_type == SamplingType.GREEDY: + self.greedy_reqs.add(req_id) + else: + self.random_reqs.add(req_id) + + self.top_p_cpu[req_index] = sampling_params.top_p + if sampling_params.top_p < 1: + self.top_p_reqs.add(req_id) + self.top_k_cpu[req_index] = sampling_params.top_k + if sampling_params.top_k > 0: + self.top_k_reqs.add(req_id) + + self.generators[req_index] = request.generator + + num_logprobs = sampling_params.logprobs + if num_logprobs is not None and num_logprobs > 0: + self.num_logprobs[req_id] = num_logprobs + if sampling_params.prompt_logprobs: + self.prompt_logprob_reqs.add(req_id) + + def remove_request(self, req_id: str) -> Optional[int]: + req_index = self.req_id_to_index.pop(req_id, None) + if req_index is None: + return None + self.req_ids[req_index] = None + + self.greedy_reqs.discard(req_id) + self.random_reqs.discard(req_id) + self.top_p_reqs.discard(req_id) + self.top_k_reqs.discard(req_id) + self.generators.pop(req_index, None) + self.num_logprobs.pop(req_id, None) + self.prompt_logprob_reqs.discard(req_id) + return req_index + + def clear(self) -> None: + self.req_ids = [None] * self.max_num_reqs + self.req_id_to_index.clear() + self.greedy_reqs.clear() + self.random_reqs.clear() + self.top_p_reqs.clear() + self.top_k_reqs.clear() + self.generators.clear() + self.num_logprobs.clear() + self.prompt_logprob_reqs.clear() + + def condense(self, empty_req_indices: List[int]) -> None: + if self.num_reqs == 0: + # The batched states are empty. + return + + # NOTE(woosuk): This function assumes that the empty_req_indices + # is sorted in descending order. + last_req_index = self.num_reqs + len(empty_req_indices) - 1 + while empty_req_indices: + # Find the largest non-empty index. + while last_req_index in empty_req_indices: + last_req_index -= 1 + + # Find the smallest empty index. + empty_index = empty_req_indices.pop() + if empty_index >= last_req_index: + break + + # Swap the states. + req_id = self.req_ids[last_req_index] + self.req_ids[empty_index] = req_id + self.req_ids[last_req_index] = None + self.req_id_to_index[req_id] = empty_index + + # TODO(woosuk): Optimize the copy of token_ids_cpu and + # block_table_cpu. + self.token_ids_cpu[empty_index] = self.token_ids_cpu[ + last_req_index] + self.num_computed_tokens_cpu[ + empty_index] = self.num_computed_tokens_cpu[last_req_index] + self.block_table_cpu[empty_index] = self.block_table_cpu[ + last_req_index] + self.temperature_cpu[empty_index] = self.temperature_cpu[ + last_req_index] + self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index] + self.top_k_cpu[empty_index] = self.top_k_cpu[last_req_index] + generator = self.generators.pop(last_req_index, None) + if generator is not None: + self.generators[empty_index] = generator + + # Decrement last_req_index since it is now empty. + last_req_index -= 1 + + def make_sampling_metadata(self, + skip_copy, + start_idx: Optional[int] = None, + end_idx: Optional[int] = None, + pad_to: Optional[int] = None): + if start_idx is None and end_idx is None and pad_to is None: + return self._make_sampling_metadata_all(skip_copy=skip_copy) + return self._make_sampling_metadata_range(skip_copy, + start_idx, + end_idx, + pad_to=pad_to) + + def _make_sampling_metadata_range( + self, + skip_copy: bool = False, + start_idx: Optional[int] = None, + end_idx: Optional[int] = None, + pad_to: Optional[int] = None) -> SamplingMetadata: + if start_idx is None: + start_idx = 0 + if end_idx is None: + end_idx = self.num_reqs + max_num_reqs = len(self.req_ids) + end_idx = min(end_idx, max_num_reqs) + num_seqs = end_idx - start_idx + padding_needed = max(0, pad_to - num_seqs) + req_ids = self.req_ids[start_idx:end_idx] + if not skip_copy: + self.temperature[start_idx:end_idx].copy_( + self.temperature_cpu_tensor[start_idx:end_idx], + non_blocking=True) + self.top_p[start_idx:end_idx].copy_( + self.top_p_cpu_tensor[start_idx:end_idx], non_blocking=True) + self.top_k[start_idx:end_idx].copy_( + self.top_k_cpu_tensor[start_idx:end_idx], non_blocking=True) + + all_greedy = all([req_id in self.greedy_reqs for req_id in req_ids]) + all_random = all([req_id in self.random_reqs for req_id in req_ids]) + if all_greedy and all_random: + import pdb + pdb.set_trace() #WTF?! + no_top_p = not any([req_id in self.top_p_reqs for req_id in req_ids]) + no_top_k = not any([req_id in self.top_k_reqs for req_id in req_ids]) + # NOTE(kzawora): Generators are used by sampler row-wise. If we got a + # generator for element 5, but it's first row in a batch, + # we need to assign that generator to index 0 - hence the + # i:generators.get(req_id) rather than req_id:generators.get(req_id) + generators = { + i: self.generators.get(req_id, None) + for i, req_id in enumerate( + range(start_idx, end_idx + padding_needed)) + } + temperature_device = self.temperature[start_idx:end_idx + + padding_needed] + top_p_device = self.top_p[start_idx:end_idx + padding_needed] + tok_k_device = self.top_k[start_idx:end_idx + padding_needed] + if end_idx + padding_needed >= max_num_reqs: + # NOTE(kzawora): it's janky, but [start_idx:end_idx+padding_needed] + # falls apart once your padding exceeds max_num_reqs (and it + # happens pretty often, you could increase the + # temperature/topp/topk allocation, but you cannot really make + # any guarantees ahead of time on the amount of padding you'll use) + # this is kind of a temporary fix, no idea on its performance impact + temperature_device = torch.empty(pad_to, + device=self.temperature.device, + dtype=self.temperature.dtype) + top_p_device = torch.empty(pad_to, + device=self.top_p.device, + dtype=self.top_p.dtype) + top_k_device = torch.empty(pad_to, + device=self.top_k.device, + dtype=self.top_k.dtype) + # D2D copy + temperature_device[:num_seqs].copy_( + self.temperature[start_idx:end_idx], non_blocking=True) + top_p_device[:num_seqs].copy_(self.top_p[start_idx:end_idx], + non_blocking=True) + top_k_device[:num_seqs].copy_(self.top_k[start_idx:end_idx], + non_blocking=True) + + return SamplingMetadata( + temperature=temperature_device, + all_greedy=all_greedy, + all_random=all_random, + top_p=top_p_device, + top_k=tok_k_device, + no_top_p=no_top_p, + no_top_k=no_top_k, + generators=generators, + max_num_logprobs=self.max_num_logprobs, + ) + + def _make_sampling_metadata_all( + self, + skip_copy: bool = False, + ) -> SamplingMetadata: + if not skip_copy: + self.temperature[:self.num_reqs].copy_( + self.temperature_cpu_tensor[:self.num_reqs], non_blocking=True) + self.top_p[:self.num_reqs].copy_( + self.top_p_cpu_tensor[:self.num_reqs], non_blocking=True) + self.top_k[:self.num_reqs].copy_( + self.top_k_cpu_tensor[:self.num_reqs], non_blocking=True) + return SamplingMetadata( + temperature=self.temperature[:self.num_reqs], + all_greedy=self.all_greedy, + all_random=self.all_random, + top_p=self.top_p[:self.num_reqs], + top_k=self.top_k[:self.num_reqs], + no_top_p=self.no_top_p, + no_top_k=self.no_top_k, + generators=self.generators, + max_num_logprobs=self.max_num_logprobs, + ) + + @property + def num_reqs(self) -> int: + return len(self.req_id_to_index) + + @property + def num_decodes(self) -> int: + return self.num_reqs - self.num_prefills + + @property + def all_greedy(self) -> bool: + return len(self.random_reqs) == 0 + + @property + def all_random(self) -> bool: + return len(self.greedy_reqs) == 0 + + @property + def no_top_p(self) -> bool: + return len(self.top_p_reqs) == 0 + + @property + def no_top_k(self) -> bool: + return len(self.top_k_reqs) == 0 + + @property + def max_num_logprobs(self) -> int: + return max(self.num_logprobs.values()) if self.num_logprobs else 0 + + @property + def no_logprob(self) -> bool: + return len(self.num_logprobs) == 0 + + @property + def no_prompt_logprob(self) -> bool: + return len(self.prompt_logprob_reqs) == 0 diff --git a/vllm/v1/worker/hpu_worker.py b/vllm/v1/worker/hpu_worker.py new file mode 100644 index 0000000000000..23fa1361c9fdc --- /dev/null +++ b/vllm/v1/worker/hpu_worker.py @@ -0,0 +1,244 @@ +"""A GPU worker class.""" +import gc +import os +from contextlib import contextmanager +from typing import TYPE_CHECKING, Optional, Tuple + +import torch +import torch.distributed +from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes + +import vllm.envs as envs +from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig +from vllm.distributed import (ensure_model_parallel_initialized, + init_distributed_environment) +from vllm.logger import init_logger +from vllm.model_executor import set_random_seed +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_fake_hpu +from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.worker.hpu_model_runner import HPUModelRunner + +logger = init_logger(__name__) + +if TYPE_CHECKING: + from vllm.v1.core.scheduler import SchedulerOutput + + +class HPUWorker: + + def __init__( + self, + vllm_config: VllmConfig, + local_rank: int, + rank: int, + distributed_init_method: str, + ): + + # TODO: use WorkerBase.__init__(self, vllm_config=vllm_config) + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.lora_config = vllm_config.lora_config + self.load_config = vllm_config.load_config + self.parallel_config = vllm_config.parallel_config + self.scheduler_config = vllm_config.scheduler_config + self.device_config = vllm_config.device_config + self.speculative_config = vllm_config.speculative_config + self.prompt_adapter_config = vllm_config.prompt_adapter_config + self.observability_config = vllm_config.observability_config + + self.local_rank = local_rank + self.rank = rank + self.distributed_init_method = distributed_init_method + + if self.model_config.trust_remote_code: + # note: lazy import to avoid importing torch before initializing + from vllm.utils import init_cached_hf_modules + init_cached_hf_modules() + # Torch profiler. Enabled and configured through env vars: + # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + if envs.VLLM_TORCH_PROFILER_DIR: + torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + logger.info("Profiling enabled. Traces will be saved to: %s", + torch_profiler_trace_dir) + self.profiler = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.HPU, + ], + with_stack=True, + on_trace_ready=torch.profiler.tensorboard_trace_handler( + torch_profiler_trace_dir, use_gzip=True)) + else: + self.profiler = None + + def initialize(self): + # Initialize the distributed environment. + init_worker_distributed_environment(self.parallel_config, self.rank, + self.distributed_init_method, + self.local_rank) + # Set random seed. + set_random_seed(self.model_config.seed) + self.model_runner = HPUModelRunner(self.vllm_config) + + def load_model(self) -> None: + self.model_runner.load_model() + + @torch.inference_mode() + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Profiles the peak memory usage of the model to determine how many + KV blocks may be allocated without OOMs. + + The engine will first conduct a profiling of the existing memory usage. + Then, it calculate the maximum possible number of GPU and CPU blocks + that can be allocated with the remaining free memory. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + """ + # Profile the memory usage of the model and get the maximum number of + # cache blocks that can be allocated with the remaining free memory. + + # Execute a forward pass with dummy inputs to profile the memory usage + # of the model. + if is_fake_hpu(): + cache_block_size = _get_cache_block_size(self.cache_config, + self.model_config, + self.parallel_config) + fake_hpu_cache_alloc = 4 * 2**30 # take 4 GiB flat on fake hpu + return fake_hpu_cache_alloc // cache_block_size, 0 + with HabanaMemoryProfiler() as m: + self.model_runner.profile_run() + torch.hpu.synchronize() + msg = ("Model profiling run " + f"took {m.get_summary_string()}") + logger.info(msg) + # At this point we should've allocated the maximum workspace for all + # recipes we will use the extra memory for graphs/blocks + free_hpu_memory = torch.hpu.mem_get_info()[0] + + cache_block_size = _get_cache_block_size(self.cache_config, + self.model_config, + self.parallel_config) + graph_reserved_mem = (float( + os.environ.get('VLLM_GRAPH_RESERVED_MEM', '0.1')) + if not self.model_config.enforce_eager else 0) + graph_headroom = 1 - graph_reserved_mem + available_hpu_memory = free_hpu_memory * \ + self.cache_config.gpu_memory_utilization + hpu_memory_margin = free_hpu_memory * ( + 1 - self.cache_config.gpu_memory_utilization) + self.model_runner.mem_margin = hpu_memory_margin + cache_size_bytes = available_hpu_memory * graph_headroom + graph_headroom_bytes = available_hpu_memory * (1 - graph_headroom) + msg = ( + f"Free device memory: {format_bytes(free_hpu_memory)}, " + f"{format_bytes(available_hpu_memory)} usable " + f"(gpu_memory_utilization={self.cache_config.gpu_memory_utilization})," + f" {format_bytes(graph_headroom_bytes)} reserved for HPUGraphs " + f"(VLLM_GRAPH_RESERVED_MEM={graph_reserved_mem}), " + f"{format_bytes(cache_size_bytes)} reserved for KV cache") + logger.info(msg) + num_hpu_blocks = int(cache_size_bytes // cache_block_size) + num_hpu_blocks = max(num_hpu_blocks, 0) + + gc.collect() + return num_hpu_blocks, 0 + + def initialize_cache(self, num_gpu_blocks: int) -> None: + """Allocate GPU and CPU KV cache with the specified number of blocks.""" + if num_gpu_blocks <= 0: + raise ValueError("No available memory for the cache blocks. " + "Try increasing `gpu_memory_utilization` when " + "initializing the engine.") + + max_seq_len = self.cache_config.block_size * num_gpu_blocks + max_model_len = self.model_config.max_model_len + if max_model_len > max_seq_len: + raise ValueError( + f"The model's max seq len ({max_model_len}) " + "is larger than the maximum number of tokens that can be " + f"stored in KV cache ({max_seq_len}). Try increasing " + "`gpu_memory_utilization` or decreasing `max_model_len` when " + "initializing the engine.") + + with HabanaMemoryProfiler() as m: + self.model_runner.initialize_kv_cache(num_gpu_blocks) + torch.hpu.synchronize() + msg = ("Initializing cache engine " + f"took {m.get_summary_string()}") + logger.info(msg) + self.compile_or_warm_up_model() + + def compile_or_warm_up_model(self) -> None: + if not self.model_config.enforce_eager: + self.model_runner.warmup_model() + # Reset the seed to ensure that the random state is not affected by + # the model initialization and profiling. + set_random_seed(self.model_config.seed) + + @torch.inference_mode() + def execute_model( + self, + scheduler_output: "SchedulerOutput", + ) -> ModelRunnerOutput: + #with track_graph_compile('HPUWorker.execute_model'): + output = self.model_runner.execute_model(scheduler_output) + # TODO(woosuk): Send the output to the engine process. + return output if self.rank == 0 else None + return output + + +def init_worker_distributed_environment( + parallel_config: ParallelConfig, + rank: int, + distributed_init_method: Optional[str] = None, + local_rank: int = -1, +) -> None: + """Initialize the distributed environment.""" + init_distributed_environment(parallel_config.world_size, + rank, + distributed_init_method, + local_rank, + backend='hccl') + ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size) + dummy_tensor_hpu = torch.ones(1).to('hpu') + torch.distributed.all_reduce(dummy_tensor_hpu) + assert dummy_tensor_hpu.item() == parallel_config.world_size + ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size) + + +def _get_cache_block_size( + cache_config: CacheConfig, + model_config: ModelConfig, + parallel_config: ParallelConfig, +) -> int: + head_size = model_config.get_head_size() + num_heads = model_config.get_num_kv_heads(parallel_config) + num_attention_layers = model_config.get_num_attention_layers( + parallel_config) + + key_cache_block = cache_config.block_size * num_heads * head_size + value_cache_block = key_cache_block + total = num_attention_layers * (key_cache_block + value_cache_block) + if cache_config.cache_dtype == "auto": + dtype = model_config.dtype + else: + dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] + dtype_size = get_dtype_size(dtype) + return dtype_size * total + + +@contextmanager +def track_graph_compile(name: str): + import habana_frameworks.torch as htorch + from habana_frameworks.torch.hpu.metrics import metric_localcontext + with metric_localcontext("graph_compilation") as gc: + yield + htorch.hpu.synchronize() + if gc.stats()[0][1] != 0: + msg = f"[{name}] graph compilation detected: {gc.stats()}" + logger.warning(msg)