Set dynamo cache size for torch compile
Adjust the dynamo cache size setting to match the current behavior, where the
number of graphs compiled in one forward pass equals:
number of LlamaDecoderLayers + 2 (RMSNorm, VocabParallelEmbedding)
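For illustration only (not part of the commit), a minimal sketch of what these limits work out to for a hypothetical 32-layer Llama model; the layer count and the resulting numbers in the comments are assumptions:

```python
import torch

# Hypothetical example: a Llama model with 32 decoder layers, compiled with
# regional compilation so each decoder layer is its own graph, plus one graph
# each for the final RMSNorm and the VocabParallelEmbedding.
num_decoder_layers = 32  # assumed; at runtime this comes from model_config.get_num_layers(parallel_config)

torch._dynamo.config.cache_size_limit = 2 + num_decoder_layers  # 34 graphs per forward pass
torch._dynamo.config.accumulated_cache_size_limit = \
    8 * torch._dynamo.config.cache_size_limit  # 272 cached graphs across all compiled code objects
```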
anko-intel committed Jan 23, 2025
1 parent 1a87bc5 commit f78b021
Showing 1 changed file with 9 additions and 2 deletions.
11 changes: 9 additions & 2 deletions vllm/worker/hpu_model_runner.py

@@ -636,6 +636,11 @@ def __init__(
             self.model_config.is_attention_free,
         ) if needs_attn_backend else None
 
+        torch._dynamo.config.cache_size_limit = \
+            2 + self.model_config.get_num_layers(self.parallel_config)
+        torch._dynamo.config.accumulated_cache_size_limit = \
+            8 * torch._dynamo.config.cache_size_limit
+
         # Multi-modal data support
         self.input_registry = input_registry
         self.mm_registry = mm_registry

@@ -1721,8 +1726,10 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None:
             self.bucketing_ctx.generate_prompt_buckets()
             self.bucketing_ctx.generate_decode_buckets(max_blocks)
         if not htorch.utils.internal.is_lazy() and not self.enforce_eager:
-            multiplier = 3 if os.getenv('VLLM_REGIONAL_COMPILATION',
-                                        'true').lower() == 'true' else 1
+            multiplier = 2 + self.model_config.get_num_layers(
+                self.parallel_config) if os.getenv(
+                    'VLLM_REGIONAL_COMPILATION',
+                    'true').lower() == 'true' else 1
             cache_size_limit = 1 + multiplier * (
                 len(self.bucketing_ctx.prompt_buckets) +
                 len(self.bucketing_ctx.decode_buckets))
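
As a rough sanity check of the new warmup formula, a hedged sketch with made-up layer and bucket counts (none of these numbers come from the commit):

```python
import os

# Illustrative numbers only; at runtime they come from the model config and
# the bucketing context.
num_layers = 32          # assumed LlamaDecoderLayer count
num_prompt_buckets = 24  # assumed len(bucketing_ctx.prompt_buckets)
num_decode_buckets = 48  # assumed len(bucketing_ctx.decode_buckets)

regional = os.getenv('VLLM_REGIONAL_COMPILATION', 'true').lower() == 'true'
multiplier = 2 + num_layers if regional else 1

# With regional compilation each bucket shape can compile up to
# (num_layers + 2) graphs, so the warmup limit scales with that.
cache_size_limit = 1 + multiplier * (num_prompt_buckets + num_decode_buckets)
print(cache_size_limit)  # 2449 with regional compilation, 73 without
```

The extra `+ 1` presumably leaves headroom for one graph beyond the warmed-up bucket shapes.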
