Set dynamo cache size for torch compile
Adjust the dynamo cache size setting to match the current behavior, where the
number of graphs compiled in one forward pass equals:
number of LlamaDecoderLayers + 2 (RMSNorm, VocabParallelEmbedding)
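For illustration only (not part of the commit), a minimal sketch of what these limits work out to for a hypothetical 32-layer Llama model; the layer count and the resulting numbers in the comments are assumptions:

```python
import torch

# Hypothetical example: a Llama model with 32 decoder layers, compiled with
# regional compilation so each decoder layer is its own graph, plus one graph
# each for the final RMSNorm and the VocabParallelEmbedding.
num_decoder_layers = 32  # assumed; at runtime this comes from model_config.get_num_layers(parallel_config)

torch._dynamo.config.cache_size_limit = 2 + num_decoder_layers  # 34 graphs per forward pass
torch._dynamo.config.accumulated_cache_size_limit = \
    8 * torch._dynamo.config.cache_size_limit  # 272 cached graphs across all compiled code objects
```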
anko-intel committed Jan 23, 2025
1 parent 1a87bc5 commit f78b021
Showing 1 changed file with 9 additions and 2 deletions.
11 changes: 9 additions & 2 deletions vllm/worker/hpu_model_runner.py

@@ -636,6 +636,11 @@ def __init__(
             self.model_config.is_attention_free,
         ) if needs_attn_backend else None
 
+        torch._dynamo.config.cache_size_limit = \
+            2 + self.model_config.get_num_layers(self.parallel_config)
+        torch._dynamo.config.accumulated_cache_size_limit = \
+            8 * torch._dynamo.config.cache_size_limit
+
         # Multi-modal data support
         self.input_registry = input_registry
         self.mm_registry = mm_registry

@@ -1721,8 +1726,10 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None:
             self.bucketing_ctx.generate_prompt_buckets()
             self.bucketing_ctx.generate_decode_buckets(max_blocks)
         if not htorch.utils.internal.is_lazy() and not self.enforce_eager:
-            multiplier = 3 if os.getenv('VLLM_REGIONAL_COMPILATION',
-                                        'true').lower() == 'true' else 1
+            multiplier = 2 + self.model_config.get_num_layers(
+                self.parallel_config) if os.getenv(
+                    'VLLM_REGIONAL_COMPILATION',
+                    'true').lower() == 'true' else 1
             cache_size_limit = 1 + multiplier * (
                 len(self.bucketing_ctx.prompt_buckets) +
                 len(self.bucketing_ctx.decode_buckets))
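
As a rough sanity check of the new warmup formula, a hedged sketch with made-up layer and bucket counts (none of these numbers come from the commit):

```python
import os

# Illustrative numbers only; at runtime they come from the model config and
# the bucketing context.
num_layers = 32          # assumed LlamaDecoderLayer count
num_prompt_buckets = 24  # assumed len(bucketing_ctx.prompt_buckets)
num_decode_buckets = 48  # assumed len(bucketing_ctx.decode_buckets)

regional = os.getenv('VLLM_REGIONAL_COMPILATION', 'true').lower() == 'true'
multiplier = 2 + num_layers if regional else 1

# With regional compilation each bucket shape can compile up to
# (num_layers + 2) graphs, so the warmup limit scales with that.
cache_size_limit = 1 + multiplier * (num_prompt_buckets + num_decode_buckets)
print(cache_size_limit)  # 2449 with regional compilation, 73 without
```

The extra `+ 1` presumably leaves headroom for one graph beyond the warmed-up bucket shapes.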
