From cef2f54b1d369195b5485161f9af941caa11d734 Mon Sep 17 00:00:00 2001
From: Zehao Huang
Date: Wed, 25 Sep 2024 20:28:48 +0800
Subject: [PATCH] Setting enough cache_size_limit for torch.compile warmup
 (#238)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix the issue that warmup sometimes doesn't work because the default
cache_size_limit is only 8.

---------

Signed-off-by: zehao-intel
Co-authored-by: Andrzej Kotłowski
---
 vllm/worker/habana_model_runner.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py
index 6940e7637dbb7..394bb5318d10e 100644
--- a/vllm/worker/habana_model_runner.py
+++ b/vllm/worker/habana_model_runner.py
@@ -1553,6 +1553,17 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None:
                     len(self.decode_buckets),
                     list(sorted(self.decode_buckets)))
 
+        if not htorch.utils.internal.is_lazy() and not self.enforce_eager:
+            cache_size_limit = len(self.prompt_buckets) + len(
+                self.decode_buckets) + 1
+            torch._dynamo.config.cache_size_limit = max(
+                cache_size_limit, torch._dynamo.config.cache_size_limit)
+            # Multiply by 8 to follow the original default ratio between
+            # the cache_size_limit and accumulated_cache_size_limit
+            torch._dynamo.config.accumulated_cache_size_limit = max(
+                cache_size_limit * 8,
+                torch._dynamo.config.accumulated_cache_size_limit)
+
         start_mem = HabanaMemoryProfiler.current_device_memory_usage()
         start_time = time.perf_counter()
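
Note (not part of the patch): below is a minimal, self-contained sketch of the
technique the patch applies. torch._dynamo keeps a per-function cache of
compiled graphs whose size defaults to cache_size_limit = 8; warming up one
compiled graph per bucket shape can exceed that, after which Dynamo stops
compiling and silently falls back to eager, defeating the warmup. Raising
cache_size_limit (and accumulated_cache_size_limit, preserving the default 8x
ratio between the two) before warmup avoids this. The toy model and bucket
shapes here are hypothetical, and accumulated_cache_size_limit is only
available in newer PyTorch releases.

    import torch

    def toy_model(x: torch.Tensor) -> torch.Tensor:
        # Stand-in for a real model; with dynamic=False each new input
        # shape triggers a separate compilation.
        return torch.relu(x) * 2

    # Hypothetical warmup buckets: 12 distinct (batch, seq) shapes,
    # already more than the default cache_size_limit of 8.
    buckets = [(bs, seq) for bs in (1, 2, 4, 8) for seq in (128, 256, 512)]

    # Same logic as the patch: one cache entry per bucket plus one spare,
    # never lowering a limit the user may have already raised.
    cache_size_limit = len(buckets) + 1
    torch._dynamo.config.cache_size_limit = max(
        cache_size_limit, torch._dynamo.config.cache_size_limit)
    # Keep the default 8x ratio between the two limits.
    torch._dynamo.config.accumulated_cache_size_limit = max(
        cache_size_limit * 8,
        torch._dynamo.config.accumulated_cache_size_limit)

    compiled = torch.compile(toy_model, dynamic=False)
    for bs, seq in buckets:
        # With the raised limits every shape is compiled and cached; with
        # the default limit, shapes past the eighth would run eager.
        compiled(torch.randn(bs, seq))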