From cef2f54b1d369195b5485161f9af941caa11d734 Mon Sep 17 00:00:00 2001
From: Zehao Huang
Date: Wed, 25 Sep 2024 20:28:48 +0800
Subject: [PATCH] Setting enough cache_size_limit for torch.compile warmup
 (#238)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix the issue that warmup sometimes doesn't work because the default
cache_size_limit is only 8.

---------

Signed-off-by: zehao-intel
Co-authored-by: Andrzej Kotłowski
---
 vllm/worker/habana_model_runner.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py
index 6940e7637dbb7..394bb5318d10e 100644
--- a/vllm/worker/habana_model_runner.py
+++ b/vllm/worker/habana_model_runner.py
@@ -1553,6 +1553,17 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None:
                     len(self.decode_buckets),
                     list(sorted(self.decode_buckets)))
 
+        if not htorch.utils.internal.is_lazy() and not self.enforce_eager:
+            cache_size_limit = len(self.prompt_buckets) + len(
+                self.decode_buckets) + 1
+            torch._dynamo.config.cache_size_limit = max(
+                cache_size_limit, torch._dynamo.config.cache_size_limit)
+            # Multiply by 8 to follow the original default ratio between
+            # the cache_size_limit and accumulated_cache_size_limit
+            torch._dynamo.config.accumulated_cache_size_limit = max(
+                cache_size_limit * 8,
+                torch._dynamo.config.accumulated_cache_size_limit)
+
         start_mem = HabanaMemoryProfiler.current_device_memory_usage()
         start_time = time.perf_counter()
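
Note (not part of the patch): below is a minimal, self-contained sketch of the
technique the patch applies. torch._dynamo keeps a per-function cache of
compiled graphs whose size defaults to cache_size_limit = 8; warming up one
compiled graph per bucket shape can exceed that, after which Dynamo stops
compiling and silently falls back to eager, defeating the warmup. Raising
cache_size_limit (and accumulated_cache_size_limit, preserving the default 8x
ratio between the two) before warmup avoids this. The toy model and bucket
shapes here are hypothetical, and accumulated_cache_size_limit is only
available in newer PyTorch releases.

    import torch

    def toy_model(x: torch.Tensor) -> torch.Tensor:
        # Stand-in for a real model; with dynamic=False each new input
        # shape triggers a separate compilation.
        return torch.relu(x) * 2

    # Hypothetical warmup buckets: 12 distinct (batch, seq) shapes,
    # already more than the default cache_size_limit of 8.
    buckets = [(bs, seq) for bs in (1, 2, 4, 8) for seq in (128, 256, 512)]

    # Same logic as the patch: one cache entry per bucket plus one spare,
    # never lowering a limit the user may have already raised.
    cache_size_limit = len(buckets) + 1
    torch._dynamo.config.cache_size_limit = max(
        cache_size_limit, torch._dynamo.config.cache_size_limit)
    # Keep the default 8x ratio between the two limits.
    torch._dynamo.config.accumulated_cache_size_limit = max(
        cache_size_limit * 8,
        torch._dynamo.config.accumulated_cache_size_limit)

    compiled = torch.compile(toy_model, dynamic=False)
    for bs, seq in buckets:
        # With the raised limits every shape is compiled and cached; with
        # the default limit, shapes past the eighth would run eager.
        compiled(torch.randn(bs, seq))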