From 0467cc14b5cef8aebd6559c79429d75481266220 Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Thu, 14 Nov 2024 11:36:05 +0200
Subject: [PATCH 1/8] Warm up random sampler

---
 benchmarks/benchmark_throughput.py |  2 ++
 vllm/worker/hpu_model_runner.py    | 27 ++++++++++++++++++++-------
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 5f8c686b88fe4..2668949e6a1f1 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -151,7 +151,9 @@ def run_vllm(
     if not use_beam_search:
         start = time.perf_counter()
+        #llm.start_profile()
         llm.generate(prompts, sampling_params, use_tqdm=True)
+        #llm.stop_profile()
         end = time.perf_counter()
     else:
         prompts = [request.prompt for request in requests]
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 97ad0a6893dd4..0be643018e41c 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -646,6 +646,8 @@ def __init__(
         # For multi-step scheduling
         self.cached_step_outputs: List[torch.Tensor] = []
 
+        self.warmed_sampler_bs = []
+
     def _set_gc_threshold(self) -> None:
         # Read https://docs.python.org/3/library/gc.html#gc.set_threshold
         # for comprehensive description of gc generations.
@@ -1215,7 +1217,7 @@ def prepare_input_tensors(
             seq_group_metadata_list = seq_group_metadata_list.copy()
             if batch_size_padding > 0:
                 dummy_seq_group_metadata = self.create_dummy_seq_group_metadata(
-                    0, 0, is_prompt)
+                    0, 0, is_prompt, temperature=0)
                 seq_group_metadata_list.extend(dummy_seq_group_metadata
                                                for _ in range(batch_size_padding))
@@ -1388,8 +1390,9 @@ def create_dummy_seq_group_metadata(self,
                                         group_id,
                                         seq_len,
                                         is_prompt,
-                                        lora_request=None):
-        sampling_params = SamplingParams(temperature=0)
+                                        lora_request=None,
+                                        temperature=0):
+        sampling_params = SamplingParams(temperature=temperature)
         num_blocks = math.ceil(seq_len / self.block_size)
         seq_len = max(seq_len, 1)
         if is_prompt:
@@ -1429,7 +1432,8 @@ def warmup_scenario(self,
                         is_prompt,
                         kv_caches,
                         is_pt_profiler_run=False,
-                        is_lora_profile_run=False) -> None:
+                        is_lora_profile_run=False,
+                        temperature=0) -> None:
         use_graphs = self._use_graphs(batch_size, seq_len, is_prompt)
         scenario_name = ("warmup_"
                          f"{'prompt' if is_prompt else 'decode'}_"
@@ -1468,7 +1472,8 @@ def warmup_scenario(self,
                     seq_len,
                     is_prompt,
                     lora_request=dummy_lora_requests_per_seq[i]
-                    if dummy_lora_requests_per_seq else None)
+                    if dummy_lora_requests_per_seq else None,
+                    temperature=temperature)
                 for i in range(batch_size)
             ]
         else:
@@ -1481,7 +1486,8 @@ def warmup_scenario(self,
                     b * self.block_size - 1,
                     is_prompt,
                     lora_request=dummy_lora_requests_per_seq[i]
-                    if dummy_lora_requests_per_seq else None)
+                    if dummy_lora_requests_per_seq else None,
+                    temperature=temperature)
                 for i, b in enumerate(blocks)
             ]
         torch.hpu.synchronize()
@@ -1567,7 +1573,14 @@ def warmup_all_buckets(self, buckets, is_prompt, kv_caches):
         for i, (batch_size, seq_len) in enumerate(reversed(buckets)):
             self.log_warmup('Prompt' if is_prompt else 'Decode', i,
                             len(buckets), batch_size, seq_len)
-            self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches)
+            self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches,
+                                 temperature=0)
+
+            # Warm up random sampler once per batch size
+            if batch_size not in self.warmed_sampler_bs:
+                self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches,
+                                     temperature=1.0)
+                self.warmed_sampler_bs.append(batch_size)
 
     def warmup_graphs(self,
                       strategy,
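The motivation behind the extra warmup run with temperature=1.0, as the subject line suggests, is that greedy sampling (temperature=0) and random sampling (temperature>0) exercise different sampler code paths, so warming up only with temperature=0 would leave the random-sampling path to be compiled lazily on the first real non-greedy request. Below is a minimal, PyTorch-only sketch of that distinction; the function and variable names are illustrative and this is not the actual vLLM/HPU sampler.

import torch

def sample_next_token(logits: torch.Tensor, temperature: float) -> torch.Tensor:
    """Illustrative only: greedy vs. random sampling paths."""
    if temperature == 0.0:
        # Greedy path: deterministic argmax, no random ops involved.
        return torch.argmax(logits, dim=-1)
    # Random path: temperature-scaled softmax followed by a multinomial draw.
    probs = torch.softmax(logits / temperature, dim=-1)
    return torch.multinomial(probs, num_samples=1).squeeze(-1)

# Example: a batch of 4 dummy sequences over a 32-token vocabulary.
logits = torch.randn(4, 32)
greedy_ids = sample_next_token(logits, temperature=0.0)  # argmax path
random_ids = sample_next_token(logits, temperature=1.0)  # multinomial path

In this sketch the shapes of the random-path operations depend on the batch size but not on the particular temperature value, which matches the patch's choice to warm the sampler up once per batch size rather than once per temperature.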
From 82e0521975a014e2475b02e0f94ab77410833019 Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Fri, 15 Nov 2024 14:56:07 +0200
Subject: [PATCH 2/8] Warmup random sampler only during decoding

---
 vllm/worker/hpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 0be643018e41c..02d907e7f765c 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1577,7 +1577,7 @@ def warmup_all_buckets(self, buckets, is_prompt, kv_caches):
                                  temperature=0)
 
             # Warm up random sampler once per batch size
-            if batch_size not in self.warmed_sampler_bs:
+            if batch_size not in self.warmed_sampler_bs and not is_prompt:
                 self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches,
                                      temperature=1.0)
                 self.warmed_sampler_bs.append(batch_size)

From 0014d34d2d42de41f8960be3d3e15838081f426c Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Fri, 15 Nov 2024 16:05:04 +0200
Subject: [PATCH 3/8] Remove comment

---
 vllm/worker/hpu_model_runner.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 02d907e7f765c..62df4c2a1e598 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1573,10 +1573,8 @@ def warmup_all_buckets(self, buckets, is_prompt, kv_caches):
         for i, (batch_size, seq_len) in enumerate(reversed(buckets)):
             self.log_warmup('Prompt' if is_prompt else 'Decode', i,
                             len(buckets), batch_size, seq_len)
-            self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches,
-                                 temperature=0)
+            self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches)
 
-            # Warm up random sampler once per batch size
             if batch_size not in self.warmed_sampler_bs and not is_prompt:
                 self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches,
                                      temperature=1.0)
                 self.warmed_sampler_bs.append(batch_size)

From e0e37e0ea06ff1c2cfc0234eec507a5c93176ada Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Fri, 15 Nov 2024 16:15:25 +0200
Subject: [PATCH 4/8] Remove comments

---
 benchmarks/benchmark_throughput.py | 2 --
 vllm/worker/hpu_model_runner.py    | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 2668949e6a1f1..5f8c686b88fe4 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -151,9 +151,7 @@ def run_vllm(
     if not use_beam_search:
         start = time.perf_counter()
-        #llm.start_profile()
         llm.generate(prompts, sampling_params, use_tqdm=True)
-        #llm.stop_profile()
         end = time.perf_counter()
     else:
         prompts = [request.prompt for request in requests]
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 62df4c2a1e598..8c772ee70710c 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1217,7 +1217,7 @@ def prepare_input_tensors(
             seq_group_metadata_list = seq_group_metadata_list.copy()
             if batch_size_padding > 0:
                 dummy_seq_group_metadata = self.create_dummy_seq_group_metadata(
-                    0, 0, is_prompt, temperature=0)
+                    0, 0, is_prompt)
                 seq_group_metadata_list.extend(dummy_seq_group_metadata
                                                for _ in range(batch_size_padding))

From 0175fe0c1dab3bdae059a694ff0a767261f574b8 Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Fri, 15 Nov 2024 16:44:47 +0200
Subject: [PATCH 5/8] Formatting

---
 vllm/worker/hpu_model_runner.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 8c772ee70710c..660ce3097d1b1 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -646,8 +646,6 @@ def __init__(
         # For multi-step scheduling
         self.cached_step_outputs: List[torch.Tensor] = []
 
-        self.warmed_sampler_bs = []
-
     def _set_gc_threshold(self) -> None:
         # Read https://docs.python.org/3/library/gc.html#gc.set_threshold
         # for comprehensive description of gc generations.
@@ -795,6 +793,7 @@ def _setup_buckets(self) -> None:
             max=max(self.block_size,
                     self.max_num_seqs * max_decode_seq // self.block_size))
         self.graphed_buckets: Set[Any] = set()
+        self.warmed_sampler_bs: List[int] = []
 
         msg = ("Prompt bucket config (min, step, max_warmup) "
               f"bs:{self.bucketing_global_state.prompt_bs_bucket_cfg}, "
@@ -1473,8 +1472,7 @@ def warmup_scenario(self,
                     is_prompt,
                     lora_request=dummy_lora_requests_per_seq[i]
                     if dummy_lora_requests_per_seq else None,
-                    temperature=temperature)
-                for i in range(batch_size)
+                    temperature=temperature) for i in range(batch_size)
             ]
         else:
@@ -1487,8 +1485,7 @@ def warmup_scenario(self,
                     is_prompt,
                     lora_request=dummy_lora_requests_per_seq[i]
                     if dummy_lora_requests_per_seq else None,
-                    temperature=temperature)
-                for i, b in enumerate(blocks)
+                    temperature=temperature) for i, b in enumerate(blocks)
             ]
         torch.hpu.synchronize()
         profiler = None
@@ -1574,10 +1571,14 @@ def warmup_all_buckets(self, buckets, is_prompt, kv_caches):
             self.log_warmup('Prompt' if is_prompt else 'Decode', i,
                             len(buckets), batch_size, seq_len)
             self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches)
-
+
+            # Random sampler warmup
             if batch_size not in self.warmed_sampler_bs and not is_prompt:
-                self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches,
-                                     temperature=1.0)
+                self.warmup_scenario(batch_size,
+                                     seq_len,
+                                     is_prompt,
+                                     kv_caches,
+                                     temperature=1.0)
                 self.warmed_sampler_bs.append(batch_size)
 
     def warmup_graphs(self,

From b38b160f2515243496c9db93d0509b13d703e39a Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Fri, 15 Nov 2024 17:16:57 +0200
Subject: [PATCH 6/8] Move the warmup to graph capture function

---
 vllm/worker/hpu_model_runner.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 660ce3097d1b1..2063be60ca358 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -793,7 +793,6 @@ def _setup_buckets(self) -> None:
             max=max(self.block_size,
                     self.max_num_seqs * max_decode_seq // self.block_size))
         self.graphed_buckets: Set[Any] = set()
-        self.warmed_sampler_bs: List[int] = []
 
         msg = ("Prompt bucket config (min, step, max_warmup) "
               f"bs:{self.bucketing_global_state.prompt_bs_bucket_cfg}, "
@@ -1572,15 +1571,6 @@ def warmup_all_buckets(self, buckets, is_prompt, kv_caches):
                             len(buckets), batch_size, seq_len)
             self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches)
 
-            # Random sampler warmup
-            if batch_size not in self.warmed_sampler_bs and not is_prompt:
-                self.warmup_scenario(batch_size,
-                                     seq_len,
-                                     is_prompt,
-                                     kv_caches,
-                                     temperature=1.0)
-                self.warmed_sampler_bs.append(batch_size)
-
     def warmup_graphs(self,
                       strategy,
                       buckets,
@@ -1604,6 +1594,7 @@ def warmup_graphs(self,
                 f'Unsupported graph allocation strategy: {strategy}')
         buckets = list(sorted(buckets, key=ordering))
         captured_all = True
+        warmed_random_sampler_bs: Set[int] = set()
         for idx, (batch_size, seq_len) in enumerate(buckets):
             # Graph memory usage is proportional to seq dimension in a batch
             batch_seq = batch_size * seq_len if is_prompt else batch_size
@@ -1617,7 +1608,13 @@ def warmup_graphs(self,
             self.graphed_buckets.add(graphed_bucket)
             self.log_warmup(phase, idx, num_candidates, batch_size, seq_len)
             with HabanaMemoryProfiler() as mem_prof:
-                self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches)
+                self.warmup_scenario(batch_size,
+                                     seq_len,
+                                     is_prompt,
+                                     kv_caches,
+                                     temperature=1.0 if batch_size not
+                                     in warmed_random_sampler_bs else 0)
+                warmed_random_sampler_bs.append(batch_size)
             used_mem = align_workers(mem_prof.consumed_device_memory,
                                      torch.distributed.ReduceOp.MAX)
             available_mem -= used_mem

From 76aa48a212bb83ff392ddfd92d4e9de2f09cdefb Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Fri, 15 Nov 2024 17:22:49 +0200
Subject: [PATCH 7/8] Bug fix

---
 vllm/worker/hpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 2063be60ca358..c49e963fbf7ec 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1614,7 +1614,7 @@ def warmup_graphs(self,
                                      kv_caches,
                                      temperature=1.0 if batch_size not
                                      in warmed_random_sampler_bs else 0)
-                warmed_random_sampler_bs.append(batch_size)
+                warmed_random_sampler_bs.add(batch_size)
             used_mem = align_workers(mem_prof.consumed_device_memory,
                                      torch.distributed.ReduceOp.MAX)
             available_mem -= used_mem

From e24a5af4ff9107d07cc28b156c3c3057ee4934eb Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Fri, 15 Nov 2024 17:27:13 +0200
Subject: [PATCH 8/8] Formatting

---
 vllm/worker/hpu_model_runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index c49e963fbf7ec..4b508c8b1a0f2 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1612,8 +1612,8 @@ def warmup_graphs(self,
                                      seq_len,
                                      is_prompt,
                                      kv_caches,
-                                     temperature=1.0 if batch_size not
-                                     in warmed_random_sampler_bs else 0)
+                                     temperature=1.0 if batch_size
+                                     not in warmed_random_sampler_bs else 0)
                 warmed_random_sampler_bs.add(batch_size)
             used_mem = align_workers(mem_prof.consumed_device_memory,
                                      torch.distributed.ReduceOp.MAX)
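Taken together, the series ends with the random-sampler warmup folded into the graph-capture loop: the first bucket seen for a given batch size is captured with temperature=1.0, and every later bucket with the same batch size falls back to temperature=0. A standalone sketch of that bookkeeping follows; the function name and the example bucket tuples are made up for illustration, while the real logic lives in warmup_graphs in vllm/worker/hpu_model_runner.py.

from typing import List, Set, Tuple

def plan_warmup_temperatures(
        buckets: List[Tuple[int, int]]) -> List[Tuple[int, int, float]]:
    """Pick a warmup temperature per (batch_size, seq_len) bucket."""
    warmed_random_sampler_bs: Set[int] = set()
    plan = []
    for batch_size, seq_len in buckets:
        # First time this batch size appears: exercise the random sampler.
        temperature = 1.0 if batch_size not in warmed_random_sampler_bs else 0.0
        warmed_random_sampler_bs.add(batch_size)
        plan.append((batch_size, seq_len, temperature))
    return plan

# Two buckets share batch size 4; only the first one uses temperature=1.0.
print(plan_warmup_temperatures([(4, 128), (4, 256), (8, 128)]))
# -> [(4, 128, 1.0), (4, 256, 0.0), (8, 128, 1.0)]

The "Bug fix" patch (7/8) matters here because warmed_random_sampler_bs is a set, which provides add() but has no append().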