From d2128b456ed44064637cb989499beb093865feab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Kot=C5=82owski?=
Date: Mon, 16 Dec 2024 10:45:35 +0100
Subject: [PATCH 1/7] Remove workaround for one_hot in eager/compile (#632)

Now that the one_hot operator has an implementation for eager and compile
mode, the workaround is no longer needed.
---
 vllm/worker/hpu_model_runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index def57fd0965ef..b51f6c1a88f0d 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -296,11 +296,11 @@ def _set_block_mapping(self, metadata, batch_size, device, dtype):
         attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_(
             mask, -math.inf))

-        if not is_fake_hpu() and htorch.utils.internal.is_lazy():
+        if not is_fake_hpu():
             block_mapping = torch.nn.functional.one_hot(metadata.block_groups,
                                                         num_classes=batch_size)
         else:
-            # Unfortunately one_hot on CPU/torch.compile mode/eager mode
+            # Unfortunately one_hot on CPU
             # doesn't handle out of bounds classes so we need to convert
             # all negative values to 0 (block_mapping) or bs (block_groups)
             block_groups = metadata.block_groups.to(torch.long)

From 11c07e3e44f751ba3f7d1bbf75d563741704978f Mon Sep 17 00:00:00 2001
From: Nir David <124874956+nirda7@users.noreply.github.com>
Date: Mon, 16 Dec 2024 16:13:14 +0200
Subject: [PATCH 2/7] Add shutdown_inc method to MultiprocessingHPUExecutor (#634)

---
 vllm/executor/multiproc_hpu_executor.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/executor/multiproc_hpu_executor.py b/vllm/executor/multiproc_hpu_executor.py
index 118a18c02b072..a82fff956738f 100644
--- a/vllm/executor/multiproc_hpu_executor.py
+++ b/vllm/executor/multiproc_hpu_executor.py
@@ -42,6 +42,9 @@ def _check_executor_parameters(self):
             f"please ensure that world_size ({world_size}) "
             f"is less than than max local hpu count ({hpu_device_count})")

+    def shutdown_inc(self):
+        self._run_workers("shutdown_inc")
+
     def __del__(self):
         self.shutdown()


From ba1d24baa6abf4e749a8809242c905b7d4881540 Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Mon, 16 Dec 2024 15:54:00 +0100
Subject: [PATCH 3/7] Fix recompilations due to different batch_sizes in MSS (#637)

Fix for batch size padding in multi-step scheduling by @SanjuCSudhakaran.

Co-authored-by: Sanju C Sudhakaran
---
 vllm/worker/hpu_model_runner.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index b51f6c1a88f0d..7c3679d40546d 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -2019,6 +2019,19 @@ def create_lora_mask(self, input_tokens: torch.Tensor, lora_ids: List[int],

         return lora_mask, lora_logits_mask

+    def add_dummy_seq(self, seq_group_metadata_list, is_prompt):
+        real_batch_size = len(seq_group_metadata_list)
+        batch_size_padded = self.bucketing_ctx.get_padded_batch_size(
+            real_batch_size, is_prompt)
+        batch_size_padding = batch_size_padded - real_batch_size
+        seq_group_metadata_list = seq_group_metadata_list.copy()
+        if batch_size_padding > 0:
+            dummy_seq_group_metadata = self.create_dummy_seq_group_metadata(
+                0, 0, is_prompt)
+            seq_group_metadata_list.extend(dummy_seq_group_metadata
+                                           for _ in range(batch_size_padding))
+        return seq_group_metadata_list
+
     @torch.inference_mode()
     def execute_model(
         self,
@@ -2105,8 +2118,8 @@ def execute_model(
         def try_revert_dummy_output_tokens():
             if len(cache_orig_output_tokens_len) > 0:
                 # Reuse the original output token ids length
-                for i, seq_group_metadata in enumerate(
-                        seq_group_metadata_list):
+                for i in range(len(cache_orig_output_tokens_len)):
+                    seq_group_metadata = seq_group_metadata_list[i]
                     for j, data in seq_group_metadata.seq_data.items():
                         orig_output_tokens_len = \
                             cache_orig_output_tokens_len[i][j]
@@ -2184,7 +2197,7 @@ def try_revert_dummy_output_tokens():
                     else:
                         raise RuntimeError(
                             "seq_group_metadata_list is uninitialized")
-                    for i, seq_group_metadata in enumerate(
+                    for seq_idx, seq_group_metadata in enumerate(
                             seq_group_metadata_list):
                         # Skip empty steps
                         seq_group_metadata.state.current_step += (
@@ -2192,8 +2205,10 @@ def try_revert_dummy_output_tokens():
                         # Cache the original output token ids
                         cache_orig_output_tokens_len.append({})
                         for j, data in seq_group_metadata.seq_data.items():
-                            cache_orig_output_tokens_len[i][j] = \
+                            cache_orig_output_tokens_len[seq_idx][j] = \
                                 len(data.output_token_ids)
+                    seq_group_metadata_list = self.add_dummy_seq(
+                        seq_group_metadata_list, is_prompt=False)
                     for seq_group_metadata in seq_group_metadata_list:
                         for data in seq_group_metadata.seq_data.values():
                             max_output_len = sampling_metadata.seq_groups[

From c9a740f4dcfdace12a1ab84d3b8f9682523b6e65 Mon Sep 17 00:00:00 2001
From: Artur Fierka
Date: Mon, 16 Dec 2024 16:42:08 +0100
Subject: [PATCH 4/7] Fix CI reports (#636)

---
 .jenkins/lm-eval-harness/run-tests.sh | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/.jenkins/lm-eval-harness/run-tests.sh b/.jenkins/lm-eval-harness/run-tests.sh
index e090f7108bcac..2370388ebc66d 100644
--- a/.jenkins/lm-eval-harness/run-tests.sh
+++ b/.jenkins/lm-eval-harness/run-tests.sh
@@ -43,14 +43,16 @@ do
   export PT_HPU_ENABLE_LAZY_COLLECTIVES=true
   export VLLM_SKIP_WARMUP=true
   RANDOM_SUFFIX=$(tr -dc A-Za-z0-9
Date: Mon, 16 Dec 2024 16:52:31 +0100
Subject: [PATCH 5/7] Unit scales in FP8 CI scenarios (#633)

---
 .../configs/Meta-Llama-3.1-8B-Instruct-fp8.yaml |  6 +++---
 .../lm-eval-harness/inc_unit_scales_config.json | 16 ++++++++++++++++
 .../lm-eval-harness/test_lm_eval_correctness.py |  8 +++-----
 3 files changed, 22 insertions(+), 8 deletions(-)
 create mode 100644 .jenkins/lm-eval-harness/inc_unit_scales_config.json

diff --git a/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct-fp8.yaml b/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct-fp8.yaml
index 80a8c522bc5a0..5c1cd657e8e36 100644
--- a/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct-fp8.yaml
+++ b/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct-fp8.yaml
@@ -5,10 +5,10 @@ tasks:
 - name: "gsm8k_cot_llama"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.8317
+    value: 0.664
   - name: "exact_match,flexible-extract"
-    value: 0.8355
-limit: null
+    value: 0.676
+limit: 250
 num_fewshot: 8
 dtype: "bfloat16"
 fewshot_as_multiturn: true
diff --git a/.jenkins/lm-eval-harness/inc_unit_scales_config.json b/.jenkins/lm-eval-harness/inc_unit_scales_config.json
new file mode 100644
index 0000000000000..cd6589c811417
--- /dev/null
+++ b/.jenkins/lm-eval-harness/inc_unit_scales_config.json
@@ -0,0 +1,16 @@
+{
+    "mode": "QUANTIZE",
+    "observer": "maxabs",
+    "scale_method": "unit_scale",
+    "allowlist": {
+        "types": [],
+        "names": []
+    },
+    "blocklist": {
+        "types": [],
+        "names": [
+            "lm_head"
+        ]
+    },
+    "dump_stats_path": ""
+}
\ No newline at end of file
diff --git a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py
index 9272123034350..55d633e51ce97 100644
--- a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py
@@ -27,12 +27,10 @@
 TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)


-def setup_fp8(model_path, device_type):
-    flavor = f"g{device_type[-1]}"
-    normalized_model_name = Path(model_path).parts[-1].lower()
+def setup_fp8():
     os.environ[
         "QUANT_CONFIG"] = \
-            f"/software/data/vllm-benchmarks/inc/{normalized_model_name}/maxabs_quant_{flavor}.json"
+            "inc_unit_scales_config.json"


 def fail_on_exit():
@@ -147,7 +145,7 @@ def test_lm_eval_correctness(record_xml_attribute, record_property):

     # Set up environment for FP8 inference
     if eval_config.get("fp8"):
-        setup_fp8(eval_config["model_name"], platform)
+        setup_fp8()
     # Launch eval requests.
     start_time = time.perf_counter()
     results = launch_lm_eval(eval_config)

From d81f829e170d40f303e1a287d7278da8166a50d9 Mon Sep 17 00:00:00 2001
From: RafLit
Date: Wed, 18 Dec 2024 13:27:43 +0100
Subject: [PATCH 6/7] TC llama recompile fix - no_grad to inference_mode (#640)

During warmup, inference_mode is used, but at runtime it is overwritten by
no_grad mode; this causes recompilations due to a dispatch key mismatch in
torch.compile. This switches the no_grad mode to the inference_mode from
the base class.

---------

Co-authored-by: Rafal Litka
---
 vllm/platforms/hpu.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index ee83187fff797..314cd98212e9c 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -1,7 +1,5 @@
 from typing import TYPE_CHECKING, Optional

-import torch
-
 from .interface import Platform, PlatformEnum, _Backend

 if TYPE_CHECKING:
@@ -24,10 +22,6 @@ def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
     def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
         return True

-    @staticmethod
-    def inference_mode():
-        return torch.no_grad()
-
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:


From 88ef38104a7746f7a5dd7042a35d1df81e04320b Mon Sep 17 00:00:00 2001
From: Tomasz Zielinski <85164140+tzielinski-habana@users.noreply.github.com>
Date: Wed, 18 Dec 2024 14:48:23 +0100
Subject: [PATCH 7/7] Generic call for prepare_cos_sin in rotary embedding (#638)

Generic name discovery for rope.prepare_cos_sin.
It fixes errors in models that don't follow a specific naming hierarchy.
---
 vllm/worker/hpu_model_runner.py | 99 ++++++++++++++++++---------------
 1 file changed, 55 insertions(+), 44 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 7c3679d40546d..d3090d313d155 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -169,40 +169,37 @@ def forward_hook(module, args, output):
         modify_decoder_layer(child_module, suffix, n, counter)


-def get_names_for_rope(model: torch.nn.Module):
-    """Dynamically get layer names needed for cos and sin preparation for rope.
-
-    Every model can have a different naming convention for it's layers.
-    This function dynamically retrieves layer names to access rope layer.
-    If there's no rope layer, the function returns None.
-
-    This function assumes the following layer type layout:
-    Model -> ModuleList -> Attention -> RotaryEmbedding
+def get_path_to_rope(model: torch.nn.Module):
+    """Dynamically get the path to the RotaryEmbedding layer in the model.
+    This function will recursively search through the module hierarchy to find
+    a RotaryEmbedding layer and return the full path to that layer as a list
+    of names.
+    If no such layer is found, it returns None.
     """

-    def get_child(parent, suffix, is_list=False):
+    def find_rope_layer(parent, path):
+        # Base case: check if this parent is None
         if parent is None:
-            return None, None
-        parent = parent[0] if is_list else parent
-        for child_name, child_module in parent.named_children():
-            if child_module.__class__.__name__.endswith(suffix):
-                return child_name, child_module
-        return None, None
-
-    model_name, model_module = get_child(model, "Model")
-    layers_name, layers_module = get_child(model_module, "ModuleList")
-    attn_name, attn_module = get_child(layers_module,
-                                       "Attention",
-                                       is_list=True)
-    rope_name, _ = get_child(attn_module, "RotaryEmbedding")
-
-    if rope_name is not None:
-        return {
-            'model_name': model_name,
-            'layers_name': layers_name,
-            'attn_name': attn_name,
-            'rope_name': rope_name
-        }
+            return None
+
+        # Check if the current layer is a RotaryEmbedding
+        if hasattr(parent, 'named_children'):
+            for child_name, child_module in parent.named_children():
+                # If the current child is of type RotaryEmbedding,
+                # return the full path
+                if child_module.__class__.__name__.endswith("RotaryEmbedding"):
+                    return path + [child_name]
+                # Otherwise, recurse into this child to check its children
+                result = find_rope_layer(child_module, path + [child_name])
+                if result is not None:
+                    return result
+        return None
+
+    # Start the search from the top level model
+    path_to_rope = find_rope_layer(model, [])
+
+    # Return the result if found, otherwise None
+    return path_to_rope


 class HpuModelAdapter:
@@ -353,17 +350,31 @@ def _update_metadata(self, attn_metadata, batch_size, seq_len, device,
         return attn_metadata

     def _prepare_cos_sin(self, positions):
-        model_name = self.layer_names['model_name']
-        layers_name = self.layer_names['layers_name']
-        attn_name = self.layer_names['attn_name']
-        rope_name = self.layer_names['rope_name']
-
-        base_model = getattr(self.model, model_name)
-        first_model_layer = getattr(base_model, layers_name)[0]
-        attention_layer = getattr(first_model_layer, attn_name)
-        rope = getattr(attention_layer, rope_name)
-
-        rope.prepare_cos_sin(positions)
+        """Navigate through the model using the provided path and call
+        the prepare_cos_sin method on the 'RotaryEmbedding' layer."""
+
+        current_module = self.model  # Start from the top level of the model
+
+        for layer in self.layer_names:
+            if layer.isdigit():  # Check if the layer is an index
+                layer = int(layer)
+
+            # Check if the current layer is a name in a module
+            if isinstance(
+                    layer,
+                    str) and not isinstance(layer, int):  # Name-based access
+                current_module = getattr(current_module, layer)
+            elif isinstance(layer,
+                            int):  # Indexed-based access (like ModuleList)
+                current_module = list(current_module._modules.values())[layer]
+
+        # At the end, we should be at the RotaryEmbedding layer.
+        if hasattr(current_module, 'prepare_cos_sin'):
+            current_module.prepare_cos_sin(positions)
+        else:
+            raise AttributeError(
+                "The module at the end of the path does not have \
+                a 'prepare_cos_sin' method.")

     def forward(self, *args, **kwargs):
         kwargs = kwargs.copy()
@@ -744,7 +755,7 @@ def load_model(self) -> None:
                 get_decoder_layer_suffix(model_config.model_type if
                                          model_config is not None else None),
                 hidden_layer_markstep_interval)
-            names_for_rope = get_names_for_rope(self.model)
+            path_to_rope = get_path_to_rope(self.model)
             torch.hpu.synchronize()

             with HabanaMemoryProfiler() as m_wrap:
@@ -753,7 +764,7 @@ def load_model(self) -> None:
                     self.block_size,
                     dtype=self.model_config.dtype,
                     enforce_eager=self.enforce_eager,
-                    layer_names=names_for_rope)
+                    layer_names=path_to_rope)
             msg = f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}"
             logger.info(msg)