diff --git a/vllm/worker/hpu_enc_dec_model_runner.py b/vllm/worker/hpu_enc_dec_model_runner.py
index 1b136fcd98d29..1ec3ef3b1ea1d 100644
--- a/vllm/worker/hpu_enc_dec_model_runner.py
+++ b/vllm/worker/hpu_enc_dec_model_runner.py
@@ -27,7 +27,6 @@
     _add_sampling_metadata_broadcastable_dict)
 from vllm.distributed import broadcast_tensor_dict
 
-
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend
 
@@ -540,7 +539,6 @@ def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode):
             logger.warning("Configuration: (%s, %s, %s) was not warmed-up!",
                            phase, batch_size, seq_len)
 
-
     def add_dummy_seq(self, seq_group_metadata_list, is_prompt):
         real_batch_size = len(seq_group_metadata_list)
         batch_size_padded = self.bucketing_ctx.get_padded_batch_size(
@@ -554,7 +552,6 @@ def add_dummy_seq(self, seq_group_metadata_list, is_prompt):
                                            for _ in range(batch_size_padding))
         return seq_group_metadata_list
 
-
     @torch.inference_mode()
     def execute_model(
         self,
@@ -593,7 +590,6 @@ def execute_model(
         use_graphs = self._use_graphs(batch_size, seq_len, is_prompt)
         self._check_config(batch_size, seq_len, is_prompt, warmup_mode)
 
-
         execute_model_kwargs = {
             "input_ids": input_tokens,
             "positions": input_positions,
@@ -776,7 +772,6 @@ def try_revert_dummy_output_tokens():
 
         return output if type(output) is list else [output]
 
-
     def _decode_sampler_outputs(self, model_input):
         use_async_out_proc = model_input.async_callback is not None
         sampler_outputs = []
@@ -806,7 +801,6 @@ def _decode_sampler_outputs(self, model_input):
         else:
             return sampler_outputs
 
-
     def _make_decode_output(
         self,
         next_token_ids: List[List[int]],
@@ -827,4 +821,4 @@ def _make_decode_output(
             sampler_outputs.append(
                 CompletionSequenceGroupOutput(seq_outputs, None))
         return SamplerOutput(sampler_outputs)
-
\ No newline at end of file
+