diff --git a/requirements-hpu.txt b/requirements-hpu.txt
index 0279939bb0ce6..59a97edbc3fbf 100644
--- a/requirements-hpu.txt
+++ b/requirements-hpu.txt
@@ -8,4 +8,4 @@ pandas
 tabulate
 setuptools>=61
 setuptools-scm>=8
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@41ff369
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@48d0303
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 2b19668cca79b..36dc41955d887 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1326,9 +1326,6 @@ def _advance_to_next_step(
             else:
                 seq.append_token_id(sample.output_token, sample.logprobs)
 
-    def finish_measurements(self):
-        self.model_executor.finish_measurements()
-
     def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]:
         """Performs one decoding iteration and returns newly generated results.
 
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 81f0886932843..65fa9873df28c 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -256,10 +256,6 @@ def set_tokenizer(self, tokenizer: AnyTokenizer) -> None:
         else:
             tokenizer_group.tokenizer = get_cached_tokenizer(tokenizer)
 
-    def finish_measurements(self):
-        assert not envs.VLLM_USE_V1, "INC does not support vLLM V1"
-        self.llm_engine.finish_measurements()  # type: ignore[attr-defined]
-
     @overload  # LEGACY: single (prompt + optional token ids)
     @deprecated("'prompt_token_ids' will become part of 'prompts")
     def generate(
diff --git a/vllm/executor/hpu_executor.py b/vllm/executor/hpu_executor.py
index c7dfe9848c635..02a7bcc72b8ce 100644
--- a/vllm/executor/hpu_executor.py
+++ b/vllm/executor/hpu_executor.py
@@ -81,9 +81,6 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
         msg = f"init_cache_engine took {cache_init_m.get_summary_string()}"
         logger.info(msg)
 
-    def finish_measurements(self):
-        self.driver_worker.finish_measurements()
-
     def execute_model(
         self,
         execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
@@ -187,10 +184,8 @@ def start_profile(self) -> None:
     def stop_profile(self) -> None:
         self.driver_worker.stop_profile()
 
-    def shutdown(self) -> None:
-        if hasattr(self, "driver_worker") and hasattr(self.driver_worker,
-                                                      'shutdown_inc'):
-            self.driver_worker.shutdown_inc()
+    def shutdown_inc(self) -> None:
+        self.driver_worker.shutdown_inc()
 
 
 class HPUExecutorAsync(HPUExecutor, ExecutorAsyncBase):
diff --git a/vllm/executor/ray_hpu_executor.py b/vllm/executor/ray_hpu_executor.py
index 1c6825d8d365b..3b3a3bc3da42c 100644
--- a/vllm/executor/ray_hpu_executor.py
+++ b/vllm/executor/ray_hpu_executor.py
@@ -70,9 +70,13 @@ def _init_executor(self) -> None:
         self.output_decoder = msgspec.msgpack.Decoder(
             Optional[List[SamplerOutput]])
 
+        self.terminate_ray = True
+
     def shutdown(self) -> None:
-        for worker in self.workers:
-            worker.__ray_terminate__.remote()
+        if getattr(self, 'terminate_ray', False):
+            for worker in self.workers:
+                worker.__ray_terminate__.remote()
+            self.terminate_ray = False
         if hasattr(self, "forward_dag") and self.forward_dag is not None:
             self.forward_dag.teardown()
             import ray
@@ -80,8 +84,8 @@ def shutdown(self) -> None:
                 ray.kill(worker)
             self.forward_dag = None
 
-    def finish_measurements(self):
-        self._run_workers("finish_measurements")
+    def shutdown_inc(self):
+        self._run_workers("shutdown_inc")
 
     def _get_worker_module_and_class(
         self
@@ -479,9 +483,6 @@ def _compiled_ray_dag(self, enable_asyncio: bool):
 
         return forward_dag.experimental_compile(enable_asyncio=enable_asyncio)
 
-    def __del__(self):
-        self.shutdown()
-
 
 class RayHPUExecutorAsync(RayHPUExecutor, DistributedGPUExecutorAsync):
 
@@ -552,6 +553,3 @@ async def _start_worker_execution_loop(self):
             for worker in self.non_driver_workers
         ]
         return await asyncio.gather(*coros)
-
-    def __del__(self):
-        self.shutdown()
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 17ddc7e12926d..d822d4e4898d6 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1894,10 +1894,6 @@ def prepare_model_input(
             is_prompt=is_prompt,
             virtual_engine=virtual_engine)
 
-    def finish_measurements(self):
-        from neural_compressor.torch.quantization import finalize_calibration
-        finalize_calibration(self.model.model)
-
     def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode):
         cfg = (batch_size, seq_len, is_prompt)
         seen = cfg in self.seen_configs
@@ -2265,18 +2261,12 @@ def _make_decode_output(
         return SamplerOutput(sampler_outputs)
 
     def shutdown_inc(self):
-        can_finalize_inc = False
-        from contextlib import suppress
-        with suppress(AttributeError):
-            can_finalize_inc = (self.model_config.quantization == 'inc') and \
-                (self.model.model is not None) and \
-                self.inc_initialized_successfully and \
-                not getattr(self, "_is_inc_finalized", False)
+        can_finalize_inc = (self.model_config.quantization == 'inc') and \
+            (self.model.model is not None) and \
+            self.inc_initialized_successfully and \
+            not getattr(self, "_is_inc_finalized", False)
         if can_finalize_inc:
             from neural_compressor.torch.quantization import (
                 finalize_calibration)
             finalize_calibration(self.model.model)
             self._is_inc_finalized = True
-
-    def __del__(self):
-        self.shutdown_inc()
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index 1ab47e7de80f5..ea0c9e345f536 100644
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -318,9 +318,6 @@ def _warm_up_model(self) -> None:
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)
 
-    def finish_measurements(self):
-        self.model_runner.finish_measurements()
-
     @property
     def do_metadata_broadcast(self) -> bool:
         return self.parallel_config.tensor_parallel_size > 1
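
For context, a minimal usage sketch of how a caller might now finalize INC calibration explicitly, given that finish_measurements() and the __del__ hooks are removed by this diff. The model name, the INC-enabled HPU build, and the exact call site are illustrative assumptions, not taken from the diff; only shutdown_inc() on the HPU executors is introduced above.

    # Minimal sketch, not part of this diff. Assumes an HPU build of vLLM (V0
    # engine) with INC (Intel Neural Compressor) quantization enabled; the
    # model name and explicit call site are hypothetical.
    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m", quantization="inc")  # hypothetical INC run
    outputs = llm.generate(["Hello"], SamplingParams(max_tokens=8))

    # With finish_measurements() and the __del__ hooks gone, calibration is
    # finalized by invoking shutdown_inc() on the executor (added above for
    # HPUExecutor and RayHPUExecutor) before the process exits.
    llm.llm_engine.model_executor.shutdown_inc()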