fix hpu destructors flow and remove finish_measurements #379

Merged · 2 commits · Dec 9, 2024
requirements-hpu.txt (2 changes: 1 addition & 1 deletion)

@@ -8,4 +8,4 @@ pandas
 tabulate
 setuptools>=61
 setuptools-scm>=8
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@41ff369
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@48d0303
vllm/engine/llm_engine.py (3 changes: 0 additions & 3 deletions)

@@ -1326,9 +1326,6 @@ def _advance_to_next_step(
                 else:
                     seq.append_token_id(sample.output_token, sample.logprobs)
 
-    def finish_measurements(self):
-        self.model_executor.finish_measurements()
-
     def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]:
         """Performs one decoding iteration and returns newly generated results.
 
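Note: the engine-level removal above is one link in a chain. `finish_measurements` was a thin delegation from the user-facing API down to the model runner, and this PR deletes every link (see the matching hunks in llm.py, hpu_executor.py, hpu_worker.py, and hpu_model_runner.py below). A minimal, self-contained sketch of the old chain follows; class and method names come from the diff, while the bodies are simplified stand-ins, not the real code:

```python
# Sketch of the delegation chain deleted by this PR (simplified bodies).

class ModelRunner:
    def finish_measurements(self) -> None:
        # Real code: finalize_calibration(self.model.model) from
        # neural_compressor.torch.quantization.
        print("finalizing INC calibration")

class Worker:
    def __init__(self) -> None:
        self.model_runner = ModelRunner()

    def finish_measurements(self) -> None:
        self.model_runner.finish_measurements()

class Executor:
    def __init__(self) -> None:
        self.driver_worker = Worker()

    def finish_measurements(self) -> None:
        self.driver_worker.finish_measurements()

# Old flow: LLM.finish_measurements() -> engine -> executor -> worker
# -> model runner. New flow: nothing calls finish_measurements();
# finalization happens once, during teardown, via shutdown_inc().
Executor().finish_measurements()
```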
vllm/entrypoints/llm.py (4 changes: 0 additions & 4 deletions)

@@ -256,10 +256,6 @@ def set_tokenizer(self, tokenizer: AnyTokenizer) -> None:
         else:
             tokenizer_group.tokenizer = get_cached_tokenizer(tokenizer)
 
-    def finish_measurements(self):
-        assert not envs.VLLM_USE_V1, "INC does not support vLLM V1"
-        self.llm_engine.finish_measurements()  # type: ignore[attr-defined]
-
     @overload  # LEGACY: single (prompt + optional token ids)
     @deprecated("'prompt_token_ids' will become part of 'prompts")
     def generate(
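Note: from the caller's perspective, removing this public method means an INC calibration run no longer needs an explicit finalization step. A hedged before/after usage sketch; the model name and prompts are placeholders, and the `quantization="inc"` value is taken from the model-runner check later in this diff:

```python
from vllm import LLM

# Placeholder model and prompts for a hypothetical INC calibration run.
llm = LLM(model="my-org/my-model", quantization="inc")
llm.generate(["calibration prompt one", "calibration prompt two"])

# Before this PR: an explicit llm.finish_measurements() was required here.
# After this PR: teardown reaches HPUModelRunner.__del__ -> shutdown_inc()
# -> finalize_calibration(), so letting the object go out of scope (or
# exiting the process) finalizes the measurements exactly once.
del llm
```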
vllm/executor/hpu_executor.py (9 changes: 2 additions & 7 deletions)

@@ -81,9 +81,6 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
         msg = f"init_cache_engine took {cache_init_m.get_summary_string()}"
         logger.info(msg)
 
-    def finish_measurements(self):
-        self.driver_worker.finish_measurements()
-
     def execute_model(
         self,
         execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:

@@ -187,10 +184,8 @@ def start_profile(self) -> None:
     def stop_profile(self) -> None:
         self.driver_worker.stop_profile()
 
-    def shutdown(self) -> None:
-        if hasattr(self, "driver_worker") and hasattr(self.driver_worker,
-                                                      'shutdown_inc'):
-            self.driver_worker.shutdown_inc()
+    def shutdown_inc(self) -> None:
+        self.driver_worker.shutdown_inc()
 
 
 class HPUExecutorAsync(HPUExecutor, ExecutorAsyncBase):
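Note: the second hunk is a rename with a behavioral point. The old code overrode the generic `shutdown()` to sneak in INC cleanup behind a `hasattr` guard; the new code exposes a dedicated `shutdown_inc()` and leaves `shutdown()` with its inherited meaning. A small sketch of the separation; `ExecutorBase` here is a stand-in assumption, not vLLM's actual executor hierarchy:

```python
# Why a dedicated hook beats overriding shutdown(): each teardown path
# keeps exactly one job, and no hasattr probing is needed.

class ExecutorBase:
    def shutdown(self) -> None:
        print("releasing generic executor resources")

class HPUExecutorSketch(ExecutorBase):
    def shutdown_inc(self) -> None:
        # INC-specific teardown; in the real diff it delegates to
        # self.driver_worker.shutdown_inc().
        print("finalizing INC calibration on the driver worker")

executor = HPUExecutorSketch()
executor.shutdown_inc()  # INC cleanup, explicit and separate
executor.shutdown()      # base semantics preserved
```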
vllm/executor/ray_hpu_executor.py (18 changes: 8 additions & 10 deletions)

@@ -70,18 +70,22 @@ def _init_executor(self) -> None:
         self.output_decoder = msgspec.msgpack.Decoder(
             Optional[List[SamplerOutput]])
 
+        self.terminate_ray = True
+
     def shutdown(self) -> None:
-        for worker in self.workers:
-            worker.__ray_terminate__.remote()
+        if getattr(self, 'terminate_ray', False):
+            for worker in self.workers:
+                worker.__ray_terminate__.remote()
+            self.terminate_ray = False
         if hasattr(self, "forward_dag") and self.forward_dag is not None:
             self.forward_dag.teardown()
             import ray
             for worker in self.workers:
                 ray.kill(worker)
             self.forward_dag = None
 
-    def finish_measurements(self):
-        self._run_workers("finish_measurements")
+    def shutdown_inc(self):
+        self._run_workers("shutdown_inc")
 
     def _get_worker_module_and_class(
         self

@@ -479,9 +483,6 @@ def _compiled_ray_dag(self, enable_asyncio: bool):
 
         return forward_dag.experimental_compile(enable_asyncio=enable_asyncio)
 
-    def __del__(self):
-        self.shutdown()
-
 
 class RayHPUExecutorAsync(RayHPUExecutor, DistributedGPUExecutorAsync):

@@ -552,6 +553,3 @@ async def _start_worker_execution_loop(self):
             for worker in self.non_driver_workers
         ]
         return await asyncio.gather(*coros)
-
-    def __del__(self):
-        self.shutdown()
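Note: two things happen in the Ray executor hunks. First, `shutdown()` gains a `terminate_ray` flag that makes worker termination idempotent. Second, both `__del__` methods that called `shutdown()` are deleted, presumably because destructor timing at interpreter exit is unreliable and an implicit call could race the explicit one. A self-contained sketch of the guard pattern:

```python
# The idempotency pattern from shutdown(): a flag set once initialization
# succeeds and read back with getattr(), so repeated calls terminate the
# workers only once and a partially constructed object is safe to shut down.

class WorkerStub:
    def terminate(self) -> None:
        print("worker terminated")

class RayExecutorSketch:
    def __init__(self) -> None:
        self.workers = [WorkerStub(), WorkerStub()]
        self.terminate_ray = True  # only set once construction succeeded

    def shutdown(self) -> None:
        # getattr's default covers objects whose __init__ never finished.
        if getattr(self, "terminate_ray", False):
            for worker in self.workers:
                worker.terminate()
            self.terminate_ray = False  # subsequent calls are no-ops

executor = RayExecutorSketch()
executor.shutdown()  # terminates both workers
executor.shutdown()  # safe no-op
```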
vllm/worker/hpu_model_runner.py (18 changes: 4 additions & 14 deletions)

@@ -1894,10 +1894,6 @@ def prepare_model_input(
             is_prompt=is_prompt,
             virtual_engine=virtual_engine)
 
-    def finish_measurements(self):
-        from neural_compressor.torch.quantization import finalize_calibration
-        finalize_calibration(self.model.model)
-
     def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode):
         cfg = (batch_size, seq_len, is_prompt)
         seen = cfg in self.seen_configs

@@ -2265,18 +2261,12 @@ def _make_decode_output(
         return SamplerOutput(sampler_outputs)
 
     def shutdown_inc(self):
-        can_finalize_inc = False
-        from contextlib import suppress
-        with suppress(AttributeError):
-            can_finalize_inc = (self.model_config.quantization == 'inc') and \
-                (self.model.model is not None) and \
-                self.inc_initialized_successfully and \
-                not getattr(self, "_is_inc_finalized", False)
+        can_finalize_inc = (self.model_config.quantization == 'inc') and \
+            (self.model.model is not None) and \
+            self.inc_initialized_successfully and \
+            not getattr(self, "_is_inc_finalized", False)
         if can_finalize_inc:
             from neural_compressor.torch.quantization import (
                 finalize_calibration)
             finalize_calibration(self.model.model)
             self._is_inc_finalized = True
 
     def __del__(self):
         self.shutdown_inc()
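Note: the second hunk drops the `suppress(AttributeError)` wrapper around the `can_finalize_inc` check, presumably because every attribute it reads is now set unconditionally before `shutdown_inc` can run, and it keeps the `_is_inc_finalized` flag so finalization stays idempotent even though `__del__` also calls it. A runnable sketch of that finalize-once pattern, with `finalize_calibration` stubbed out:

```python
# Finalize-once pattern from shutdown_inc(): safe to call explicitly,
# from __del__, or both. finalize_calibration is a stand-in for the
# neural_compressor.torch.quantization function used in the real code.

def finalize_calibration(model: str) -> None:
    print(f"calibration finalized for {model}")

class ModelRunnerSketch:
    def __init__(self, quantization: str) -> None:
        self.quantization = quantization
        self.model = "toy-model"

    def shutdown_inc(self) -> None:
        can_finalize_inc = (self.quantization == "inc"
                            and not getattr(self, "_is_inc_finalized", False))
        if can_finalize_inc:
            finalize_calibration(self.model)
            self._is_inc_finalized = True  # later calls become no-ops

    def __del__(self):
        self.shutdown_inc()

runner = ModelRunnerSketch("inc")
runner.shutdown_inc()  # finalizes once
del runner             # __del__ re-enters shutdown_inc; nothing happens
```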
vllm/worker/hpu_worker.py (3 changes: 0 additions & 3 deletions)

@@ -318,9 +318,6 @@ def _warm_up_model(self) -> None:
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)
 
-    def finish_measurements(self):
-        self.model_runner.finish_measurements()
-
     @property
     def do_metadata_broadcast(self) -> bool:
         return self.parallel_config.tensor_parallel_size > 1