Merge branch 'habana_main' into libint/interleave_slidingwindow

HabanaAI · Jan 24, 2025 · bddb778 · bddb778
2 parents 13255ea + 8c4b41e
commit bddb778
Show file tree

Hide file tree

Showing 12 changed files with 456 additions and 196 deletions.
diff --git a/requirements-hpu.txt b/requirements-hpu.txt
@@ -3,7 +3,7 @@
 
 # Dependencies for HPU code
 ray
-triton
+triton==3.1.0
 pandas
 tabulate
 setuptools>=61

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
@@ -407,7 +407,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             'or equal to the number of GPUs available, "mp" will be used to '
             'keep processing on a single host. Otherwise, this will default '
             'to "ray" if Ray is installed and fail otherwise. Note that tpu '
-            'and hpu only support Ray for distributed inference.')
+            'only support Ray for distributed inference.')
 
         parser.add_argument(
             '--worker-use-ray',

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
@@ -1284,6 +1284,10 @@ def _run_engine(
 
         if use_tqdm:
             pbar.close()
+
+        # Make sure that all workers are finished.
+        self.llm_engine.stop_remote_worker_execution_loop()
+
         # Sort the outputs by request ID.
         # This is necessary because some requests may be finished earlier than
         # its previous requests.

diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py
@@ -91,8 +91,12 @@ def _init_executor(self) -> None:
                           max_parallel_loading_workers)
         self.driver_exec_model = make_async(self.driver_worker.execute_model)
         self.pp_locks: Optional[List[asyncio.Lock]] = None
+        self.shutdown_workers = True
 
     def shutdown(self):
+        if getattr(self, 'shutdown_workers', False):
+            self._run_workers("shutdown")
+            self.shutdown_workers = False
         if (worker_monitor := getattr(self, "worker_monitor",
                                       None)) is not None:
             worker_monitor.close()

diff --git a/vllm/executor/multiproc_hpu_executor.py b/vllm/executor/multiproc_hpu_executor.py
diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py
@@ -98,7 +98,17 @@ def _init_executor(self) -> None:
             self.driver_exec_method = make_async(
                 self.driver_worker.execute_method)
 
+        self.shutdown_workers = True
+        self.terminate_ray = True
+
     def shutdown(self) -> None:
+        if getattr(self, 'shutdown_workers', False):
+            self._run_workers("shutdown")
+            self.shutdown_workers = False
+        if getattr(self, 'terminate_ray', False):
+            for worker in self.workers:
+                worker.__ray_terminate__.remote()
+            self.terminate_ray = False
         if hasattr(self, "forward_dag") and self.forward_dag is not None:
             self.forward_dag.teardown()
             import ray

diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py
@@ -39,6 +39,8 @@ def _init_executor(self) -> None:
         self.collective_rpc("init_device")
         self.collective_rpc("load_model")
 
+        self.shutdown_worker = True
+
     def collective_rpc(self,
                        method: Union[str, Callable],
                        timeout: Optional[float] = None,
@@ -54,6 +56,11 @@ def check_health(self) -> None:
         # it's running.
         return
 
+    def shutdown(self):
+        if getattr(self, 'shutdown_worker', False):
+            self.collective_rpc("shutdown")
+            self.shutdown_worker = False
+
 
 UniProcExecutorAsync = UniProcExecutor
 
@@ -112,6 +119,8 @@ def _init_executor(self) -> None:
         self.collective_rpc("init_device")
         self.collective_rpc("load_model")
 
+        self.shutdown_worker = True
+
     def determine_num_available_blocks(self) -> Tuple[int, int]:
         """
         Determine the number of available KV blocks.

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
@@ -178,9 +178,7 @@ def forward_hpu(
                                           'not supported on HPU')
         assert topk_group is None, 'topk_group is not supported on HPU'
         if layer is not None:
-            return layer.hpu_static_fused_moe(x, layer.w13_weight,
-                                              layer.w2_weight, router_logits,
-                                              top_k)
+            return layer.hpu_fused_moe(x, router_logits, top_k)
 
     def forward_cpu(
         self,
@@ -300,15 +298,11 @@ def __init__(
         self.topk_group = topk_group
         self.custom_routing_function = custom_routing_function
         if is_hpu:
-            from vllm_hpu_extension.ops import DynamicFusedMOE, StaticFusedMOE
+            from vllm_hpu_extension.ops import DynamicFusedMOE
+            self.hpu_fused_moe = DynamicFusedMOE(self.num_experts)
 
-            from vllm.model_executor.layers.quantization.inc import INCConfig
-            selected_fused_moe = (StaticFusedMOE if isinstance(
-                quant_config, INCConfig) else DynamicFusedMOE)
-            self.hpu_static_fused_moe = selected_fused_moe(self.num_experts)
         self.scoring_func = scoring_func
         self.e_score_correction_bias = e_score_correction_bias
-
         if self.scoring_func != "softmax" and not self.use_grouped_topk:
             raise ValueError("Only softmax scoring function is supported for "
                              "non-grouped topk.")
@@ -404,10 +398,8 @@ def _load_w13(self,
         expert_data.copy_(loaded_weight)
 
         if is_hpu:
-            from vllm_hpu_extension.ops import StaticFusedMOE
-            if isinstance(self.hpu_static_fused_moe, StaticFusedMOE):
-                self.hpu_static_fused_moe.w13_list[expert_id].set_weight(
-                    orig_exp_data)
+            self.hpu_fused_moe.MoeOp.w13_list[expert_id].set_weight(
+                orig_exp_data)
 
     def _load_w2(self,
                  expert_data: torch.Tensor,
@@ -426,10 +418,7 @@ def _load_w2(self,
         # w2, down_proj: Load into only logical weight of w2.
         expert_data.copy_(loaded_weight)
         if is_hpu:
-            from vllm_hpu_extension.ops import StaticFusedMOE
-            if isinstance(self.hpu_static_fused_moe, StaticFusedMOE):
-                self.hpu_static_fused_moe.w2_list[expert_id].set_weight(
-                    expert_data)
+            self.hpu_fused_moe.MoeOp.w2_list[expert_id].set_weight(expert_data)
 
     def _load_single_value(self, param: torch.nn.Parameter,
                            loaded_weight: torch.Tensor, expert_id: int):