Skip to content

Commit

Permalink
Merge branch 'habana_main' into libint/interleave_slidingwindow
Browse files Browse the repository at this point in the history
  • Loading branch information
michalkuligowski authored Jan 24, 2025
2 parents 13255ea + 8c4b41e commit bddb778
Show file tree
Hide file tree
Showing 12 changed files with 456 additions and 196 deletions.
2 changes: 1 addition & 1 deletion requirements-hpu.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

# Dependencies for HPU code
ray
triton
triton==3.1.0
pandas
tabulate
setuptools>=61
Expand Down
2 changes: 1 addition & 1 deletion vllm/engine/arg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
'or equal to the number of GPUs available, "mp" will be used to '
'keep processing on a single host. Otherwise, this will default '
'to "ray" if Ray is installed and fail otherwise. Note that tpu '
'and hpu only support Ray for distributed inference.')
'only support Ray for distributed inference.')

parser.add_argument(
'--worker-use-ray',
Expand Down
4 changes: 4 additions & 0 deletions vllm/entrypoints/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1284,6 +1284,10 @@ def _run_engine(

if use_tqdm:
pbar.close()

# Make sure that all workers are finished.
self.llm_engine.stop_remote_worker_execution_loop()

# Sort the outputs by request ID.
# This is necessary because some requests may be finished earlier than
# its previous requests.
Expand Down
4 changes: 4 additions & 0 deletions vllm/executor/mp_distributed_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,12 @@ def _init_executor(self) -> None:
max_parallel_loading_workers)
self.driver_exec_model = make_async(self.driver_worker.execute_model)
self.pp_locks: Optional[List[asyncio.Lock]] = None
self.shutdown_workers = True

def shutdown(self):
if getattr(self, 'shutdown_workers', False):
self._run_workers("shutdown")
self.shutdown_workers = False
if (worker_monitor := getattr(self, "worker_monitor",
None)) is not None:
worker_monitor.close()
Expand Down
57 changes: 0 additions & 57 deletions vllm/executor/multiproc_hpu_executor.py

This file was deleted.

10 changes: 10 additions & 0 deletions vllm/executor/ray_distributed_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,17 @@ def _init_executor(self) -> None:
self.driver_exec_method = make_async(
self.driver_worker.execute_method)

self.shutdown_workers = True
self.terminate_ray = True

def shutdown(self) -> None:
if getattr(self, 'shutdown_workers', False):
self._run_workers("shutdown")
self.shutdown_workers = False
if getattr(self, 'terminate_ray', False):
for worker in self.workers:
worker.__ray_terminate__.remote()
self.terminate_ray = False
if hasattr(self, "forward_dag") and self.forward_dag is not None:
self.forward_dag.teardown()
import ray
Expand Down
9 changes: 9 additions & 0 deletions vllm/executor/uniproc_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ def _init_executor(self) -> None:
self.collective_rpc("init_device")
self.collective_rpc("load_model")

self.shutdown_worker = True

def collective_rpc(self,
method: Union[str, Callable],
timeout: Optional[float] = None,
Expand All @@ -54,6 +56,11 @@ def check_health(self) -> None:
# it's running.
return

def shutdown(self):
if getattr(self, 'shutdown_worker', False):
self.collective_rpc("shutdown")
self.shutdown_worker = False


UniProcExecutorAsync = UniProcExecutor

Expand Down Expand Up @@ -112,6 +119,8 @@ def _init_executor(self) -> None:
self.collective_rpc("init_device")
self.collective_rpc("load_model")

self.shutdown_worker = True

def determine_num_available_blocks(self) -> Tuple[int, int]:
"""
Determine the number of available KV blocks.
Expand Down
23 changes: 6 additions & 17 deletions vllm/model_executor/layers/fused_moe/layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,7 @@ def forward_hpu(
'not supported on HPU')
assert topk_group is None, 'topk_group is not supported on HPU'
if layer is not None:
return layer.hpu_static_fused_moe(x, layer.w13_weight,
layer.w2_weight, router_logits,
top_k)
return layer.hpu_fused_moe(x, router_logits, top_k)

def forward_cpu(
self,
Expand Down Expand Up @@ -300,15 +298,11 @@ def __init__(
self.topk_group = topk_group
self.custom_routing_function = custom_routing_function
if is_hpu:
from vllm_hpu_extension.ops import DynamicFusedMOE, StaticFusedMOE
from vllm_hpu_extension.ops import DynamicFusedMOE
self.hpu_fused_moe = DynamicFusedMOE(self.num_experts)

from vllm.model_executor.layers.quantization.inc import INCConfig
selected_fused_moe = (StaticFusedMOE if isinstance(
quant_config, INCConfig) else DynamicFusedMOE)
self.hpu_static_fused_moe = selected_fused_moe(self.num_experts)
self.scoring_func = scoring_func
self.e_score_correction_bias = e_score_correction_bias

if self.scoring_func != "softmax" and not self.use_grouped_topk:
raise ValueError("Only softmax scoring function is supported for "
"non-grouped topk.")
Expand Down Expand Up @@ -404,10 +398,8 @@ def _load_w13(self,
expert_data.copy_(loaded_weight)

if is_hpu:
from vllm_hpu_extension.ops import StaticFusedMOE
if isinstance(self.hpu_static_fused_moe, StaticFusedMOE):
self.hpu_static_fused_moe.w13_list[expert_id].set_weight(
orig_exp_data)
self.hpu_fused_moe.MoeOp.w13_list[expert_id].set_weight(
orig_exp_data)

def _load_w2(self,
expert_data: torch.Tensor,
Expand All @@ -426,10 +418,7 @@ def _load_w2(self,
# w2, down_proj: Load into only logical weight of w2.
expert_data.copy_(loaded_weight)
if is_hpu:
from vllm_hpu_extension.ops import StaticFusedMOE
if isinstance(self.hpu_static_fused_moe, StaticFusedMOE):
self.hpu_static_fused_moe.w2_list[expert_id].set_weight(
expert_data)
self.hpu_fused_moe.MoeOp.w2_list[expert_id].set_weight(expert_data)

def _load_single_value(self, param: torch.nn.Parameter,
loaded_weight: torch.Tensor, expert_id: int):
Expand Down
Loading

0 comments on commit bddb778

Please sign in to comment.