Gemm a8w8 bench #117

Open
wants to merge 58 commits into base: main
Commits (58)
5e408c3
add rocm backend
valarLip Sep 25, 2024
2c55753
slightly speedup weight loading
valarLip Sep 26, 2024
0b582b3
moe_final_v0.6.0_sept24 based fused_moe_kernel
valarLip Sep 27, 2024
d1bd445
add fused_add_rms_norm
valarLip Sep 28, 2024
b66e4bd
enable shuffle/LDS bypass
valarLip Sep 28, 2024
ca690b0
add fused_add_rms_norm
valarLip Sep 28, 2024
f989c1a
enable shuffle/LDS bypass
valarLip Sep 28, 2024
58ece04
added explicit padding
Oct 1, 2024
509f527
add tuned_gemm
valarLip Oct 4, 2024
ad24efc
Merge branch 'vendors/rocm_base_moe' of https://github.com/ROCm/ByteM…
valarLip Oct 4, 2024
43088a3
align default behavior, default enable LDS_BYPASS, disable MOE_PADDING
valarLip Oct 4, 2024
694c9c6
add fused_rope
valarLip Oct 5, 2024
8020097
default enable VLLM_MOE_PADDING
valarLip Oct 5, 2024
3fae254
moe_final_v0.6.0_sept24 based fused_moe_kernel
valarLip Sep 27, 2024
6ffa0ad
add fused_add_rms_norm
valarLip Sep 28, 2024
c4f8b8f
enable shuffle/LDS bypass
valarLip Sep 28, 2024
c7ba512
added explicit padding
Oct 1, 2024
9debce6
add tuned_gemm
valarLip Oct 4, 2024
6196a72
align default behavior, default enable LDS_BYPASS, disable MOE_PADDING
valarLip Oct 4, 2024
76f1333
add fused_rope
valarLip Oct 5, 2024
693ff3b
default enable VLLM_MOE_PADDING
valarLip Oct 5, 2024
7385d8f
Fix test_iter and use sys._exit for profile.
jiaryang Oct 8, 2024
c7ae902
port paged_attn and result&bench ok
Oct 10, 2024
07e9b24
reconstruct kv cache codes
Oct 10, 2024
0a4b64c
slot mappings only calc once
Oct 10, 2024
85c254a
Merge pull request #8 from dummycoderfe/paged_attn_merge_new
shengnxu Oct 10, 2024
e3cc3ef
merge moe_base
Oct 10, 2024
d199c6c
rm useless
Oct 10, 2024
6857b2c
Merge pull request #9 from dummycoderfe/paged_attn_merge_new
valarLip Oct 10, 2024
8388f5e
hot fix pa perf
felixamd Oct 10, 2024
77aafc5
Merge pull request #10 from dummycoderfe/hot_fix_pa_perf
valarLip Oct 10, 2024
0bca731
add moe_sum
valarLip Oct 10, 2024
8fada97
initial add switch for hipgraph (not working yet)
carlushuang Oct 12, 2024
7e3d60d
enable custom_ar
valarLip Oct 15, 2024
2464206
add missing custom_ar code...
valarLip Oct 15, 2024
8818c04
remove runtime cpu2gpu copy
valarLip Oct 15, 2024
3c83649
try hipgraph...
valarLip Oct 16, 2024
84a9550
reduce mem usage for perf test
valarLip Oct 17, 2024
819000b
fuse renorm into topk_softmax
valarLip Oct 19, 2024
1ac69ac
fuse renorm into topk_softmax........ for other case
valarLip Oct 19, 2024
35fac8e
reduce buffer fill
valarLip Oct 20, 2024
0c3fde3
add more MoE tuned config
valarLip Oct 23, 2024
33b1808
add moe int8 kernel
Oct 23, 2024
43fd4ba
remove comments
shengnxu Oct 24, 2024
07facd3
Merge pull request #12 from shengnxu/vendors/rocm_base
shengnxu Oct 24, 2024
dfd2534
fix for new triton version
Oct 24, 2024
a4e8a54
test for moe int8
Oct 24, 2024
363b992
remove redundant files
Oct 24, 2024
e38df42
Merge pull request #13 from shengnxu/vendors/rocm_base
shengnxu Oct 24, 2024
78096be
add ck layernorm_2d backend
valarLip Oct 24, 2024
bfe52ea
code clean
valarLip Oct 24, 2024
731b864
update layernorm test
valarLip Oct 24, 2024
4160ddc
add moe config for 32 E
valarLip Oct 24, 2024
d5bce8f
update config
valarLip Oct 25, 2024
722c0cc
add int8 a8w8 gemm
Oct 28, 2024
5affc61
Merge branch 'vendors/rocm_base' of https://github.com/ROCm/ByteMLPer…
Oct 28, 2024
4b1bcef
code clean up...
valarLip Oct 28, 2024
9f2a214
bench for gemm a8w8
shengnxu Oct 29, 2024
98 changes: 96 additions & 2 deletions byte_infer_perf/llm_perf/backends/GPU/gpu_mp_engine.py
@@ -133,7 +133,7 @@ def signal_handler(signum, frame):
                logger.info(f"rank {local_rank} received signal {signum}, exiting...")
                if hasattr(model, 'finalize_inference'):
                    model.finalize_inference()
-                os._exit(0)
+                sys.exit(0)

            signal.signal(signal.SIGINT, signal_handler)
            signal.signal(signal.SIGTERM, signal_handler)
@@ -195,4 +195,98 @@ def mp_forward(self, *args):
        output_dict = self._output_queues.get(block=True)

        return output_dict


# ROCM_HIPGRAPH modify
class GpuMpEngineWithGraph(GpuMpEngine):
    def __init__(self, world_size: int, model_impl: nn.Module, xpu_cfg) -> None:
        super().__init__(world_size, model_impl, xpu_cfg)
        logger.info("@@@@@@@@@@ GpuMpEngineWithGraph")

    @torch.no_grad()
    def mp_loop_worker(
        self,
        local_rank: int,
        world_size: int,
        input_queue: Queue,
        output_queue: Queue,
        model_impl,
        xpu_config
    ):
        try:
            torch.manual_seed(1)

            # set rank and world_size
            os.environ["RANK"] = str(local_rank)
            os.environ["LOCAL_RANK"] = str(local_rank)
            os.environ["WORLD_SIZE"] = str(world_size)
            os.environ["LOCAL_WORLD_SIZE"] = str(world_size)

            # create and init model based on model_impl and xpu_config
            model = model_impl(xpu_config)
            if hasattr(model, 'init_inference'):
                model.init_inference()

            def signal_handler(signum, frame):
                logger.info(f"rank {local_rank} received signal {signum}, exiting...")
                if hasattr(model, 'finalize_inference'):
                    model.finalize_inference()
                sys.exit(0)

            signal.signal(signal.SIGINT, signal_handler)
            signal.signal(signal.SIGTERM, signal_handler)

            # current rank is ready
            output_queue.put("ready", block=True)
            logger.info(f"{local_rank}/{world_size} rank is ready")

            graph = torch.cuda.CUDAGraph()

            # model process loop
            while True:
                (
                    forward_inputs,
                ) = input_queue.get(block=True)

                # this is the capture phase of graph
                if 'capture' in forward_inputs:
                    graph.reset()  # reset cuda graph each time
                    inputs_dict = self.build_inputs(forward_inputs)
                    # model.forward(inputs_dict)
                    torch.cuda.synchronize()
                    with torch.cuda.graph(graph):
                        model.forward(inputs_dict)
                    torch.cuda.synchronize()
                    continue

                log = forward_inputs.get("log", False)
                workspace = forward_inputs.get("workspace", None)

                forward_inputs["log_file"] = None
                if log and workspace is not None:
                    workspace_dir = workspace / f"rank_{local_rank}"
                    workspace_dir.mkdir(exist_ok=True, parents=True)
                    forward_inputs["log_file"] = open(workspace_dir / "run.log", "w")


                inputs_dict = self.build_inputs(forward_inputs)
                start_time = time.perf_counter_ns()

                # output_dict = model.forward(inputs_dict)
                graph.replay()

                torch.cuda.synchronize()
                end_time = time.perf_counter_ns()
                duration_ms = round((end_time - start_time) / 1e6, 3)
                output_dict = dict()
                output_dict["duration_ms"] = duration_ms

                # TP realization: rank0 send result back to main process
                if local_rank == 0:
                    output_queue.put(output_dict)

                if log and workspace is not None:
                    forward_inputs["log_file"].close()

        except Exception as e:
            logger.exception(f"[BUG] engine _load_and_listen failed, no more requests will be handled. {e}")
            output_queue.put(RuntimeError("[BUG] fatal exception in model subprocess"))
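The worker loop above captures one `model.forward()` into a `torch.cuda.CUDAGraph` when a request carries a `'capture'` key, then only times `graph.replay()` for subsequent requests. The standalone sketch below illustrates that capture/replay pattern outside the engine; it is not part of this PR, and the toy `torch.nn.Linear` model, tensor shapes, and side-stream warm-up loop are illustrative assumptions (warm-up before capture follows PyTorch's documented CUDA-graph recipe).

```python
import time
import torch

# Toy model and static input; CUDA graphs need fixed shapes and reused buffers.
model = torch.nn.Linear(1024, 1024).cuda().half()
static_input = torch.randn(8, 1024, device="cuda", dtype=torch.half)

# Warm up on a side stream so lazy allocations happen before capture.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    for _ in range(3):
        model(static_input)
torch.cuda.current_stream().wait_stream(s)

# Capture phase: record the kernels launched by one forward pass.
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
    static_output = model(static_input)

# Replay phase: re-launch the captured kernels and time them, as the worker loop does.
torch.cuda.synchronize()
start = time.perf_counter_ns()
graph.replay()
torch.cuda.synchronize()
duration_ms = round((time.perf_counter_ns() - start) / 1e6, 3)
print(duration_ms)
```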
51 changes: 51 additions & 0 deletions byte_infer_perf/llm_perf/backends/ROCM/gpu_ckpt_loader.py
@@ -0,0 +1,51 @@
import torch
import torch.distributed as dist

from llm_perf.core.ckpt_loader import CoreCkptLoader

class GpuCkptLoader(CoreCkptLoader):
    def __init__(
        self,
        prefix, model,
        mp_size=1, mp_rank=0,
        ckpt_path: str=""
    ):
        super().__init__(prefix, model, mp_size, mp_rank, ckpt_path)


    def weight_to_device(self, weight : torch.Tensor, non_blocking=False):
        if self.mp_rank == 0:
            weight = weight.cuda(non_blocking=non_blocking)
        else:
            cur_device = torch.cuda.current_device()
            weight = torch.empty_like(weight, device=f"cuda:{cur_device}")
        return weight


    def broadcast_weight(self, key, device='cpu', non_blocking=False):
        if self.mp_rank != 0:
            tensor_shape = self.state_dict[key]["shape"]
            tensor_dtype = self.state_dict[key]["dtype"]
            tensor = torch.empty(tensor_shape, dtype=tensor_dtype)
        else:
            tensor = self.state_dict[key].cpu()
        tensor_gpu = self.weight_to_device(tensor, non_blocking=non_blocking)
        dist.broadcast(tensor_gpu, src=0)
        self.state_dict[key] = tensor_gpu


    def scatter_weight(self, key, dim, split_mode='default', outter=1, device='cpu', non_blocking=False):
        self.broadcast_weight(key, non_blocking=non_blocking)
        weight = self.state_dict[key]

        if split_mode == 'default':
            weight_split = self.split(weight, dim)
        elif split_mode == 'with_outter':
            weight_split = self.with_outter_split(weight, dim, outter)
        elif split_mode == 'split_outter':
            weight_split = self.split(weight, dim, outter)
        else:
            assert False, f"unknown split mode {split_mode}"

        weight_split = [x.contiguous() for x in weight_split]
        self.state_dict[key] = weight_split[self.mp_rank]
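For context, the sketch below reproduces the broadcast-then-shard pattern that `broadcast_weight()` / `scatter_weight()` implement, as a self-contained single-process example (gloo backend, world size 1). It does not instantiate `GpuCkptLoader` itself; the tensor shape, master address/port, and the `'default'` split mode are illustrative assumptions.

```python
import os
import torch
import torch.distributed as dist

# Single-process process group so the example runs anywhere (no GPU required).
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

mp_rank, mp_size = dist.get_rank(), dist.get_world_size()

# Rank 0 owns the full weight; other ranks would allocate an empty buffer of the same shape.
full_weight = torch.randn(8, 16) if mp_rank == 0 else torch.empty(8, 16)
dist.broadcast(full_weight, src=0)          # every rank now holds the full tensor

# 'default' split mode: chunk along `dim` and keep only this rank's shard.
shards = torch.chunk(full_weight, mp_size, dim=0)
local_shard = shards[mp_rank].contiguous()
print(local_shard.shape)

dist.destroy_process_group()
```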
131 changes: 131 additions & 0 deletions byte_infer_perf/llm_perf/backends/ROCM/gpu_inferencer.py
@@ -0,0 +1,131 @@
import os
from typing import Dict, List, Any
from dataclasses import dataclass

from llm_perf.core.generation import GenerateRequest
from llm_perf.core.inferencer import CoreInferencer
from llm_perf.backends.ROCM.gpu_mp_engine import GpuMpEngine
from llm_perf.utils.logger import logger

class GpuInferencer(CoreInferencer):
    def __init__(self, model_impl, xpu_cfg):
        super().__init__()

        self.tp_size = xpu_cfg["tp_size"]
        self.pad_token_id = xpu_cfg["pad_token_id"]
        self.max_batch_size = xpu_cfg["max_batch_size"]
        self.mp_engine = GpuMpEngine(self.tp_size, model_impl, xpu_cfg)

    def prepare_inputs(
        self,
        tasks: List[CoreInferencer.Task],
        **kwargs
    ):
        input_dict = {
            "input_ids": None,
            "position_ids": None,
            "attention_mask": None,
            "all_q_len": None,
            "all_kv_len": None,
            "is_context": None,
            "valid_slot_ids": None
        }

        is_context = kwargs.get("is_context") if "is_context" in kwargs.keys() else False
        valid_slot_ids = kwargs.get("valid_slot_ids") if "valid_slot_ids" in kwargs.keys() else [i for i in range(self.max_batch_size)]


        get_input_logits = False
        for task in tasks:
            if task.request.generate_config.get_input_logits:
                get_input_logits = True
                break

        input_dict["is_context"] = is_context
        input_dict["valid_slot_ids"] = valid_slot_ids
        input_dict["get_input_logits"] = get_input_logits

        if is_context:
            q_len = len(tasks[0].request.input_ids)
            kv_len = len(tasks[0].request.input_ids)

            input_dict["input_ids"] = [
                tasks[0].request.input_ids
            ]
            input_dict["position_ids"] = [
                [i for i in range(q_len)]
            ]
            input_dict["attention_mask"] = [
                [1 for _ in range(q_len)]
            ]
            input_dict["all_q_len"] = [
                q_len
            ]
            input_dict["all_kv_len"] = [
                kv_len
            ]
        else:
            all_input_ids = []
            all_position_ids = []
            all_attention_mask = []
            all_q_len = []
            all_kv_len = []

            for task in tasks:
                q_len = 1
                kv_len = 0

                if task is None:
                    kv_len = 1

                    input_ids = [
                        self.pad_token_id
                    ]
                    position_ids = [
                        0
                    ]
                    attention_mask = [
                        0
                    ]
                else:
                    kv_len = len(task.request.input_ids) + len(task.generate_ids) - 1

                    input_ids = [
                        task.generate_ids[-1]
                    ]
                    position_ids = [
                        kv_len
                    ]
                    attention_mask = [
                        1
                    ]
                all_input_ids.append(input_ids)
                all_position_ids.append(position_ids)
                all_attention_mask.append(attention_mask)
                all_q_len.append(q_len)
                all_kv_len.append(kv_len)

            input_dict["input_ids"] = all_input_ids
            input_dict["position_ids"] = all_position_ids
            input_dict["attention_mask"] = all_attention_mask
            input_dict["all_q_len"] = all_q_len
            input_dict["all_kv_len"] = all_kv_len

        return input_dict


    def infer(
        self,
        tasks: List[CoreInferencer.Task],
        **kwargs
    ):
        input_dict = self.prepare_inputs(tasks, **kwargs)
        output_dict = self.mp_engine.mp_forward(input_dict)

        logits = output_dict["logits"]
        next_token_logits = logits[:, -1, :].contiguous()
        infer_outputs = {
            "logits": logits,
            "last_logits": next_token_logits
        }
        return infer_outputs
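To make the decode-phase packing in `prepare_inputs()` concrete, here is a toy, self-contained sketch of the same logic: each live slot contributes only its last generated token at position `kv_len`, and empty slots are filled with the pad token and masked out. The `Slot` namedtuple stands in for `CoreInferencer.Task` and is purely illustrative.

```python
from collections import namedtuple

# Illustrative stand-in for CoreInferencer.Task (not part of the PR).
Slot = namedtuple("Slot", ["input_ids", "generate_ids"])
pad_token_id = 0

# Slot 0 is a live request (3 prompt tokens, 2 generated tokens); slot 1 is empty.
tasks = [Slot(input_ids=[5, 6, 7], generate_ids=[11, 12]), None]

all_input_ids, all_position_ids, all_attention_mask, all_kv_len = [], [], [], []
for task in tasks:
    if task is None:                    # padded slot: feed pad token, mask it out
        kv_len = 1
        all_input_ids.append([pad_token_id])
        all_position_ids.append([0])
        all_attention_mask.append([0])
    else:                               # live slot: feed only the last generated token
        kv_len = len(task.input_ids) + len(task.generate_ids) - 1
        all_input_ids.append([task.generate_ids[-1]])
        all_position_ids.append([kv_len])
        all_attention_mask.append([1])
    all_kv_len.append(kv_len)

print(all_input_ids)       # [[12], [0]]
print(all_position_ids)    # [[4], [0]]
print(all_attention_mask)  # [[1], [0]]
print(all_kv_len)          # [4, 1]
```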