diff --git a/benchmarks/nightly/run.py b/benchmarks/nightly/run.py
index 55252965..8d9f32d7 100644
--- a/benchmarks/nightly/run.py
+++ b/benchmarks/nightly/run.py
@@ -66,6 +66,7 @@ def setup_tritonbench_cwd():
 
 def reduce(run_timestamp, output_dir, output_files, args):
     """aggregate all op benchmark csvs into json file"""
+    from tritonbench.utils.gpu_utils import get_nvidia_gpu_states, has_nvidia_smi
     from tritonbench.utils.path_utils import REPO_PATH
     from tritonbench.utils.run_utils import get_github_env, get_run_env
 
@@ -80,6 +81,13 @@ def reduce(run_timestamp, output_dir, output_files, args):
         "env": get_run_env(run_timestamp, repo_locs),
         "metrics": {},
     }
+    if has_nvidia_smi():
+        aggregated_obj.update(
+            {
+                "nvidia_gpu_states": get_nvidia_gpu_states(),
+            }
+        )
+
     # Collecting GitHub environment variables when running in CI environment
     if args.ci:
         aggregated_obj["github"] = get_github_env()
diff --git a/tritonbench/utils/gpu_utils.py b/tritonbench/utils/gpu_utils.py
index 70ddc668..7bbd1f10 100644
--- a/tritonbench/utils/gpu_utils.py
+++ b/tritonbench/utils/gpu_utils.py
@@ -2,7 +2,7 @@
 import os
 import subprocess
 from contextlib import contextmanager
-from typing import Dict
+from typing import Dict, List, Optional
 
 # NVIDIA A100 GPU Spec:
 # https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf
@@ -141,3 +141,65 @@ def gpu_lockdown(enabled=True):
         if enabled:
             gpu_name = _get_gpu_name()
             _reset_clock(gpu_name)
+
+
+def _nvidia_smi_query(query: str, device_ids: Optional[List[str]] = None) -> List[str]:
+    if device_ids:
+        device_ids = [str(id) for id in device_ids]
+        device_ids = ",".join(device_ids)
+    id_selector = f"-i {device_ids}" if device_ids else ""
+    values = (
+        subprocess.check_output(
+            f'nvidia-smi --query-gpu="{query}" {id_selector} --format=csv,noheader,nounits',
+            shell=True,
+        )
+        .strip()
+        .decode()
+        .split("\n")
+    )
+    return values
+
+
+def get_nvidia_gpu_states() -> Dict[str, str]:
+    results = {}
+    device_ids = os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")
+    # Query power, temperature, clock, and throttle metrics for all visible devices
+    raw_metrics = _nvidia_smi_query(
+        "power.draw.average,power.draw.instant,temperature.gpu,temperature.memory,"
+        "clocks.current.sm,clocks.current.memory,"
+        "clocks_throttle_reasons.hw_thermal_slowdown,clocks_throttle_reasons.sw_thermal_slowdown",
+        device_ids,
+    )
+    results["power.draw.average"] = ",".join(
+        metric.split(",")[0].strip() for metric in raw_metrics
+    )
+    results["power.draw.instant"] = ",".join(
+        metric.split(",")[1].strip() for metric in raw_metrics
+    )
+    results["temperature.gpu"] = ",".join(
+        metric.split(",")[2].strip() for metric in raw_metrics
+    )
+    results["temperature.memory"] = ",".join(
+        metric.split(",")[3].strip() for metric in raw_metrics
+    )
+    results["clocks.current.sm"] = ",".join(
+        metric.split(",")[4].strip() for metric in raw_metrics
+    )
+    results["clocks.current.memory"] = ",".join(
+        metric.split(",")[5].strip() for metric in raw_metrics
+    )
+    results["hw_thermal_slowdown"] = ",".join(
+        metric.split(",")[6].strip() for metric in raw_metrics
+    )
+    results["sw_thermal_slowdown"] = ",".join(
+        metric.split(",")[7].strip() for metric in raw_metrics
+    )
+    return results
+
+
+def has_nvidia_smi() -> bool:
+    try:
+        subprocess.check_output("nvidia-smi")
+        return True
+    except (subprocess.SubprocessError, FileNotFoundError):
+        return False
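
For reference, a minimal sketch of how the new gpu_utils helpers could be exercised on their own, assuming tritonbench is importable on the host; the call site in reduce() above is the authoritative usage, and the sample reading in the comment is illustrative only since real values depend on the host GPUs:

    from tritonbench.utils.gpu_utils import get_nvidia_gpu_states, has_nvidia_smi

    if has_nvidia_smi():
        states = get_nvidia_gpu_states()
        # Each value is a comma-separated string with one entry per device in
        # CUDA_VISIBLE_DEVICES, e.g. states["temperature.gpu"] might look like
        # "41,39" on a two-GPU host.
        for key, value in states.items():
            print(f"{key}: {value}")
    else:
        print("nvidia-smi not available; GPU states will not be collected")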