From 5ae35e1ee101118bf118dbc8fb8f3c923009f57a Mon Sep 17 00:00:00 2001 From: Dominic Sloan-Murphy Date: Wed, 8 Jan 2025 13:45:38 +0000 Subject: [PATCH] refactor(slurmd): improve comments and function names. Simplify try/catch blocks. Add better support for comma-separated values in `get_slurmd_info` --- charms/slurmd/src/charm.py | 23 +++++++---------------- charms/slurmd/src/utils/gpu.py | 30 +++++++----------------------- charms/slurmd/src/utils/machine.py | 21 ++++++++++++--------- 3 files changed, 26 insertions(+), 48 deletions(-) diff --git a/charms/slurmd/src/charm.py b/charms/slurmd/src/charm.py index d5f784e..1e02940 100755 --- a/charms/slurmd/src/charm.py +++ b/charms/slurmd/src/charm.py @@ -334,7 +334,7 @@ def _check_status(self) -> bool: def _reboot_if_required(self, now: bool = False) -> None: """Perform a reboot of the unit if required, e.g. following a driver installation.""" if Path("/var/run/reboot-required").exists(): - logger.info("unit rebooting") + logger.info("rebooting unit %s", self.unit.name) self.unit.reboot(now) @staticmethod @@ -373,34 +373,25 @@ def get_node(self) -> Dict[Any, Any]: """Get the node from stored state.""" slurmd_info = machine.get_slurmd_info() - # Get GPU info and build GRES configuration. gres_info = [] - if gpus := gpu.get_gpus(): + if gpus := gpu.get_all_gpu(): for model, devices in gpus.items(): # Build gres.conf line for this GPU model. if len(devices) == 1: device_suffix = next(iter(devices)) else: - # For multi-gpu setups, "File" uses ranges and strides syntax, - # e.g. File=/dev/nvidia[0-3], File=/dev/nvidia[0,2-3] + # Get numeric range of devices associated with this GRES resource. See: + # https://slurm.schedmd.com/gres.conf.html#OPT_File device_suffix = self._ranges_and_strides(sorted(devices)) gres_line = { - # NodeName included in node_parameters. "Name": "gpu", "Type": model, "File": f"/dev/nvidia{device_suffix}", } - # Append to list of GRES lines for all models gres_info.append(gres_line) - - # Add to node parameters to ensure included in slurm.conf. - slurm_conf_gres = f"gpu:{model}:{len(devices)}" - try: - # Add to existing Gres line. - slurmd_info["Gres"].append(slurm_conf_gres) - except KeyError: - # Create a new Gres entry if none present - slurmd_info["Gres"] = [slurm_conf_gres] + slurmd_info["Gres"] = cast(list[str], slurmd_info.get("Gres", [])) + [ + f"gpu:{model}:{len(devices)}" + ] node = { "node_parameters": { diff --git a/charms/slurmd/src/utils/gpu.py b/charms/slurmd/src/utils/gpu.py index 7b02759..9d6daf5 100644 --- a/charms/slurmd/src/utils/gpu.py +++ b/charms/slurmd/src/utils/gpu.py @@ -22,7 +22,7 @@ _logger = logging.getLogger(__name__) -class GPUInstallError(Exception): +class GPUOpsError(Exception): """Exception raised when a GPU driver installation operation failed.""" @@ -36,7 +36,7 @@ def __init__(self): try: apt.add_package(pkgs, update_cache=True) except (apt.PackageNotFoundError, apt.PackageError) as e: - raise GPUInstallError(f"failed to install {pkgs} reason: {e}") + raise GPUOpsError(f"failed to install {pkgs} reason: {e}") # ubuntu-drivers requires apt_pkg for package operations self._detect = _import("UbuntuDrivers.detect") @@ -54,9 +54,8 @@ def _get_linux_modules_metapackage(self, driver) -> str: """ return self._detect.get_linux_modules_metapackage(self._apt_pkg.Cache(None), driver) - def system_packages(self) -> list: + def system_packages(self) -> list[str]: """Return a list of GPU drivers and kernel module packages for this node.""" - # Detect only GPGPU drivers. 
Not general purpose graphics drivers. packages = self._system_gpgpu_driver_packages() # Gather list of driver and kernel modules to install. @@ -76,11 +75,6 @@ def system_packages(self) -> list: # Add to list of packages to install install_packages += [driver_metapackage, modules_metapackage] - # TODO: do we want to check for nvidia here and add nvidia-fabricmanager-535 libnvidia-nscq-535 in case of nvlink? This is suggested as a manual step at https://documentation.ubuntu.com/server/how-to/graphics/install-nvidia-drivers/#optional-step. If so, how do we get the version number "-535" robustly? - - # TODO: what if drivers install but do not require a reboot? Should we "modprobe nvidia" manually? Just always reboot regardless? - - # Filter out any empty results as returning return [p for p in install_packages if p] @@ -88,7 +82,7 @@ def autoinstall() -> None: """Autodetect available GPUs and install drivers. Raises: - GPUInstallError: Raised if error is encountered during package install. + GPUOpsError: Raised if error is encountered during package install. """ _logger.info("detecting GPUs and installing drivers") detector = GPUDriverDetector() @@ -102,10 +96,10 @@ def autoinstall() -> None: try: apt.add_package(install_packages) except (apt.PackageNotFoundError, apt.PackageError) as e: - raise GPUInstallError(f"failed to install packages {install_packages}. reason: {e}") + raise GPUOpsError(f"failed to install packages {install_packages}. reason: {e}") -def get_gpus() -> dict: +def get_all_gpu() -> dict[str, set[int]]: """Get the GPU devices on this node. Returns: @@ -119,14 +113,12 @@ def get_gpus() -> dict: """ gpu_info = {} - # Return immediately if pynvml not installed... try: pynvml = _import("pynvml") except ModuleNotFoundError: _logger.info("cannot gather GPU info: pynvml module not installed") return gpu_info - # ...or Nvidia drivers not loaded. try: pynvml.nvmlInit() except pynvml.NVMLError as e: @@ -135,7 +127,6 @@ def get_gpus() -> dict: return gpu_info gpu_count = pynvml.nvmlDeviceGetCount() - # Loop over all detected GPUs, gathering info by model. for i in range(gpu_count): handle = pynvml.nvmlDeviceGetHandleByIndex(i) @@ -146,15 +137,8 @@ def get_gpus() -> dict: model = pynvml.nvmlDeviceGetName(handle) model = "_".join(model.split()).lower() - # Number for device path, e.g. if device is /dev/nvidia0, returns 0 minor_number = pynvml.nvmlDeviceGetMinorNumber(handle) - - try: - # Add minor number to set of existing numbers for this model. - gpu_info[model].add(minor_number) - except KeyError: - # This is the first time we've seen this model. Create a new entry. - gpu_info[model] = {minor_number} + gpu_info[model] = gpu_info.get(model, set()) | {minor_number} pynvml.nvmlShutdown() return gpu_info diff --git a/charms/slurmd/src/utils/machine.py b/charms/slurmd/src/utils/machine.py index b448e84..0e9f381 100644 --- a/charms/slurmd/src/utils/machine.py +++ b/charms/slurmd/src/utils/machine.py @@ -16,13 +16,16 @@ import logging import subprocess -from typing import Any, Dict _logger = logging.getLogger(__name__) -def get_slurmd_info() -> Dict[str, Any]: - """Get machine info as reported by `slurmd -C`.""" +def get_slurmd_info() -> dict[str, str | list[str]]: + """Get machine info as reported by `slurmd -C`. 
+
+    For details see:
+    https://slurm.schedmd.com/slurmd.html
+    """
     try:
         r = subprocess.check_output(["slurmd", "-C"], text=True).strip()
     except subprocess.CalledProcessError as e:
@@ -31,10 +34,11 @@ def get_slurmd_info() -> dict[str, str | list[str]]:
 
     info = {}
     for opt in r.split()[:-1]:
-        key, value = opt.split("=")
-        # Split comma-separated lists, e.g. Gres=gpu:model_a:1,gpu:model_b:1
-        if "," in value:
-            info[key] = value.split(",")
-        else:
-            info[key] = value
+        k, v = opt.split("=")
+        # Only Gres holds a comma-separated list, e.g. Gres=gpu:model_a:1,gpu:model_b:1
+        if k == "Gres":
+            info[k] = v.split(",")
+            continue
+
+        info[k] = v
     return info
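
Illustrative sketch (not part of the patch): the reworked `get_slurmd_info` now splits only the `Gres` value on commas and leaves every other field as a plain string. The self-contained Python snippet below mirrors that parsing loop against an assumed `slurmd -C` line; `parse_slurmd_line`, the node name, and the hardware counts are hypothetical stand-ins for illustration only, not code or output taken from the charm.

# Hypothetical, standalone sketch mirroring the patched parsing logic in
# get_slurmd_info(); it reads a hard-coded sample instead of running `slurmd -C`.
def parse_slurmd_line(line: str) -> dict[str, str | list[str]]:
    """Parse one `slurmd -C`-style line, splitting only Gres on commas."""
    info: dict[str, str | list[str]] = {}
    # Drop the trailing UpTime token, as the charm does with r.split()[:-1].
    for opt in line.split()[:-1]:
        k, v = opt.split("=")
        if k == "Gres":
            info[k] = v.split(",")
            continue
        info[k] = v
    return info


if __name__ == "__main__":
    # Assumed example output; a real node reports its own values.
    sample = (
        "NodeName=example-node CPUs=8 Boards=1 SocketsPerBoard=1 "
        "CoresPerSocket=4 ThreadsPerCore=2 RealMemory=31848 "
        "Gres=gpu:model_a:1,gpu:model_b:1 UpTime=0-00:10:42"
    )
    print(parse_slurmd_line(sample))
    # {'NodeName': 'example-node', ..., 'Gres': ['gpu:model_a:1', 'gpu:model_b:1']}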