From 83855eb683c40ff71990bc43722f71768ee7cf5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Juli=C3=A1n=20Espina?= Date: Mon, 9 Sep 2024 17:30:43 -0600 Subject: [PATCH] feat(slurmd): implement option to install ROCm drivers Leaving this as an option that can be set by the user for now. We can investigate in the future how to automatically install the drivers. --- charms/slurmd/charmcraft.yaml | 14 ++++++ charms/slurmd/src/charm.py | 12 ++++- charms/slurmd/src/constants.py | 55 ++++++++++++++++++++++ charms/slurmd/src/slurmd_ops.py | 81 ++++++++++++++++++++++++++++++++- 4 files changed, 160 insertions(+), 2 deletions(-) diff --git a/charms/slurmd/charmcraft.yaml b/charms/slurmd/charmcraft.yaml index a3f5a00..8de70b8 100644 --- a/charms/slurmd/charmcraft.yaml +++ b/charms/slurmd/charmcraft.yaml @@ -78,6 +78,20 @@ config: $ juju config slurmd nhc-conf="$(cat extra-nhc.conf)" ``` + gpu: + default: "" + type: string + description: > + Type of GPU driver to install. + + Available options: [`amd`]. + + Example usage: + ```bash + $ juju config slurmd gpu=amd + ``` + + actions: node-configured: description: Remove a node from DownNodes when the reason is `New node`. diff --git a/charms/slurmd/src/charm.py b/charms/slurmd/src/charm.py index 72f208f..a5907a1 100755 --- a/charms/slurmd/src/charm.py +++ b/charms/slurmd/src/charm.py @@ -93,8 +93,18 @@ def _on_install(self, event: InstallEvent) -> None: self._check_status() - def _on_config_changed(self, event: ConfigChangedEvent) -> None: + def _on_config_changed(self, event: ConfigChangedEvent) -> None: # noqa: C901 """Handle charm configuration changes.""" + if gpu := self.model.config.get("gpu"): + if gpu == "amd": + if not self._slurmd_manager.rocm_manager.is_installed(): + self._slurmd_manager.rocm_manager.install() + self.unit.reboot(now=True) + self._slurmd_manager.rocm_manager.post_install() + else: + self.unit.status = BlockedStatus("Invalid value for option `gpu`") + return + if nhc_conf := self.model.config.get("nhc-conf"): if nhc_conf != self._stored.nhc_conf: self._stored.nhc_conf = nhc_conf diff --git a/charms/slurmd/src/constants.py b/charms/slurmd/src/constants.py index 21b428f..4973f48 100644 --- a/charms/slurmd/src/constants.py +++ b/charms/slurmd/src/constants.py @@ -40,3 +40,58 @@ =cs1s -----END PGP PUBLIC KEY BLOCK----- """ + +ROCM_PPA_KEY = """ +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v1 + +mQINBFefsSABEADmVqQyRi5bcUs/eG8mnKLdY+V+xuKuHLuujlXinSaMFRO640Md +C2HNYLSd58Z8cB1rKfiN639CZp+SkDWq60cFXDCcX9djT0JmBzsTD/gwoMr16tMY +O+Z2mje2pEYgDJdmYrephhXn29BfebW1IQKdA+4C7l675mJ/T8yVMUNXC0hqfGDA +h1MJUQy/lz1S2fGdjCKX0PiYOnCOyhNa7aTpw9PkZWgEa/s4BhplFZxvLohrCcf6 +ks0gUITHfeEhJvj2KurRfL68DgFifGnG+/fsMHgW1Xp19GsnIVaoh6cV7/iFHhrb +6YHI1fdOq/mwOfG8mJnXmDXC/o24Q7mRRwvoJcsT0j+thRirs8trV01mKY+7Hxd2 +CamWttibo062pjWN2aEUMPmEU2kmGOupsZtlpqn6SGCd2+6maOPMNEq/F0EWxhul +q6mgezVb8pvJ3bwvph2/lMSgfT9fHs6UIh4i/3rnA5/JaejFonlnS9xEuglKjklj +UoikSPBOwjvoPW2u99WCflURFSXVvuk7Ci+XkbVPIZyD6gFJjeY02Ic5MAv5tj/z +0fpgr/CfwEllms+z7qz768xRweA0kmPTTARdufVTna6EV3K3njxvCIIfnrp1cF6S +e3VrREd98gO0Rmzy74UFqkXl9Tb/+UILx1qVRmOBinwacKGqzo+k9jPUKQARAQAB +tChBTUQgTUxTRSBEZXZPcHMgPGRsLk1MU0UuRGV2T3BzQGFtZC5jb20+iQI+BBMB +AgAoAhsDBgsJCAcDAgYVCAIJCgsEFgIDAQIeAQIXgAUCYfuRkwUJE8Hh5wAKCRCT +hrSKGmk8XI1AEACSJLVGHCLJOOKz9fbUR4KWl7Gpv0RWccwxhH01jNZTSXUCEnKA +2KYmaqFvrT5szxWILobmCNYtAlbdkpUfb0mMaF3UtTu+1UMOw2ExzxHw1FyA+z6d +vLqDKXLldsOFUfojDUhD5cK6uvONPc1orCf/4ve6wnRG838bAzb4VrFR64IxfPjx +NukH+jo2nEXNpnNv44DEiq65CcObaPuwAVBFnRYD/ByPO4ZArxFXqNzHRxpoZkKv +iwzhbPG4cirioqzRR9y2SsC+a2sO4a/jH0wOL2+n4L86xShYcuCBxXvS/AwrV/aO +JxKOfAUV4VQegAOQz64L+iz7PslNSTILJGdvGcC5Ckgpo6evdWBT7KdGXhzf4S1f +wZjYyP9sfQa7LxqyrkLHZqYt4If4Jmukx7cApBYp1nPnuCQrLU6D4Arq0ZVWQuNV +hbABLeqwdVQcX+vG/Kr/ZC+Vkv3Z8oElwVGAAQ6HNXr/u8ud2bu6iNJ5mcQbM1HD +KTNt5LUrk0p588a8dk0/TyC5xeKSv51iNL+aOVaTr0pRwgaHtEVar2i0FPC1mkr4 +1hhIDddx8WLoUt/52f1juyr/4CpL1M5f1cbMVjV6i0kqIEx/hxrryc+fZZQT5R4M +vysxcsh8ttgpABG5vzz2rLOCanmQ4eDdmlugzn/u0ngoDdnC0gEfnVVutLkCDQRX +n7EgARAAlsWVKSOQicuBxBlo3U5tre5whSyAOWHuy6/heGwCkGssTahbIL8pRwOL +5nKJCPCKKJ4YYoZ+Jzer9WTsDRZU/zpQXK9C5WdfF6DN/Fai3lqhgeDDVyF0hUDr +NQigm/w66JEYTGtMcC5PnYv7S6Zrn9WN4anv9n5thNwfsqxpbbg6sAQ2aLHLsW96 +myQE9v1s0YoSZYc7rFYBwszE+tFX0kLlyBYSRVns/USQifu66RObO706d8DHp6Ro +vO6WgsTu+0RR2FEUabBx1q6iKe1cqK0FYtWd8tXCpqQBm0zGC6UwTp4Z4GMCX2Pk +3xAMmrItW5kPKCANB+P/8ZoOoZLIX5Fr9axQ496lUh0ZDhOACewJfj9Szk9GN5rq ++2QKnRepatevGBVaN0lCAEwg2q9/9xmrT6CixFrbnw2T6mWHM3jQrvduqmC0c1Cd +uMZBGDKSpjouaN0UKtC+udwWiY7w452pcjCnUjzjk7tR1IarSCnLLYeb+MDCK83M +CFH60SmBfdqjRiTiLas34KSKNnmbfUfrTYswf0Oed/qXAUSlYOCmWl4sV8n+Ebpy +XfY80/fzu95RbpMEZMhUTRtvr64O5jaWM/lFnubnegGTW3Bk/fBR2VRsBx56ZHlc +JH23f6IREjQ1x4B2UsINYfyYpmzb+R4qpMzycBVHv9ipiYQsQ8sAEQEAAYkCJQQY +AQoADwIbDAUCYfuRtwUJE8HiEAAKCRCThrSKGmk8XMAcEACd0jYXjnu7qoEY4U9Q +47X2SeJmWsuTavCrU5AWxjYwWd0mtDqK8EynxDPq7UFs+8+OukqrE++p0bfBbDl9 +TwnwmSSdizAZriHMSgeg9GR5KVL4mreNhFQdk/6mTFdlRhi5s7ZuvPayLSMIAWaj +ET5gFMeO1B/ABSpaKEZwQjRcXrto/hCUJ++7qoosblhcgwX7fiqZZbMxcoCEQIQQ +7ZasLxpVtaeDVfetp2zO5F0/e3D/sNbvBrlDofSt6D5V2cmKjLqONFVc6JrzSNeK +k9Gn8UVzAKfRfLaQyDaoFV0MbBf3q111UQQPkvwZYp0lPT6t2/G8zoubwFhHsM31 +K5ZBbt0384hI9RJITo9/krXVXLYFeCLcoPKn/fGWgAwyYAYr6C7JcocxTNUyCd1I +AVg4SO/JuC3NWFQK5LhknN/gJkFlLZdB2cWqu9dDIkx1cHXThaM2n/7GSxv7fzrI +Br1jhZjUPWJ2iOd8iHgVEkIEvZql8z+huSxcNemodEN1emmUUoIyY3Fh0lJmozDt +ZPATk3iPpksOApsDVhWXP96RjTYEozYCxgTxCnk+kX/iJIlt53BPNWm9HMTcmtDI +v3s7OEcw0DN3U2VKcL9Q4Sg3uNfhwQsw/xBJaxAHQn5lN/8t0eLt+U653ooEEr0o +ta5TfPumStSQ1UjP8pPny4l+JQ== +=UOE+ +-----END PGP PUBLIC KEY BLOCK----- +""" diff --git a/charms/slurmd/src/slurmd_ops.py b/charms/slurmd/src/slurmd_ops.py index 3f1afc5..6a0f3b8 100644 --- a/charms/slurmd/src/slurmd_ops.py +++ b/charms/slurmd/src/slurmd_ops.py @@ -15,7 +15,7 @@ from typing import Any, Dict import distro -from constants import MUNGE_KEY_PATH, SLURM_GROUP, SLURM_USER, UBUNTU_HPC_PPA_KEY +from constants import MUNGE_KEY_PATH, ROCM_PPA_KEY, SLURM_GROUP, SLURM_USER, UBUNTU_HPC_PPA_KEY import charms.operator_libs_linux.v0.apt as apt # type: ignore [import-untyped] import charms.operator_libs_linux.v1.systemd as systemd # type: ignore [import-untyped] @@ -130,6 +130,84 @@ def install(self) -> bool: return package_installed +class ROCmPackagesLifecycleManager: + """Facilitate ROCm packages lifecycles.""" + + _LD_FILE = Path("/etc/ld.so.conf.d/rocm.conf") + _PATH_FILE = Path("PATH=$PATH:/opt/rocm-6.2.0/bin") + + def install(self) -> None: + """Install ROCm required packages. + + NOTE: The machine will require a restart after installation to enable the `amdgpu-dkms` module. + + Raises: + SlurmdException if the installation failed. + """ + try: + arch = subprocess.check_output(["dpkg", "--print-architecture"], text=True).strip() + except subprocess.CalledProcessError as e: + raise SlurmdException( + f"failed to determine the system architecture. reason: {e.stderr}" + ) + + amdgpu_repo = apt.DebianRepository( + enabled=True, + repotype="deb", + uri="https://repo.radeon.com/amdgpu/6.2/ubuntu", + release=distro.codename(), + groups=["main"], + options={"arch": arch}, + ) + amdgpu_repo.import_key(ROCM_PPA_KEY) + + rocm_repo = apt.DebianRepository( + enabled=True, + repotype="deb", + uri="https://repo.radeon.com/rocm/apt/6.2", + release=distro.codename(), + groups=["main"], + options={"arch": arch}, + ) + rocm_repo.import_key(ROCM_PPA_KEY) + + repositories = apt.RepositoryMapping() + repositories.add(amdgpu_repo) + repositories.add(rocm_repo) + + apt.update() + apt.add_package(["amdgpu-dkms", "rocm"]) + + def is_installed(self) -> bool: + """Check if the ROCm packages have been installed.""" + try: + for package in ["amdgpu-dkms", "rocm"]: + apt.DebianPackage.from_installed_package(package) + except apt.PackageNotFoundError: + return False + + return True + + def post_install(self) -> None: + """Execute post-installation steps for the ROCm packages. + + Raises: + SlurmdException if the post-installation steps failed. + """ + self._LD_FILE.parent.mkdir(parents=True, exist_ok=True) + self._LD_FILE.write_text("/opt/rocm/lib\n/opt/rocm/lib64") + + try: + subprocess.check_call(["ldconfig"], text=True) + except subprocess.CalledProcessError as e: + raise SlurmdException( + f"could not execute the post-installation step. reason: {e.stdout}" + ) + + self._PATH_FILE.parent.mkdir(parents=True, exist_ok=True) + self._PATH_FILE.write_text("PATH=$PATH:/opt/rocm-6.2.0/bin") + + class SlurmdManager: """SlurmdManager.""" @@ -138,6 +216,7 @@ def __init__(self): self._slurmd_package = CharmedHPCPackageLifecycleManager("slurmd") self._slurm_client_package = CharmedHPCPackageLifecycleManager("slurm-client") self._common_packages = CommonPackagesLifecycleManager() + self.rocm_manager = ROCmPackagesLifecycleManager() def install(self) -> bool: """Install slurmd, slurm-client, munge, and common packages to the system."""