From 519d9b5759f3fb979e9196fa2b54a5149173f996 Mon Sep 17 00:00:00 2001 From: "Jason C. Nucciarone" Date: Fri, 12 Jul 2024 21:39:38 -0400 Subject: [PATCH 1/3] fix: ensure that nhc tarball is properly primed within charm Needed to add the part `charm: {}` to make the charm pack correctly. jedel1043 and I found that this part declaration must be included in `charmcraft.yaml`, or it will fail to pack the charm correctly. nhc will be there, but the charm won't :'( Signed-off-by: Jason C. Nucciarone --- charmcraft.yaml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/charmcraft.yaml b/charmcraft.yaml index dac7bdb..1af8322 100644 --- a/charmcraft.yaml +++ b/charmcraft.yaml @@ -34,12 +34,16 @@ bases: architectures: [amd64] parts: - charm: + charm: {} + nhc: + plugin: nil build-packages: - wget + override-pull: | + wget https://github.com/mej/nhc/releases/download/1.4.3/lbnl-nhc-1.4.3.tar.gz override-build: | - wget https://github.com/mej/nhc/releases/download/1.4.3/lbnl-nhc-1.4.3.tar.gz - craftctl default + install -m644 -D -t $CRAFT_PART_INSTALL lbnl-nhc-1.4.3.tar.gz + craftctl default provides: slurmctld: From f0102f2d3f4ca252860cdee8a397183fab56d2d5 Mon Sep 17 00:00:00 2001 From: "Jason C. Nucciarone" Date: Fri, 12 Jul 2024 21:44:21 -0400 Subject: [PATCH 2/3] fix: handle if tar fails to extract contents of nhc tarball Previously, if tar failed to extract the contents of the nhc tarball to `/tmp/nhc`, _install_nhc_from_tarball would throw an unhandled exception that would cause the charm to bork. Now we catch if tar fails to extract the contents of the tarball, log the error output for the administrator to read, and return False so that the slurmd operator can properly handle an install failure. Signed-off-by: Jason C. 
Nucciarone --- src/slurmd_ops.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/slurmd_ops.py b/src/slurmd_ops.py index dcab81b..7b3a1b8 100644 --- a/src/slurmd_ops.py +++ b/src/slurmd_ops.py @@ -173,7 +173,12 @@ def _install_nhc_from_tarball(self) -> bool: base_path.mkdir() cmd = f"tar --extract --directory {base_path} --file lbnl-nhc-1.4.3.tar.gz".split() - subprocess.run(cmd) + try: + result = subprocess.check_output(cmd, stderr=subprocess.STDOUT, text=True) + logger.debug(result) + except subprocess.CalledProcessError as e: + logger.error("failed to extract NHC using tar. reason:\n%s", e.stdout) + return False full_path = base_path / os.listdir(base_path)[0] From 14b39fca713495eb7dd432f9b7371dcff8af4e57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Juli=C3=A1n=20Espina?= Date: Thu, 11 Jul 2024 13:00:46 -0600 Subject: [PATCH 3/3] feat: replace ppa with the Slurm snap This also transitions our systemd notices to a new version, since the old version doesn't support services with dots on their names. --- lib/charms/hpc_libs/v0/slurm_ops.py | 280 ++++++++++++++++++ .../v0/juju_systemd_notices.py | 137 ++++++--- src/charm.py | 15 +- src/constants.py | 35 +-- src/slurmd_ops.py | 133 ++------- src/templates/override.conf | 3 - src/utils/slurmd.py | 15 - 7 files changed, 410 insertions(+), 208 deletions(-) create mode 100644 lib/charms/hpc_libs/v0/slurm_ops.py delete mode 100644 src/templates/override.conf diff --git a/lib/charms/hpc_libs/v0/slurm_ops.py b/lib/charms/hpc_libs/v0/slurm_ops.py new file mode 100644 index 0000000..d49a38a --- /dev/null +++ b/lib/charms/hpc_libs/v0/slurm_ops.py @@ -0,0 +1,280 @@ +# Copyright 2024 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""Abstractions for managing Slurm operations via snap. + +This library contains the `SlurmManagerBase` and `ServiceType` class +which provide high-level interfaces for managing Slurm within charmed operators. + +### Example Usage + +#### Managing a Slurm service + +The `SlurmManagerBase` constructor receives a `ServiceType` enum. The enum instructs +the inheriting Slurm service manager how to manage its corresponding Slurm service on the host. + +```python3 +import charms.hpc_libs.v0.slurm_ops as slurm +from charms.hpc_libs.v0.slurm_ops import SlurmManagerBase, ServiceType + +class SlurmctldManager(SlurmManagerBase): + # Manage `slurmctld` service on host. + + def __init__(self) -> None: + super().__init__(ServiceType.SLURMCTLD) + + +class ApplicationCharm(CharmBase): + # Application charm that needs to use the Slurm snap. 
+ + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + self._slurm_manager = SlurmctldManager() + self.framework.observe( + self.on.install, + self._on_install, + ) + + def _on_install(self, _) -> None: + slurm.install() + self.unit.set_workload_version(slurm.version()) + self._slurm_manager.config.set({"cluster-name": "cluster"}) +``` +""" + +__all__ = [ + "format_key", + "install", + "version", + "ConfigurationManager", + "ServiceType", + "SlurmManagerBase", +] + +import json +import logging +import re +import subprocess +from collections.abc import Mapping +from enum import Enum +from typing import Any, Optional + +import yaml + +# The unique Charmhub library identifier, never change it +LIBID = "541fd767f90b40539cf7cd6e7db8fabf" + +# Increment this major API version when introducing breaking changes +LIBAPI = 0 + +# Increment this PATCH version before using `charmcraft publish-lib` or reset +# to 0 if you are raising the major API version +LIBPATCH = 3 + +# Charm library dependencies to fetch during `charmcraft pack`. +PYDEPS = ["pyyaml>=6.0.1"] + +_logger = logging.getLogger(__name__) +_acronym = re.compile(r"(?<=[A-Z])(?=[A-Z][a-z])") +_kebabize = re.compile(r"(?<=[a-z0-9])(?=[A-Z])") + + +class SlurmOpsError(Exception): + """Exception raised when a slurm operation failed.""" + + @property + def message(self) -> str: + """Return message passed as argument to exception.""" + return self.args[0] + + +def format_key(key: str) -> str: + """Format Slurm configuration keys from SlurmCASe into kebab case. + + Args: + key: Slurm configuration key to convert to kebab case. + + Notes: + Slurm configuration syntax does not follow proper PascalCasing + format, so we cannot put keys directly through a kebab case converter + to get the desired format. Some additional processing is needed for + certain keys before the key can properly kebabized. 
+ + For example, without additional preprocessing, the key `CPUs` will + become `cp-us` if put through a kebabizer with being preformatted to `Cpus`. + """ + if "CPUs" in key: + key = key.replace("CPUs", "Cpus") + key = _acronym.sub(r"-", key) + return _kebabize.sub(r"-", key).lower() + + +def install() -> None: + """Install Slurm.""" + # FIXME: Pin slurm to the stable channel + _snap("install", "slurm", "--channel", "latest/candidate", "--classic") + + +def version() -> str: + """Get the current version of Slurm installed on the system.""" + info = yaml.safe_load(_snap("info", "slurm")) + ver: str = info["installed"] + return ver.split(maxsplit=1)[0] + + +def _call(cmd: str, *args: str, stdin: Optional[str] = None) -> str: + """Call a command with logging. + + Raises: + SlurmOpsError: Raised if the command fails. + """ + cmd = [cmd, *args] + _logger.debug(f"Executing command {cmd}") + try: + return subprocess.check_output(cmd, input=stdin, stderr=subprocess.PIPE, text=True).strip() + except subprocess.CalledProcessError as e: + _logger.error(f"`{' '.join(cmd)}` failed") + _logger.error(f"stderr: {e.stderr.decode()}") + raise SlurmOpsError(f"command {cmd[0]} failed. Reason:\n{e.stderr.decode()}") + + +def _snap(*args) -> str: + """Control snap by via executed `snap ...` commands. + + Raises: + subprocess.CalledProcessError: Raised if snap command fails. + """ + return _call("snap", *args) + + +def _mungectl(*args: str, stdin: Optional[str] = None) -> str: + """Control munge via `slurm.mungectl ...`. + + Args: + *args: Arguments to pass to `mungectl`. + stdin: Input to pass to `mungectl` via stdin. + + Raises: + subprocess.CalledProcessError: Raised if `mungectl` command fails. 
+ """ + return _call("slurm.mungectl", *args, stdin=stdin) + + +class ServiceType(Enum): + """Type of Slurm service to manage.""" + + MUNGED = "munged" + SLURMD = "slurmd" + SLURMCTLD = "slurmctld" + SLURMDBD = "slurmdbd" + SLURMRESTD = "slurmrestd" + + @property + def config_name(self) -> str: + """Configuration name on the slurm snap for this service type.""" + if self is ServiceType.SLURMCTLD: + return "slurm" + if self is ServiceType.MUNGED: + return "munge" + + return self.value + + +class ServiceManager: + """Control a Slurm service.""" + + def enable(self) -> None: + """Enable service.""" + _snap("start", "--enable", f"slurm.{self._service.value}") + + def disable(self) -> None: + """Disable service.""" + _snap("stop", "--disable", f"slurm.{self._service.value}") + + def restart(self) -> None: + """Restart service.""" + _snap("restart", f"slurm.{self._service.value}") + + +class ConfigurationManager: + """Control configuration of a Slurm component.""" + + def __init__(self, name: str) -> None: + self._name = name + + def get_options(self, *keys: str) -> Mapping[str, Any]: + """Get given configurations values for Slurm component.""" + configs = {} + for key in keys: + config = self.get(key) + target = key.rsplit(".", maxsplit=1)[-1] + configs[target] = config + + return configs + + def get(self, key: Optional[str] = None) -> Any: + """Get specific configuration value for Slurm component.""" + key = f"{self._name}.{key}" if key else self._name + config = json.loads(_snap("get", "-d", "slurm", key)) + return config[key] + + def set(self, config: Mapping[str, Any]) -> None: + """Set configuration for Slurm component.""" + args = [f"{self._name}.{k}={json.dumps(v)}" for k, v in config.items()] + _snap("set", "slurm", *args) + + def unset(self, *keys: str) -> None: + """Unset configuration for Slurm component.""" + args = [f"{self._name}.{k}" for k in keys] if len(keys) > 0 else [self._name] + _snap("unset", "slurm", *args) + + +class MungeManager(ServiceManager): 
+ """Manage `munged` service operations.""" + + def __init__(self) -> None: + service = ServiceType.MUNGED + self._service = service + self.config = ConfigurationManager(service.config_name) + + def get_key(self) -> str: + """Get the current munge key. + + Returns: + The current munge key as a base64-encoded string. + """ + return _mungectl("key", "get") + + def set_key(self, key: str) -> None: + """Set a new munge key. + + Args: + key: A new, base64-encoded munge key. + """ + _mungectl("key", "set", stdin=key) + + def generate_key(self) -> None: + """Generate a new, cryptographically secure munge key.""" + _mungectl("key", "generate") + + +class SlurmManagerBase(ServiceManager): + """Base manager for Slurm services.""" + + def __init__(self, service: ServiceType) -> None: + self._service = service + self.config = ConfigurationManager(service.config_name) + self.munge = MungeManager() diff --git a/lib/charms/operator_libs_linux/v0/juju_systemd_notices.py b/lib/charms/operator_libs_linux/v0/juju_systemd_notices.py index 08157c9..024047e 100644 --- a/lib/charms/operator_libs_linux/v0/juju_systemd_notices.py +++ b/lib/charms/operator_libs_linux/v0/juju_systemd_notices.py @@ -1,5 +1,5 @@ #!/usr/bin/python3 -# Copyright 2023 Canonical Ltd. +# Copyright 2023-2024 Canonical Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,6 +28,7 @@ ```python from charms.operator_libs_linux.v0.juju_systemd_notices import ( + Service, ServiceStartedEvent, ServiceStoppedEvent, SystemdNotices, @@ -41,7 +42,7 @@ def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) # Register services with charm. This adds the events to observe. 
- self._systemd_notices = SystemdNotices(self, ["slurmd"]) + self._systemd_notices = SystemdNotices(self, Service("snap.slurm.slurmd", alias="slurmd")) self.framework.observe(self.on.install, self._on_install) self.framework.observe(self.on.stop, self._on_stop) self.framework.observe(self.on.service_slurmd_started, self._on_slurmd_started) @@ -58,7 +59,7 @@ def _on_install(self, _: InstallEvent) -> None: def _on_start(self, _: StartEvent) -> None: # This will trigger the juju-systemd-notices daemon to # emit a `service-slurmd-started` event. - systemd.service_start("slurmd") + snap.slurmd.enable() def _on_stop(self, _: StopEvent) -> None: # To stop the juju-systemd-notices service running in the background. @@ -72,26 +73,27 @@ def _on_slurmd_started(self, _: ServiceStartedEvent) -> None: # This will trigger the juju-systemd-notices daemon to # emit a `service-slurmd-stopped` event. - systemd.service_stop("slurmd") + snap.slurmd.stop() def _on_slurmd_stopped(self, _: ServiceStoppedEvent) -> None: self.unit.status = BlockedStatus("slurmd not running") ``` """ -__all__ = ["ServiceStartedEvent", "ServiceStoppedEvent", "SystemdNotices"] +__all__ = ["Service", "ServiceStartedEvent", "ServiceStoppedEvent", "SystemdNotices"] import argparse import asyncio import functools import logging -import re import signal import subprocess import textwrap +from dataclasses import dataclass from pathlib import Path -from typing import List +from typing import Mapping, Optional +import yaml from dbus_fast.aio import MessageBus from dbus_fast.constants import BusType, MessageType from dbus_fast.errors import DBusError @@ -99,6 +101,10 @@ def _on_slurmd_stopped(self, _: ServiceStoppedEvent) -> None: from ops.charm import CharmBase from ops.framework import EventBase +# FIXME: This is a custom version of `juju-systemd-notices`. Upstream does not yet have +# patches for observing the state of snap services. 
Will sync with upstream again once +# gh:canonical/operator-libs-linux#128 lands against upstream. + # The unique Charmhub library identifier, never change it. LIBID = "2bb6ecd037e64c899033113abab02e01" @@ -111,12 +117,11 @@ def _on_slurmd_stopped(self, _: ServiceStoppedEvent) -> None: # juju-systemd-notices charm library dependencies. # Charm library dependencies are installed when the consuming charm is packed. -PYDEPS = ["dbus-fast>=1.90.2"] +PYDEPS = ["dbus-fast>=1.90.2", "pyyaml>=6.0.1"] _logger = logging.getLogger(__name__) _juju_unit = None _service_states = {} -_service_hook_regex_filter = re.compile(r"service-(?P[\w\\:-]*)-(?:started|stopped)") _DBUS_CHAR_MAPPINGS = { "_5f": "_", # _ must be first since char mappings contain _. "_40": "@", @@ -148,6 +153,22 @@ def _systemctl(*args) -> None: _disable_service = functools.partial(_systemctl, "disable") +@dataclass +class Service: + """Systemd service to observe. + + Args: + name: Name of systemd service to observe on dbus. + alias: Event name alias for service. 
+ """ + + name: str + alias: Optional[str] = None + + def __post_init__(self) -> None: # noqa D105 + self.alias = self.alias or self.name + + class ServiceStartedEvent(EventBase): """Event emitted when service has started.""" @@ -159,7 +180,7 @@ class ServiceStoppedEvent(EventBase): class SystemdNotices: """Observe systemd services on your machine base.""" - def __init__(self, charm: CharmBase, services: List[str]) -> None: + def __init__(self, charm: CharmBase, *services: Service) -> None: """Instantiate systemd notices service.""" self._charm = charm self._services = services @@ -170,39 +191,65 @@ def __init__(self, charm: CharmBase, services: List[str]) -> None: "Attaching systemd notice events to charm %s", self._charm.__class__.__name__ ) for service in self._services: - self._charm.on.define_event(f"service_{service}_started", ServiceStartedEvent) - self._charm.on.define_event(f"service_{service}_stopped", ServiceStoppedEvent) + self._charm.on.define_event(f"service_{service.alias}_started", ServiceStartedEvent) + self._charm.on.define_event(f"service_{service.alias}_stopped", ServiceStoppedEvent) def subscribe(self) -> None: """Subscribe charmed operator to observe status of systemd services.""" + self._generate_hooks() + self._generate_config() + self._start() + + def stop(self) -> None: + """Stop charmed operator from observing the status of subscribed services.""" + _stop_service(self._service_file.name) + # Notices daemon is disabled so that the service will not restart after machine reboot. 
+ _disable_service(self._service_file.name) + + def _generate_hooks(self) -> None: + """Generate legacy event hooks for observed systemd services.""" _logger.debug("Generating systemd notice hooks for %s", self._services) - start_hooks = [Path(f"hooks/service-{service}-started") for service in self._services] - stop_hooks = [Path(f"hooks/service-{service}-stopped") for service in self._services] + start_hooks = [Path(f"hooks/service-{s.alias}-started") for s in self._services] + stop_hooks = [Path(f"hooks/service-{s.alias}-stopped") for s in self._services] for hook in start_hooks + stop_hooks: if hook.exists(): _logger.debug("Hook %s already exists. Skipping...", hook.name) else: hook.symlink_to(self._charm.framework.charm_dir / "dispatch") + def _generate_config(self) -> None: + """Generate watch file for systemd notices daemon.""" + _logger.debug("Generating watch file for %s", self._services) + config = {"services": {s.name: s.alias for s in self._services}} + + config_file = self._charm.framework.charm_dir / "watch.yaml" + if config_file.exists(): + _logger.debug("Overwriting existing watch file %s", config_file.name) + with config_file.open("wt") as fout: + yaml.dump(config, fout) + config_file.chmod(0o600) + + def _start(self) -> None: + """Start systemd notices daemon to observe subscribed services.""" _logger.debug("Starting %s daemon", self._service_file.name) if self._service_file.exists(): _logger.debug("Overwriting existing service file %s", self._service_file.name) self._service_file.write_text( textwrap.dedent( f""" - [Unit] - Description=Juju systemd notices daemon - After=multi-user.target - - [Service] - Type=simple - Restart=always - WorkingDirectory={self._charm.framework.charm_dir} - Environment="PYTHONPATH={self._charm.framework.charm_dir / "venv"}" - ExecStart=/usr/bin/python3 {__file__} {self._charm.unit.name} - - [Install] - WantedBy=multi-user.target + [Unit] + Description=Juju systemd notices daemon + After=multi-user.target + + [Service] 
+ Type=simple + Restart=always + WorkingDirectory={self._charm.framework.charm_dir} + Environment="PYTHONPATH={self._charm.framework.charm_dir / "venv"}" + ExecStart=/usr/bin/python3 {__file__} {self._charm.unit.name} + + [Install] + WantedBy=multi-user.target """ ).strip() ) @@ -214,12 +261,6 @@ def subscribe(self) -> None: _start_service(self._service_file.name) _logger.debug("Started %s daemon", self._service_file.name) - def stop(self) -> None: - """Stop charmed operator from observing the status of subscribed services.""" - _stop_service(self._service_file.name) - # Notices daemon is disabled so that the service will not restart after machine reboot. - _disable_service(self._service_file.name) - def _name_to_dbus_path(name: str) -> str: """Convert the specified name into an org.freedesktop.systemd1.Unit path handle. @@ -256,6 +297,16 @@ def _dbus_path_to_name(path: str) -> str: return name +@functools.lru_cache(maxsize=32) +def _read_config() -> Mapping[str, str]: + """Read systemd notices daemon configuration to service names and aliases.""" + config_file = Path.cwd() / "watch.yaml" + _logger.debug("Loading observed services from configuration file %s", config_file) + + with config_file.open("rt") as fin: + return yaml.safe_load(fin)["services"] + + def _systemd_unit_changed(msg: Message) -> bool: """Send Juju notification if systemd unit state changes on the DBus bus. 
@@ -310,8 +361,10 @@ async def _send_juju_notification(service: str, state: str) -> None: if service.endswith(".service"): service = service[0:-len(".service")] # fmt: skip + watched_services = _read_config() + alias = watched_services[service] event_name = "started" if state == "active" else "stopped" - hook = f"service-{service}-{event_name}" + hook = f"service-{alias}-{event_name}" cmd = ["/usr/bin/juju-exec", _juju_unit, f"hooks/{hook}"] _logger.debug("Invoking hook %s with command: %s", hook, " ".join(cmd)) @@ -364,20 +417,8 @@ async def _async_load_services() -> None: will be queried from systemd to determine it's initial state. """ global _juju_unit - hooks_dir = Path.cwd() / "hooks" - _logger.info("Loading services from hooks in %s", hooks_dir) - - if not hooks_dir.exists(): - _logger.warning("Hooks dir %s does not exist.", hooks_dir) - return - - watched_services = [] - # Get service-{service}-(started|stopped) hooks defined by the charm. - for hook in hooks_dir.iterdir(): - match = _service_hook_regex_filter.match(hook.name) - if match: - watched_services.append(match.group("service")) + watched_services = _read_config() _logger.info("Services from hooks are %s", watched_services) if not watched_services: return @@ -386,7 +427,7 @@ async def _async_load_services() -> None: # Loop through all the services and be sure that a new watcher is # started for new ones. - for service in watched_services: + for service in watched_services.keys(): # The .service suffix is necessary and will cause lookup failures of the # service unit when readying the watcher if absent from the service name. 
service = f"{service}.service" diff --git a/src/charm.py b/src/charm.py index c004fcf..dd27a40 100755 --- a/src/charm.py +++ b/src/charm.py @@ -10,6 +10,7 @@ from typing import Any, Dict from charms.operator_libs_linux.v0.juju_systemd_notices import ( # type: ignore[import-untyped] + Service, ServiceStartedEvent, ServiceStoppedEvent, SystemdNotices, @@ -32,7 +33,6 @@ ) from slurm_conf_editor import Node, Partition from slurmd_ops import SlurmdManager -from utils import slurmd logger = logging.getLogger(__name__) @@ -60,7 +60,7 @@ def __init__(self, *args, **kwargs): self._slurmd_manager = SlurmdManager() self._slurmctld = Slurmctld(self, "slurmctld") - self._systemd_notices = SystemdNotices(self, ["slurmd"]) + self._systemd_notices = SystemdNotices(self, Service("snap.slurm.slurmd", "slurmd")) event_handler_bindings = { self.on.install: self._on_install, @@ -82,7 +82,6 @@ def _on_install(self, event: InstallEvent) -> None: if self._slurmd_manager.install(): self.unit.set_workload_version(self._slurmd_manager.version()) - slurmd.override_service() self._systemd_notices.subscribe() self._stored.slurm_installed = True @@ -142,7 +141,7 @@ def _on_slurmctld_available(self, event: SlurmctldAvailableEvent) -> None: if (slurmctld_host := event.slurmctld_host) != self._stored.slurmctld_host: if slurmctld_host is not None: - slurmd.override_default(slurmctld_host) + self._slurmd_manager.set_conf_server(slurmctld_host) self._stored.slurmctld_host = slurmctld_host logger.debug(f"slurmctld_host={slurmctld_host}") else: @@ -178,7 +177,8 @@ def _on_slurmctld_available(self, event: SlurmctldAvailableEvent) -> None: else: logger.error("## Unable to restart munge") - slurmd.restart() + self._slurmd_manager._manager.enable() + self._slurmd_manager._manager.restart() self._check_status() def _on_slurmctld_unavailable(self, event) -> None: @@ -188,7 +188,7 @@ def _on_slurmctld_unavailable(self, event) -> None: self._stored.nhc_params = "" self._stored.munge_key = "" 
self._stored.slurmctld_host = "" - slurmd.stop() + self._slurmd_manager._manager.disable() self._check_status() def _on_slurmd_started(self, _: ServiceStartedEvent) -> None: @@ -204,7 +204,8 @@ def _on_node_configured_action(self, _: ActionEvent) -> None: # Trigger reconfiguration of slurmd node. self._new_node = False self._slurmctld.set_node() - slurmd.restart() + self._slurmd_manager._manager.enable() + self._slurmd_manager._manager.restart() logger.debug("### This node is not new anymore") def _on_show_nhc_config(self, event: ActionEvent) -> None: diff --git a/src/constants.py b/src/constants.py index 21b428f..6eec976 100644 --- a/src/constants.py +++ b/src/constants.py @@ -6,37 +6,4 @@ SLURM_USER = "root" SLURM_GROUP = "root" -MUNGE_KEY_PATH = Path("/etc/munge/munge.key") - -UBUNTU_HPC_PPA_KEY = """ ------BEGIN PGP PUBLIC KEY BLOCK----- -Comment: Hostname: -Version: Hockeypuck 2.1.1-10-gec3b0e7 - -xsFNBGTuZb8BEACtJ1CnZe6/hv84DceHv+a54y3Pqq0gqED0xhTKnbj/E2ByJpmT -NlDNkpeITwPAAN1e3824Me76Qn31RkogTMoPJ2o2XfG253RXd67MPxYhfKTJcnM3 -CEkmeI4u2Lynh3O6RQ08nAFS2AGTeFVFH2GPNWrfOsGZW03Jas85TZ0k7LXVHiBs -W6qonbsFJhshvwC3SryG4XYT+z/+35x5fus4rPtMrrEOD65hij7EtQNaE8owuAju -Kcd0m2b+crMXNcllWFWmYMV0VjksQvYD7jwGrWeKs+EeHgU8ZuqaIP4pYHvoQjag -umqnH9Qsaq5NAXiuAIAGDIIV4RdAfQIR4opGaVgIFJdvoSwYe3oh2JlrLPBlyxyY -dayDifd3X8jxq6/oAuyH1h5K/QLs46jLSR8fUbG98SCHlRmvozTuWGk+e07ALtGe -sGv78ToHKwoM2buXaTTHMwYwu7Rx8LZ4bZPHdersN1VW/m9yn1n5hMzwbFKy2s6/ -D4Q2ZBsqlN+5aW2q0IUmO+m0GhcdaDv8U7RVto1cWWPr50HhiCi7Yvei1qZiD9jq -57oYZVqTUNCTPxi6NeTOdEc+YqNynWNArx4PHh38LT0bqKtlZCGHNfoAJLPVYhbB -b2AHj9edYtHU9AAFSIy+HstET6P0UDxy02IeyE2yxoUBqdlXyv6FL44E+wARAQAB -zRxMYXVuY2hwYWQgUFBBIGZvciBVYnVudHUgSFBDwsGOBBMBCgA4FiEErocSHcPk -oLD4H/Aj9tDF1ca+s3sFAmTuZb8CGwMFCwkIBwIGFQoJCAsCBBYCAwECHgECF4AA -CgkQ9tDF1ca+s3sz3w//RNawsgydrutcbKf0yphDhzWS53wgfrs2KF1KgB0u/H+u -6Kn2C6jrVM0vuY4NKpbEPCduOj21pTCepL6PoCLv++tICOLVok5wY7Zn3WQFq0js -Iy1wO5t3kA1cTD/05v/qQVBGZ2j4DsJo33iMcQS5AjHvSr0nu7XSvDDEE3cQE55D 
-87vL7lgGjuTOikPh5FpCoS1gpemBfwm2Lbm4P8vGOA4/witRjGgfC1fv1idUnZLM -TbGrDlhVie8pX2kgB6yTYbJ3P3kpC1ZPpXSRWO/cQ8xoYpLBTXOOtqwZZUnxyzHh -gM+hv42vPTOnCo+apD97/VArsp59pDqEVoAtMTk72fdBqR+BB77g2hBkKESgQIEq -EiE1/TOISioMkE0AuUdaJ2ebyQXugSHHuBaqbEC47v8t5DVN5Qr9OriuzCuSDNFn -6SBHpahN9ZNi9w0A/Yh1+lFfpkVw2t04Q2LNuupqOpW+h3/62AeUqjUIAIrmfeML -IDRE2VdquYdIXKuhNvfpJYGdyvx/wAbiAeBWg0uPSepwTfTG59VPQmj0FtalkMnN -ya2212K5q68O5eXOfCnGeMvqIXxqzpdukxSZnLkgk40uFJnJVESd/CxHquqHPUDE -fy6i2AnB3kUI27D4HY2YSlXLSRbjiSxTfVwNCzDsIh7Czefsm6ITK2+cVWs0hNQ= -=cs1s ------END PGP PUBLIC KEY BLOCK----- -""" +SLURM_SNAP = Path("/snap/slurm/current") diff --git a/src/slurmd_ops.py b/src/slurmd_ops.py index 7b3a1b8..e3a4a99 100644 --- a/src/slurmd_ops.py +++ b/src/slurmd_ops.py @@ -7,17 +7,15 @@ import shlex import subprocess import textwrap -from base64 import b64decode from grp import getgrnam from pathlib import Path from pwd import getpwnam from shutil import rmtree from typing import Any, Dict -import charms.operator_libs_linux.v0.apt as apt # type: ignore [import-untyped] +import charms.hpc_libs.v0.slurm_ops as slurm import charms.operator_libs_linux.v1.systemd as systemd # type: ignore [import-untyped] -import distro -from constants import MUNGE_KEY_PATH, SLURM_GROUP, SLURM_USER, UBUNTU_HPC_PPA_KEY +from constants import SLURM_GROUP, SLURM_SNAP, SLURM_USER logger = logging.getLogger() @@ -39,101 +37,24 @@ def __init__(self, msg): pass -class CharmedHPCPackageLifecycleManager: - """Facilitate ubuntu-hpc slurm component package lifecycles.""" - - def __init__(self, package_name: str): - self._package_name = package_name - self._keyring_path = Path(f"/usr/share/keyrings/ubuntu-hpc-{self._package_name}.asc") - - def _repo(self) -> apt.DebianRepository: - """Return the ubuntu-hpc repo.""" - ppa_url = "https://ppa.launchpadcontent.net/ubuntu-hpc/slurm-wlm-23.02/ubuntu" - sources_list = f"deb [signed-by={self._keyring_path}] {ppa_url} {distro.codename()} main" - return 
apt.DebianRepository.from_repo_line(sources_list) - - def install(self) -> bool: - """Install package using lib apt.""" - package_installed = False - - if self._keyring_path.exists(): - self._keyring_path.unlink() - self._keyring_path.write_text(UBUNTU_HPC_PPA_KEY) - - repositories = apt.RepositoryMapping() - repositories.add(self._repo()) - - try: - apt.update() - apt.add_package([self._package_name]) - package_installed = True - except apt.PackageNotFoundError: - logger.error(f"'{self._package_name}' not found in package cache or on system.") - except apt.PackageError as e: - logger.error(f"Could not install '{self._package_name}'. Reason: {e.message}") - - return package_installed - - def uninstall(self) -> None: - """Uninstall the package using libapt.""" - if apt.remove_package(self._package_name): - logger.info(f"'{self._package_name}' removed from system.") - else: - logger.error(f"'{self._package_name}' not found on system.") - - repositories = apt.RepositoryMapping() - repositories.disable(self._repo()) - - if self._keyring_path.exists(): - self._keyring_path.unlink() - - def upgrade_to_latest(self) -> None: - """Upgrade package to latest.""" - try: - slurm_package = apt.DebianPackage.from_system(self._package_name) - slurm_package.ensure(apt.PackageState.Latest) - logger.info(f"Updated '{self._package_name}' to: {slurm_package.version.number}.") - except apt.PackageNotFoundError: - logger.error(f"'{self._package_name}' not found in package cache or on system.") - except apt.PackageError as e: - logger.error(f"Could not install '{self._package_name}'. 
Reason: {e.message}") - - def version(self) -> str: - """Return the package version.""" - slurm_package_vers = "" - try: - slurm_package_vers = apt.DebianPackage.from_installed_package( - self._package_name - ).version.number - except apt.PackageNotFoundError: - logger.error(f"'{self._package_name}' not found on system.") - return slurm_package_vers - - class SlurmdManager: """SlurmdManager.""" def __init__(self): - self._munge_package = CharmedHPCPackageLifecycleManager("munge") - self._slurmd_package = CharmedHPCPackageLifecycleManager("slurmd") - self._slurm_client_package = CharmedHPCPackageLifecycleManager("slurm-client") + self._manager = slurm.SlurmManagerBase(slurm.ServiceType.SLURMD) def install(self) -> bool: """Install slurmd, slurm-client and munge packages to the system.""" - if self._slurmd_package.install() is not True: - logger.debug("Cannot install 'slurmd' package.") - return False + slurm.install() - systemd.service_stop("slurmd") + self._manager.disable() + self._manager.munge.disable() - if self._munge_package.install() is not True: - logger.debug("Cannot install 'munge' package.") - return False - - systemd.service_stop("munge") + os.symlink( + "/etc/systemd/system/snap.slurm.slurmd.service", "/etc/systemd/system/slurm.service" + ) - if self._slurm_client_package.install() is not True: - logger.debug("Cannot install 'slurm-client' package.") + if not systemd.daemon_reload(): return False if not self._install_nhc_from_tarball(): @@ -145,19 +66,15 @@ def install(self) -> bool: spool_dir = Path("/var/spool/slurmd") spool_dir.mkdir() - slurm_user_uid, slurm_group_gid = _get_slurm_user_uid_and_slurm_group_gid() - os.chown(f"{spool_dir}", slurm_user_uid, slurm_group_gid) - return True def version(self) -> str: """Return slurm version.""" - return self._slurmd_package.version() + return slurm.version() def write_munge_key(self, munge_key: str) -> None: """Base64 decode and write the munge key.""" - key = b64decode(munge_key.encode()) - 
MUNGE_KEY_PATH.write_bytes(key) + self._manager.munge.set_key(munge_key) def _install_nhc_from_tarball(self) -> bool: """Install NHC from tarball that is packaged with the charm. @@ -276,26 +193,30 @@ def restart_munged(self) -> bool: """ try: logger.debug("## Restarting munge") - systemd.service_restart("munge") - except SlurmdException("Cannot start munge.") as e: # type: ignore [misc] + self._manager.munge.enable() + self._manager.munge.restart() + except slurm.SlurmOpsError as e: # type: ignore [misc] logger.error(e) return False return self.check_munged() def check_munged(self) -> bool: """Check if munge is working correctly.""" - if not systemd.service_running("munge"): + if not systemd.service_running("snap.slurm.munged"): return False # check if munge is working, i.e., can use the credentials correctly try: logger.debug("## Testing if munge is working correctly") - cmd = "munge -n" + cmd = "slurm.munge -n" munge = subprocess.Popen( shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE ) unmunge = subprocess.Popen( - ["unmunge"], stdin=munge.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ["slurm.unmunge"], + stdin=munge.stdout, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, ) if munge is not None: munge.stdout.close() # type: ignore [union-attr] @@ -315,7 +236,9 @@ def get_node_config(self) -> Dict[Any, Any]: """Return the node configuration options as reported by slurmd -C.""" slurmd_config_options = "" try: - slurmd_config_options = subprocess.check_output(["slurmd", "-C"], text=True).strip() + slurmd_config_options = subprocess.check_output( + [SLURM_SNAP / "sbin" / "slurmd", "-C"], text=True + ).strip() except subprocess.CalledProcessError as e: logger.error(e) raise e @@ -332,3 +255,11 @@ def get_node_config(self) -> Dict[Any, Any]: raise e return slurmd_config_options_parsed + + def set_conf_server(self, server: str) -> None: + """Set the config server that provides the config file. 
+ + Args: + server: Server hostname of the slurmctld service. + """ + self._manager.config.set({"config-server": server}) diff --git a/src/templates/override.conf b/src/templates/override.conf deleted file mode 100644 index d880806..0000000 --- a/src/templates/override.conf +++ /dev/null @@ -1,3 +0,0 @@ -[Service] -LimitMEMLOCK=infinity -LimitNOFILE=1048576 diff --git a/src/utils/slurmd.py b/src/utils/slurmd.py index ab5ed2b..9a0c5e7 100644 --- a/src/utils/slurmd.py +++ b/src/utils/slurmd.py @@ -32,21 +32,6 @@ _logger = logging.getLogger(__name__) -def start() -> None: - """Start slurmd service.""" - systemd.service_start("slurmd") - - -def stop() -> None: - """Stop slurmd service.""" - systemd.service_stop("slurmd") - - -def restart() -> None: - """Restart slurmd service.""" - systemd.service_restart("slurmd") - - def override_default(host: str) -> None: """Override the /etc/default/slurmd file.