diff --git a/charmcraft.yaml b/charmcraft.yaml index dac7bdb..1af8322 100644 --- a/charmcraft.yaml +++ b/charmcraft.yaml @@ -34,12 +34,16 @@ bases: architectures: [amd64] parts: - charm: + charm: {} + nhc: + plugin: nil build-packages: - wget + override-pull: | + wget https://github.com/mej/nhc/releases/download/1.4.3/lbnl-nhc-1.4.3.tar.gz override-build: | - wget https://github.com/mej/nhc/releases/download/1.4.3/lbnl-nhc-1.4.3.tar.gz - craftctl default + install -m644 -D -t $CRAFT_PART_INSTALL lbnl-nhc-1.4.3.tar.gz + craftctl default provides: slurmctld: diff --git a/lib/charms/hpc_libs/v0/slurm_ops.py b/lib/charms/hpc_libs/v0/slurm_ops.py new file mode 100644 index 0000000..d49a38a --- /dev/null +++ b/lib/charms/hpc_libs/v0/slurm_ops.py @@ -0,0 +1,280 @@ +# Copyright 2024 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""Abstractions for managing Slurm operations via snap. + +This library contains the `SlurmManagerBase` and `ServiceType` classes +which provide high-level interfaces for managing Slurm within charmed operators. + +### Example Usage + +#### Managing a Slurm service + +The `SlurmManagerBase` constructor receives a `ServiceType` enum. The enum instructs +the inheriting Slurm service manager how to manage its corresponding Slurm service on the host. 
class SlurmOpsError(Exception):
    """Exception raised when a slurm operation failed."""

    @property
    def message(self) -> str:
        """Return message passed as argument to exception.

        Returns:
            The first positional argument the exception was created with, or an
            empty string when the exception was created without any arguments.
        """
        # `self.args` is an empty tuple for `SlurmOpsError()`; indexing it directly
        # would raise IndexError while the caller is already handling a failure.
        return self.args[0] if self.args else ""
def _call(cmd: str, *args: str, stdin: Optional[str] = None) -> str:
    """Call a command with logging.

    Args:
        cmd: Executable to run.
        *args: Arguments to pass to the executable.
        stdin: Text to send to the command on standard input, if any.

    Returns:
        Standard output of the command with leading and trailing whitespace stripped.

    Raises:
        SlurmOpsError: Raised if the command fails.
    """
    # Keep the `str` parameter intact; build the argv list under its own name.
    command = [cmd, *args]
    _logger.debug(f"Executing command {command}")
    try:
        return subprocess.check_output(
            command, input=stdin, stderr=subprocess.PIPE, text=True
        ).strip()
    except subprocess.CalledProcessError as e:
        # `text=True` means `e.stderr` is already a `str`. Calling `.decode()` on it
        # raised AttributeError here and hid the real failure from the caller.
        _logger.error(f"`{' '.join(command)}` failed")
        _logger.error(f"stderr: {e.stderr}")
        raise SlurmOpsError(f"command {command[0]} failed. Reason:\n{e.stderr}") from e
class ServiceType(Enum):
    """Type of Slurm service to manage."""

    MUNGED = "munged"
    SLURMD = "slurmd"
    SLURMCTLD = "slurmctld"
    SLURMDBD = "slurmdbd"
    SLURMRESTD = "slurmrestd"

    @property
    def config_name(self) -> str:
        """Configuration name on the slurm snap for this service type."""
        # Only these two services use a configuration name that differs from the
        # service name; every other member maps to its own value. The table is built
        # inside the property so that it does not become an enum member.
        renamed = {"slurmctld": "slurm", "munged": "munge"}
        return renamed.get(self.value, self.value)
class MungeManager(ServiceManager):
    """Manage `munged` service operations."""

    def __init__(self) -> None:
        # The service type drives both the snap service name used by the inherited
        # enable/disable/restart methods and the configuration namespace.
        self._service = ServiceType.MUNGED
        self.config = ConfigurationManager(self._service.config_name)

    def get_key(self) -> str:
        """Get the current munge key.

        Returns:
            The current munge key as a base64-encoded string.
        """
        current_key = _mungectl("key", "get")
        return current_key

    def set_key(self, key: str) -> None:
        """Set a new munge key.

        Args:
            key: A new, base64-encoded munge key.
        """
        # The key is handed to `mungectl` on stdin rather than as an argument.
        _mungectl("key", "set", stdin=key)

    def generate_key(self) -> None:
        """Generate a new, cryptographically secure munge key."""
        _mungectl("key", "generate")
- self._systemd_notices = SystemdNotices(self, ["slurmd"]) + self._systemd_notices = SystemdNotices(self, Service("snap.slurm.slurmd", alias="slurmd")) self.framework.observe(self.on.install, self._on_install) self.framework.observe(self.on.stop, self._on_stop) self.framework.observe(self.on.service_slurmd_started, self._on_slurmd_started) @@ -58,7 +59,7 @@ def _on_install(self, _: InstallEvent) -> None: def _on_start(self, _: StartEvent) -> None: # This will trigger the juju-systemd-notices daemon to # emit a `service-slurmd-started` event. - systemd.service_start("slurmd") + snap.slurmd.enable() def _on_stop(self, _: StopEvent) -> None: # To stop the juju-systemd-notices service running in the background. @@ -72,26 +73,27 @@ def _on_slurmd_started(self, _: ServiceStartedEvent) -> None: # This will trigger the juju-systemd-notices daemon to # emit a `service-slurmd-stopped` event. - systemd.service_stop("slurmd") + snap.slurmd.stop() def _on_slurmd_stopped(self, _: ServiceStoppedEvent) -> None: self.unit.status = BlockedStatus("slurmd not running") ``` """ -__all__ = ["ServiceStartedEvent", "ServiceStoppedEvent", "SystemdNotices"] +__all__ = ["Service", "ServiceStartedEvent", "ServiceStoppedEvent", "SystemdNotices"] import argparse import asyncio import functools import logging -import re import signal import subprocess import textwrap +from dataclasses import dataclass from pathlib import Path -from typing import List +from typing import Mapping, Optional +import yaml from dbus_fast.aio import MessageBus from dbus_fast.constants import BusType, MessageType from dbus_fast.errors import DBusError @@ -99,6 +101,10 @@ def _on_slurmd_stopped(self, _: ServiceStoppedEvent) -> None: from ops.charm import CharmBase from ops.framework import EventBase +# FIXME: This is a custom version of `juju-systemd-notices`. Upstream does not yet have +# patches for observing the state of snap services. 
@dataclass
class Service:
    """Systemd service to observe.

    Args:
        name: Name of systemd service to observe on dbus.
        alias: Event name alias for service. When omitted (or otherwise falsy),
            the service name itself is used as the alias.
    """

    name: str
    alias: Optional[str] = None

    def __post_init__(self) -> None:  # noqa D105
        # Fall back to the service name so that `alias` is always usable when
        # building event and hook names.
        if not self.alias:
            self.alias = self.name
def _generate_config(self) -> None:
    """Generate watch file for systemd notices daemon.

    Writes `watch.yaml` into the charm directory. The file holds a single
    `services` table mapping each observed service name to its alias, and is
    restricted to mode 0o600 after it has been written.
    """
    _logger.debug("Generating watch file for %s", self._services)
    aliases = {}
    for service in self._services:
        aliases[service.name] = service.alias

    watch_file = self._charm.framework.charm_dir / "watch.yaml"
    if watch_file.exists():
        _logger.debug("Overwriting existing watch file %s", watch_file.name)
    with watch_file.open("wt") as stream:
        yaml.dump({"services": aliases}, stream)
    watch_file.chmod(0o600)
@functools.lru_cache(maxsize=32)
def _read_config() -> Mapping[str, str]:
    """Read systemd notices daemon configuration to service names and aliases.

    Loads `watch.yaml` from the current working directory and returns its
    `services` table. The result is cached, so the file is only parsed on the
    first call.
    """
    watch_file = Path.cwd().joinpath("watch.yaml")
    _logger.debug("Loading observed services from configuration file %s", watch_file)

    with watch_file.open("rt") as stream:
        document = yaml.safe_load(stream)
    return document["services"]
@@ -310,8 +361,10 @@ async def _send_juju_notification(service: str, state: str) -> None: if service.endswith(".service"): service = service[0:-len(".service")] # fmt: skip + watched_services = _read_config() + alias = watched_services[service] event_name = "started" if state == "active" else "stopped" - hook = f"service-{service}-{event_name}" + hook = f"service-{alias}-{event_name}" cmd = ["/usr/bin/juju-exec", _juju_unit, f"hooks/{hook}"] _logger.debug("Invoking hook %s with command: %s", hook, " ".join(cmd)) @@ -364,20 +417,8 @@ async def _async_load_services() -> None: will be queried from systemd to determine it's initial state. """ global _juju_unit - hooks_dir = Path.cwd() / "hooks" - _logger.info("Loading services from hooks in %s", hooks_dir) - - if not hooks_dir.exists(): - _logger.warning("Hooks dir %s does not exist.", hooks_dir) - return - - watched_services = [] - # Get service-{service}-(started|stopped) hooks defined by the charm. - for hook in hooks_dir.iterdir(): - match = _service_hook_regex_filter.match(hook.name) - if match: - watched_services.append(match.group("service")) + watched_services = _read_config() _logger.info("Services from hooks are %s", watched_services) if not watched_services: return @@ -386,7 +427,7 @@ async def _async_load_services() -> None: # Loop through all the services and be sure that a new watcher is # started for new ones. - for service in watched_services: + for service in watched_services.keys(): # The .service suffix is necessary and will cause lookup failures of the # service unit when readying the watcher if absent from the service name. 
service = f"{service}.service" diff --git a/src/charm.py b/src/charm.py index c004fcf..dd27a40 100755 --- a/src/charm.py +++ b/src/charm.py @@ -10,6 +10,7 @@ from typing import Any, Dict from charms.operator_libs_linux.v0.juju_systemd_notices import ( # type: ignore[import-untyped] + Service, ServiceStartedEvent, ServiceStoppedEvent, SystemdNotices, @@ -32,7 +33,6 @@ ) from slurm_conf_editor import Node, Partition from slurmd_ops import SlurmdManager -from utils import slurmd logger = logging.getLogger(__name__) @@ -60,7 +60,7 @@ def __init__(self, *args, **kwargs): self._slurmd_manager = SlurmdManager() self._slurmctld = Slurmctld(self, "slurmctld") - self._systemd_notices = SystemdNotices(self, ["slurmd"]) + self._systemd_notices = SystemdNotices(self, Service("snap.slurm.slurmd", "slurmd")) event_handler_bindings = { self.on.install: self._on_install, @@ -82,7 +82,6 @@ def _on_install(self, event: InstallEvent) -> None: if self._slurmd_manager.install(): self.unit.set_workload_version(self._slurmd_manager.version()) - slurmd.override_service() self._systemd_notices.subscribe() self._stored.slurm_installed = True @@ -142,7 +141,7 @@ def _on_slurmctld_available(self, event: SlurmctldAvailableEvent) -> None: if (slurmctld_host := event.slurmctld_host) != self._stored.slurmctld_host: if slurmctld_host is not None: - slurmd.override_default(slurmctld_host) + self._slurmd_manager.set_conf_server(slurmctld_host) self._stored.slurmctld_host = slurmctld_host logger.debug(f"slurmctld_host={slurmctld_host}") else: @@ -178,7 +177,8 @@ def _on_slurmctld_available(self, event: SlurmctldAvailableEvent) -> None: else: logger.error("## Unable to restart munge") - slurmd.restart() + self._slurmd_manager._manager.enable() + self._slurmd_manager._manager.restart() self._check_status() def _on_slurmctld_unavailable(self, event) -> None: @@ -188,7 +188,7 @@ def _on_slurmctld_unavailable(self, event) -> None: self._stored.nhc_params = "" self._stored.munge_key = "" 
self._stored.slurmctld_host = "" - slurmd.stop() + self._slurmd_manager._manager.disable() self._check_status() def _on_slurmd_started(self, _: ServiceStartedEvent) -> None: @@ -204,7 +204,8 @@ def _on_node_configured_action(self, _: ActionEvent) -> None: # Trigger reconfiguration of slurmd node. self._new_node = False self._slurmctld.set_node() - slurmd.restart() + self._slurmd_manager._manager.enable() + self._slurmd_manager._manager.restart() logger.debug("### This node is not new anymore") def _on_show_nhc_config(self, event: ActionEvent) -> None: diff --git a/src/constants.py b/src/constants.py index 21b428f..6eec976 100644 --- a/src/constants.py +++ b/src/constants.py @@ -6,37 +6,4 @@ SLURM_USER = "root" SLURM_GROUP = "root" -MUNGE_KEY_PATH = Path("/etc/munge/munge.key") - -UBUNTU_HPC_PPA_KEY = """ ------BEGIN PGP PUBLIC KEY BLOCK----- -Comment: Hostname: -Version: Hockeypuck 2.1.1-10-gec3b0e7 - -xsFNBGTuZb8BEACtJ1CnZe6/hv84DceHv+a54y3Pqq0gqED0xhTKnbj/E2ByJpmT -NlDNkpeITwPAAN1e3824Me76Qn31RkogTMoPJ2o2XfG253RXd67MPxYhfKTJcnM3 -CEkmeI4u2Lynh3O6RQ08nAFS2AGTeFVFH2GPNWrfOsGZW03Jas85TZ0k7LXVHiBs -W6qonbsFJhshvwC3SryG4XYT+z/+35x5fus4rPtMrrEOD65hij7EtQNaE8owuAju -Kcd0m2b+crMXNcllWFWmYMV0VjksQvYD7jwGrWeKs+EeHgU8ZuqaIP4pYHvoQjag -umqnH9Qsaq5NAXiuAIAGDIIV4RdAfQIR4opGaVgIFJdvoSwYe3oh2JlrLPBlyxyY -dayDifd3X8jxq6/oAuyH1h5K/QLs46jLSR8fUbG98SCHlRmvozTuWGk+e07ALtGe -sGv78ToHKwoM2buXaTTHMwYwu7Rx8LZ4bZPHdersN1VW/m9yn1n5hMzwbFKy2s6/ -D4Q2ZBsqlN+5aW2q0IUmO+m0GhcdaDv8U7RVto1cWWPr50HhiCi7Yvei1qZiD9jq -57oYZVqTUNCTPxi6NeTOdEc+YqNynWNArx4PHh38LT0bqKtlZCGHNfoAJLPVYhbB -b2AHj9edYtHU9AAFSIy+HstET6P0UDxy02IeyE2yxoUBqdlXyv6FL44E+wARAQAB -zRxMYXVuY2hwYWQgUFBBIGZvciBVYnVudHUgSFBDwsGOBBMBCgA4FiEErocSHcPk -oLD4H/Aj9tDF1ca+s3sFAmTuZb8CGwMFCwkIBwIGFQoJCAsCBBYCAwECHgECF4AA -CgkQ9tDF1ca+s3sz3w//RNawsgydrutcbKf0yphDhzWS53wgfrs2KF1KgB0u/H+u -6Kn2C6jrVM0vuY4NKpbEPCduOj21pTCepL6PoCLv++tICOLVok5wY7Zn3WQFq0js -Iy1wO5t3kA1cTD/05v/qQVBGZ2j4DsJo33iMcQS5AjHvSr0nu7XSvDDEE3cQE55D 
-87vL7lgGjuTOikPh5FpCoS1gpemBfwm2Lbm4P8vGOA4/witRjGgfC1fv1idUnZLM -TbGrDlhVie8pX2kgB6yTYbJ3P3kpC1ZPpXSRWO/cQ8xoYpLBTXOOtqwZZUnxyzHh -gM+hv42vPTOnCo+apD97/VArsp59pDqEVoAtMTk72fdBqR+BB77g2hBkKESgQIEq -EiE1/TOISioMkE0AuUdaJ2ebyQXugSHHuBaqbEC47v8t5DVN5Qr9OriuzCuSDNFn -6SBHpahN9ZNi9w0A/Yh1+lFfpkVw2t04Q2LNuupqOpW+h3/62AeUqjUIAIrmfeML -IDRE2VdquYdIXKuhNvfpJYGdyvx/wAbiAeBWg0uPSepwTfTG59VPQmj0FtalkMnN -ya2212K5q68O5eXOfCnGeMvqIXxqzpdukxSZnLkgk40uFJnJVESd/CxHquqHPUDE -fy6i2AnB3kUI27D4HY2YSlXLSRbjiSxTfVwNCzDsIh7Czefsm6ITK2+cVWs0hNQ= -=cs1s ------END PGP PUBLIC KEY BLOCK----- -""" +SLURM_SNAP = Path("/snap/slurm/current") diff --git a/src/slurmd_ops.py b/src/slurmd_ops.py index dcab81b..e3a4a99 100644 --- a/src/slurmd_ops.py +++ b/src/slurmd_ops.py @@ -7,17 +7,15 @@ import shlex import subprocess import textwrap -from base64 import b64decode from grp import getgrnam from pathlib import Path from pwd import getpwnam from shutil import rmtree from typing import Any, Dict -import charms.operator_libs_linux.v0.apt as apt # type: ignore [import-untyped] +import charms.hpc_libs.v0.slurm_ops as slurm import charms.operator_libs_linux.v1.systemd as systemd # type: ignore [import-untyped] -import distro -from constants import MUNGE_KEY_PATH, SLURM_GROUP, SLURM_USER, UBUNTU_HPC_PPA_KEY +from constants import SLURM_GROUP, SLURM_SNAP, SLURM_USER logger = logging.getLogger() @@ -39,101 +37,24 @@ def __init__(self, msg): pass -class CharmedHPCPackageLifecycleManager: - """Facilitate ubuntu-hpc slurm component package lifecycles.""" - - def __init__(self, package_name: str): - self._package_name = package_name - self._keyring_path = Path(f"/usr/share/keyrings/ubuntu-hpc-{self._package_name}.asc") - - def _repo(self) -> apt.DebianRepository: - """Return the ubuntu-hpc repo.""" - ppa_url = "https://ppa.launchpadcontent.net/ubuntu-hpc/slurm-wlm-23.02/ubuntu" - sources_list = f"deb [signed-by={self._keyring_path}] {ppa_url} {distro.codename()} main" - return 
apt.DebianRepository.from_repo_line(sources_list) - - def install(self) -> bool: - """Install package using lib apt.""" - package_installed = False - - if self._keyring_path.exists(): - self._keyring_path.unlink() - self._keyring_path.write_text(UBUNTU_HPC_PPA_KEY) - - repositories = apt.RepositoryMapping() - repositories.add(self._repo()) - - try: - apt.update() - apt.add_package([self._package_name]) - package_installed = True - except apt.PackageNotFoundError: - logger.error(f"'{self._package_name}' not found in package cache or on system.") - except apt.PackageError as e: - logger.error(f"Could not install '{self._package_name}'. Reason: {e.message}") - - return package_installed - - def uninstall(self) -> None: - """Uninstall the package using libapt.""" - if apt.remove_package(self._package_name): - logger.info(f"'{self._package_name}' removed from system.") - else: - logger.error(f"'{self._package_name}' not found on system.") - - repositories = apt.RepositoryMapping() - repositories.disable(self._repo()) - - if self._keyring_path.exists(): - self._keyring_path.unlink() - - def upgrade_to_latest(self) -> None: - """Upgrade package to latest.""" - try: - slurm_package = apt.DebianPackage.from_system(self._package_name) - slurm_package.ensure(apt.PackageState.Latest) - logger.info(f"Updated '{self._package_name}' to: {slurm_package.version.number}.") - except apt.PackageNotFoundError: - logger.error(f"'{self._package_name}' not found in package cache or on system.") - except apt.PackageError as e: - logger.error(f"Could not install '{self._package_name}'. 
Reason: {e.message}") - - def version(self) -> str: - """Return the package version.""" - slurm_package_vers = "" - try: - slurm_package_vers = apt.DebianPackage.from_installed_package( - self._package_name - ).version.number - except apt.PackageNotFoundError: - logger.error(f"'{self._package_name}' not found on system.") - return slurm_package_vers - - class SlurmdManager: """SlurmdManager.""" def __init__(self): - self._munge_package = CharmedHPCPackageLifecycleManager("munge") - self._slurmd_package = CharmedHPCPackageLifecycleManager("slurmd") - self._slurm_client_package = CharmedHPCPackageLifecycleManager("slurm-client") + self._manager = slurm.SlurmManagerBase(slurm.ServiceType.SLURMD) def install(self) -> bool: """Install slurmd, slurm-client and munge packages to the system.""" - if self._slurmd_package.install() is not True: - logger.debug("Cannot install 'slurmd' package.") - return False - - systemd.service_stop("slurmd") + slurm.install() - if self._munge_package.install() is not True: - logger.debug("Cannot install 'munge' package.") - return False + self._manager.disable() + self._manager.munge.disable() - systemd.service_stop("munge") + os.symlink( + "/etc/systemd/system/snap.slurm.slurmd.service", "/etc/systemd/system/slurm.service" + ) - if self._slurm_client_package.install() is not True: - logger.debug("Cannot install 'slurm-client' package.") + if not systemd.daemon_reload(): return False if not self._install_nhc_from_tarball(): @@ -145,19 +66,15 @@ def install(self) -> bool: spool_dir = Path("/var/spool/slurmd") spool_dir.mkdir() - slurm_user_uid, slurm_group_gid = _get_slurm_user_uid_and_slurm_group_gid() - os.chown(f"{spool_dir}", slurm_user_uid, slurm_group_gid) - return True def version(self) -> str: """Return slurm version.""" - return self._slurmd_package.version() + return slurm.version() def write_munge_key(self, munge_key: str) -> None: """Base64 decode and write the munge key.""" - key = b64decode(munge_key.encode()) - 
MUNGE_KEY_PATH.write_bytes(key) + self._manager.munge.set_key(munge_key) def _install_nhc_from_tarball(self) -> bool: """Install NHC from tarball that is packaged with the charm. @@ -173,7 +90,12 @@ def _install_nhc_from_tarball(self) -> bool: base_path.mkdir() cmd = f"tar --extract --directory {base_path} --file lbnl-nhc-1.4.3.tar.gz".split() - subprocess.run(cmd) + try: + result = subprocess.check_output(cmd, stderr=subprocess.STDOUT, text=True) + logger.debug(result) + except subprocess.CalledProcessError as e: + logger.error("failed to extract NHC using tar. reason:\n%s", e.stdout) + return False full_path = base_path / os.listdir(base_path)[0] @@ -271,26 +193,30 @@ def restart_munged(self) -> bool: """ try: logger.debug("## Restarting munge") - systemd.service_restart("munge") - except SlurmdException("Cannot start munge.") as e: # type: ignore [misc] + self._manager.munge.enable() + self._manager.munge.restart() + except slurm.SlurmOpsError as e: # type: ignore [misc] logger.error(e) return False return self.check_munged() def check_munged(self) -> bool: """Check if munge is working correctly.""" - if not systemd.service_running("munge"): + if not systemd.service_running("snap.slurm.munged"): return False # check if munge is working, i.e., can use the credentials correctly try: logger.debug("## Testing if munge is working correctly") - cmd = "munge -n" + cmd = "slurm.munge -n" munge = subprocess.Popen( shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE ) unmunge = subprocess.Popen( - ["unmunge"], stdin=munge.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ["slurm.unmunge"], + stdin=munge.stdout, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, ) if munge is not None: munge.stdout.close() # type: ignore [union-attr] @@ -310,7 +236,9 @@ def get_node_config(self) -> Dict[Any, Any]: """Return the node configuration options as reported by slurmd -C.""" slurmd_config_options = "" try: - slurmd_config_options = 
def set_conf_server(self, server: str) -> None:
    """Set the config server that provides the config file.

    Args:
        server: Server hostname of the slurmctld service.
    """
    # Stored under the `config-server` key of the managed service's configuration.
    settings = {"config-server": server}
    self._manager.config.set(settings)