From 3dcfdf77bfe3308aac89a044fbd385610e5a5062 Mon Sep 17 00:00:00 2001
From: Moritz Sanft <58110325+msanft@users.noreply.github.com>
Date: Mon, 28 Oct 2024 13:17:28 +0100
Subject: [PATCH 1/3] packages/nvidia-ctk-with-config: init

This adds a preconfigured `nvidia-ctk` package for use with peer pods GPU
containers.
---
Review notes (text between the three-dash line and the diff is not part of
the commit):
  * config.toml is a template. The placeholders @nvidia-container-cli@,
    @nvidia-container-runtime-hook@, @nvidia-ctk@ and @glibcbin@ are filled
    in by the replaceVars call in nvidia-ctk-with-config/package.nix.
  * The ldconfig value is a literal '@' followed by the @glibcbin@
    placeholder, so it renders as '@<glibc bin output>/bin/ldconfig'.
    NOTE(review): presumably the toolkit's '@' prefix convention for
    ldconfig paths - confirm against the NVIDIA Container Toolkit docs.
  * The hook and nvidia-ctk paths written into the config are taken from the
    nvidia-container-toolkit argument as passed in, not from the result of
    the override defined here.
  * NOTE(review): verify the indentation of the context lines in this series
    against the target tree before applying.

 packages/by-name/mkNixosConfig/package.nix    |  1 +
 .../nvidia-ctk-with-config/config.toml        | 40 +++++++++++++++++++
 .../nvidia-ctk-with-config/package.nix        | 21 ++++++++++
 3 files changed, 62 insertions(+)
 create mode 100644 packages/by-name/nvidia-ctk-with-config/config.toml
 create mode 100644 packages/by-name/nvidia-ctk-with-config/package.nix

diff --git a/packages/by-name/mkNixosConfig/package.nix b/packages/by-name/mkNixosConfig/package.nix
index 1e00b77a2c..c4187f7b9a 100644
--- a/packages/by-name/mkNixosConfig/package.nix
+++ b/packages/by-name/mkNixosConfig/package.nix
@@ -41,6 +41,7 @@ lib.makeOverridable (
             cloud-api-adaptor
             kernel-podvm-azure
             pause-bundle
+            nvidia-ctk-with-config
             ;
           inherit (outerPkgs.kata) kata-agent;
         })
diff --git a/packages/by-name/nvidia-ctk-with-config/config.toml b/packages/by-name/nvidia-ctk-with-config/config.toml
new file mode 100644
index 0000000000..1ae191dd61
--- /dev/null
+++ b/packages/by-name/nvidia-ctk-with-config/config.toml
@@ -0,0 +1,40 @@
+#accept-nvidia-visible-devices-as-volume-mounts = false
+#accept-nvidia-visible-devices-envvar-when-unprivileged = true
+disable-require = true
+supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,video"
+#swarm-resource = "DOCKER_RESOURCE_GPU"
+
+[nvidia-container-cli]
+no-pivot = true
+debug = "/var/log/nvidia-kata-container/nvidia-container-toolkit.log"
+environment = []
+ldcache = "/tmp/ld.so.cache"
+ldconfig = "@@glibcbin@/bin/ldconfig"
+load-kmods = true
+no-cgroups = true
+path = "@nvidia-container-cli@"
+#root = "/run/nvidia/driver"
+#user = "root:video"
+
+[nvidia-container-runtime]
+debug = "/var/log/nvidia-kata-container/nvidia-container-runtime.log"
+log-level = "debug"
+mode = "cdi"
+runtimes = ["docker-runc", "runc", "crun"]
+
+[nvidia-container-runtime.modes]
+
+[nvidia-container-runtime.modes.cdi]
+annotation-prefixes = ["cdi.k8s.io/"]
+default-kind = "nvidia.com/gpu"
+spec-dirs = ["/var/run/cdi"]
+
+[nvidia-container-runtime.modes.csv]
+mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
+
+[nvidia-container-runtime-hook]
+path = "@nvidia-container-runtime-hook@"
+skip-mode-detection = true
+
+[nvidia-ctk]
+path = "@nvidia-ctk@"
diff --git a/packages/by-name/nvidia-ctk-with-config/package.nix b/packages/by-name/nvidia-ctk-with-config/package.nix
new file mode 100644
index 0000000000..bdf57a1910
--- /dev/null
+++ b/packages/by-name/nvidia-ctk-with-config/package.nix
@@ -0,0 +1,21 @@
+# Copyright 2024 Edgeless Systems GmbH
+# SPDX-License-Identifier: AGPL-3.0-only
+
+# This builds an nvidia-container-toolkit package with a custom config required
+# for use in peer pods GPU containers.
+
+{
+  nvidia-container-toolkit,
+  libnvidia-container,
+  replaceVars,
+  glibc,
+  lib,
+}:
+nvidia-container-toolkit.override {
+  configTemplatePath = replaceVars ./config.toml {
+    "nvidia-container-cli" = "${lib.getExe' libnvidia-container "nvidia-container-cli"}";
+    "nvidia-container-runtime-hook" = "${lib.getExe' nvidia-container-toolkit "nvidia-container-runtime-hook"}";
+    "nvidia-ctk" = "${lib.getExe' nvidia-container-toolkit "nvidia-ctk"}";
+    "glibcbin" = "${lib.getBin glibc}";
+  };
+}

From 2f7e456cfdeca2bc20419a9d0d8d3cba3261cca2 Mon Sep 17 00:00:00 2001
From: Moritz Sanft <58110325+msanft@users.noreply.github.com>
Date: Mon, 28 Oct 2024 13:18:02 +0100
Subject: [PATCH 2/3] packages/nvidia-ctk-oci-hook: init

This adds a package for the OCI hook required to start peer pods GPU
containers.
---
Review notes (text between the three-dash line and the diff is not part of
the commit):
  * The generated script runs nvidia-container-runtime-hook from
    nvidia-ctk-with-config, passing -config with that package's
    etc/nvidia-container-runtime/config.toml, then -debug, then the
    script's own arguments ("$@").
  * stdout and stderr of the hook are redirected with '>' to
    /var/log/nvidia-hook.log, so each invocation replaces the previous
    contents of that file.

 packages/by-name/mkNixosConfig/package.nix    |  1 +
 .../by-name/nvidia-ctk-oci-hook/package.nix   | 27 +++++++++++++++++++
 2 files changed, 28 insertions(+)
 create mode 100644 packages/by-name/nvidia-ctk-oci-hook/package.nix

diff --git a/packages/by-name/mkNixosConfig/package.nix b/packages/by-name/mkNixosConfig/package.nix
index c4187f7b9a..3b08d93d54 100644
--- a/packages/by-name/mkNixosConfig/package.nix
+++ b/packages/by-name/mkNixosConfig/package.nix
@@ -41,6 +41,7 @@ lib.makeOverridable (
             cloud-api-adaptor
             kernel-podvm-azure
             pause-bundle
+            nvidia-ctk-oci-hook
             nvidia-ctk-with-config
             ;
           inherit (outerPkgs.kata) kata-agent;
diff --git a/packages/by-name/nvidia-ctk-oci-hook/package.nix b/packages/by-name/nvidia-ctk-oci-hook/package.nix
new file mode 100644
index 0000000000..dfd5d0457d
--- /dev/null
+++ b/packages/by-name/nvidia-ctk-oci-hook/package.nix
@@ -0,0 +1,27 @@
+# Copyright 2024 Edgeless Systems GmbH
+# SPDX-License-Identifier: AGPL-3.0-only
+
+{
+  writeShellApplication,
+  nvidia-ctk-with-config,
+  lib,
+}:
+writeShellApplication {
+  name = "nvidia-ctk-oci-hook";
+
+  text = ''
+    # Log the o/p of the hook to a file
+    ${lib.getExe' nvidia-ctk-with-config "nvidia-container-runtime-hook"} \
+      -config ${nvidia-ctk-with-config}/etc/nvidia-container-runtime/config.toml \
+      -debug "$@" > /var/log/nvidia-hook.log 2>&1
+  '';
+
+  meta = {
+    description = "OCI hook for nvidia-container-runtime";
+    longDescription = ''
+      This is an OCI hook (prestart) for the nvidia-container-runtime. It is used to
+      facilitate GPU containers in peer pods with the necessary drivers, libraries,
+      and binaries to access the GPU.
+    '';
+  };
+}

From 219147e0b30cb7d8ba2f0baf7d60aaa777433710 Mon Sep 17 00:00:00 2001
From: Moritz Sanft <58110325+msanft@users.noreply.github.com>
Date: Mon, 28 Oct 2024 13:20:16 +0100
Subject: [PATCH 3/3] packages/nixos: place nvidia-ctk-oci-hook in image

This places the OCI hook in the expected path in the podvm-image if GPU use
is configured.
---
Review notes (text between the three-dash line and the diff is not part of
the commit):
  * pkgs is added to the module arguments so that pkgs.nvidia-ctk-oci-hook
    can be referenced.
  * The hook executable (lib.getExe pkgs.nvidia-ctk-oci-hook) becomes the
    source of /usr/share/oci/hooks/prestart/nvidia-container-toolkit.sh in
    the "10-root" repart partition.
  * NOTE(review): the enclosing condition is outside the visible hunk; the
    commit message states that this applies when GPU use is configured.

 packages/nixos/gpu.nix | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/packages/nixos/gpu.nix b/packages/nixos/gpu.nix
index 021174e341..790afcf7e4 100644
--- a/packages/nixos/gpu.nix
+++ b/packages/nixos/gpu.nix
@@ -1,7 +1,12 @@
 # Copyright 2024 Edgeless Systems GmbH
 # SPDX-License-Identifier: AGPL-3.0-only
 
-{ config, lib, ... }:
+{
+  config,
+  lib,
+  pkgs,
+  ...
+}:
 
 let
   cfg = config.contrast.gpu;
@@ -24,6 +29,8 @@ in
     };
     hardware.nvidia-container-toolkit.enable = true;
 
+    image.repart.partitions."10-root".contents."/usr/share/oci/hooks/prestart/nvidia-container-toolkit.sh".source = lib.getExe pkgs.nvidia-ctk-oci-hook;
+
     boot.initrd.kernelModules = [
       # Extra kernel modules required to talk to the GPU in CC-Mode.
       "ecdsa_generic"