Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

nixos: minimize GPU image #1092

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 46 additions & 20 deletions packages/by-name/kata/kata-kernel-uvm/package.nix
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@
# SPDX-License-Identifier: AGPL-3.0-only

{
lib,
fetchurl,
linuxManualConfig,
stdenvNoCC,
fetchzip,
kata,
withGPU ? false,
... # Required for invocation through `linuxPackagesFor`, which calls this with the `features` argument.
}:

Expand All @@ -20,28 +22,51 @@ let
hash = "sha256-VcbOY86p8VkI6XvdhHfZNnWVHKuMLW7Xj7uzHHDiVsk=";
};

postPatch = ''
config=$(find . -regex '.*/config-[0-9.-]+-confidential')
postPatch =
(
if withGPU then
''
config=$(find . -regex '.*/config-[0-9.-]+-nvidia-gpu-confidential')

# 1. We don't use an initrd.
# 2. Enable dm-init, so that we can use `dm-mod.create`.
# 3. Disable module signing to make the build reproducable.
substituteInPlace $config \
--replace-fail 'CONFIG_INITRAMFS_SOURCE="initramfs.cpio.gz"' 'CONFIG_INITRAMFS_SOURCE=""' \
--replace-fail 'CONFIG_MODULE_SIG=y' 'CONFIG_MODULE_SIG=n' \
--replace-fail '# CONFIG_DM_INIT is not set' 'CONFIG_DM_INIT=y' \
--replace-fail '# CONFIG_DMIID is not set' 'CONFIG_DMIID=y' \
--replace-fail '# CONFIG_TMPFS_POSIX_ACL is not set' 'CONFIG_TMPFS_POSIX_ACL=y' \
--replace-fail '# CONFIG_TMPFS_XATTR is not set' 'CONFIG_TMPFS_XATTR=y' \
--replace-fail '# CONFIG_EFIVAR_FS is not set' 'CONFIG_EFIVAR_FS=y' \
--replace-fail '# CONFIG_RD_ZSTD is not set' 'CONFIG_RD_ZSTD=y' \
--replace-fail '# CONFIG_VFAT_FS is not se' 'CONFIG_VFAT_FS=y' \
--replace-fail '# CONFIG_NLS_CODEPAGE_437 is not set' 'CONFIG_NLS_CODEPAGE_437=y' \
--replace-fail '# CONFIG_NLS_ISO8859_1 is not set' 'CONFIG_NLS_ISO8859_1=y' \
--replace-fail '# CONFIG_ATA is not set' 'CONFIG_ATA=y'
# Enable dm-init, so that we can use `dm-mod.create`.
cat <<EOF >> $config
CONFIG_MD=y
CONFIG_BLK_DEV_DM=y
CONFIG_DM_INIT=y
CONFIG_DM_CRYPT=y
CONFIG_DM_VERITY=y
CONFIG_DM_INTEGRITY=y
EOF
''
else
''
config=$(find . -regex '.*/config-[0-9.-]+-confidential')

echo "CONFIG_ATA_PIIX=y" >> $config
'';
# 1. Our kernel build is independent of any initrd.
# 2. Enable dm-init, so that we can use `dm-mod.create`.
substituteInPlace $config \
--replace-fail 'CONFIG_INITRAMFS_SOURCE="initramfs.cpio.gz"' 'CONFIG_INITRAMFS_SOURCE=""' \
--replace-fail '# CONFIG_DM_INIT is not set' 'CONFIG_DM_INIT=y'
''
)
+ ''
# Shared kernel config edits, applied to whichever config file the branch above
# stored in $config.
# 1. Disable module signing to make the build reproducible.
# 2. Enable the options needed to use this kernel in NixOS (for example, NixOS
#    has a hard check on whether everything required for systemd is present).
# Every pattern must match the complete "is not set" marker: substituteInPlace
# replaces substrings, so a truncated pattern would leave stray characters
# behind the new value.
substituteInPlace $config \
  --replace-fail 'CONFIG_MODULE_SIG=y' 'CONFIG_MODULE_SIG=n' \
  --replace-fail '# CONFIG_DMIID is not set' 'CONFIG_DMIID=y' \
  --replace-fail '# CONFIG_TMPFS_POSIX_ACL is not set' 'CONFIG_TMPFS_POSIX_ACL=y' \
  --replace-fail '# CONFIG_TMPFS_XATTR is not set' 'CONFIG_TMPFS_XATTR=y' \
  --replace-fail '# CONFIG_EFIVAR_FS is not set' 'CONFIG_EFIVAR_FS=y' \
  --replace-fail '# CONFIG_RD_ZSTD is not set' 'CONFIG_RD_ZSTD=y' \
  --replace-fail '# CONFIG_VFAT_FS is not set' 'CONFIG_VFAT_FS=y' \
  --replace-fail '# CONFIG_NLS_CODEPAGE_437 is not set' 'CONFIG_NLS_CODEPAGE_437=y' \
  --replace-fail '# CONFIG_NLS_ISO8859_1 is not set' 'CONFIG_NLS_ISO8859_1=y' \
  --replace-fail '# CONFIG_ATA is not set' 'CONFIG_ATA=y'

# CONFIG_ATA_PIIX is appended rather than substituted.
echo "CONFIG_ATA_PIIX=y" >> $config
'';

dontBuild = true;

Expand All @@ -57,6 +82,7 @@ in

linuxManualConfig rec {
version = "6.11";
modDirVersion = "${version}.0" + lib.optionalString withGPU "-nvidia-gpu-confidential";

# See https://github.com/kata-containers/kata-containers/blob/5f11c0f144037d8d8f546c89a0392dcd84fa99e2/versions.yaml#L198-L201
src = fetchurl {
Expand Down
90 changes: 74 additions & 16 deletions packages/nixos/gpu.nix
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,68 @@

let
cfg = config.contrast.gpu;

nvidiaPackage =
(
(config.boot.kernelPackages.nvidiaPackages.mkDriver {
# TODO(msanft): Investigate why the latest version breaks
# GPU containers.
msanft marked this conversation as resolved.
Show resolved Hide resolved
version = "550.90.07";
sha256_64bit = "sha256-Uaz1edWpiE9XOh0/Ui5/r6XnhB4iqc7AtLvq4xsLlzM=";
sha256_aarch64 = "sha256-uJa3auRlMHr8WyacQL2MyyeebqfT7K6VU0qR7LGXFXI=";
openSha256 = "sha256-VLmh7eH0xhEu/AK+Osb9vtqAFni+lx84P/bo4ZgCqj8=";
settingsSha256 = "sha256-sX9dHEp9zH9t3RWp727lLCeJLo8QRAGhVb8iN6eX49g=";
persistencedSha256 = "sha256-qe8e1Nxla7F0U88AbnOZm6cHxo57pnLCqtjdvOvq9jk=";
}).override
{
disable32Bit = true;
}
).overrideAttrs
(oldAttrs: {
# We strip the driver package of its dependencies on desktop software like Wayland and X11.
msanft marked this conversation as resolved.
Show resolved Hide resolved
# For server use-cases, we shouldn't need these. The Mesa (and thus Perl) and libGL dependencies are dropped
# too, as GPU workloads will likely be AI-related and not graphical. The libdrm dependency is dropped as well,
# as we're probably not going to be watching Netflix on the servers.
# Source: https://github.com/NixOS/nixpkgs/blob/eac1633a086e8e109e00ce58c0b47721da1dbdfd/pkgs/os-specific/linux/nvidia-x11/generic.nix#L100C3-L114C6
libPath = lib.makeLibraryPath (
with pkgs;
[
zlib
stdenv.cc.cc
openssl
dbus # for nvidia-powerd
]
);

# Hack to pass the "right" (i.e. the overridden) version of the nvidia driver to the persistenced.
# Looking at the package definition, it _should_ already do so, but it doesn't.
# So for now, override all occurrences of `nvidia_x11` in the persistenced package "manually".
# We can't do an `override` on persistenced itself unfortunately, as its call site doesn't allow this:
# https://github.com/NixOS/nixpkgs/blob/4d2418ebbfb107485b44aaa1b2909409322d9061/pkgs/os-specific/linux/nvidia-x11/generic.nix#L260
# TODO(msanft): Clarify with upstream why that is the case.
passthru = oldAttrs.passthru // {
persistenced = oldAttrs.passthru.persistenced.overrideAttrs (oldAttrs: {
inherit (nvidiaPackage) version makeFlags;
src = oldAttrs.src // {
rev = nvidiaPackage.version;
};

postFixup = ''
# Save a copy of persistenced for mounting in containers
burgerdev marked this conversation as resolved.
Show resolved Hide resolved
mkdir $out/origBin
cp $out/{bin,origBin}/nvidia-persistenced
patchelf --set-interpreter /lib64/ld-linux-x86-64.so.2 $out/origBin/nvidia-persistenced
burgerdev marked this conversation as resolved.
Show resolved Hide resolved

patchelf --set-rpath "$(patchelf --print-rpath $out/bin/nvidia-persistenced):${nvidiaPackage}/lib" \
$out/bin/nvidia-persistenced
'';

meta = oldAttrs.meta // {
inherit (nvidiaPackage.meta) platforms;
};
});
};
});
in

{
Expand All @@ -20,26 +82,22 @@ in
config = lib.mkIf cfg.enable {
hardware.nvidia = {
open = true;
package = lib.mkDefault (
config.boot.kernelPackages.nvidiaPackages.mkDriver {
# TODO: Investigate why the latest version breaks
# GPU containers.
version = "550.90.07";
sha256_64bit = "sha256-Uaz1edWpiE9XOh0/Ui5/r6XnhB4iqc7AtLvq4xsLlzM=";
sha256_aarch64 = "sha256-uJa3auRlMHr8WyacQL2MyyeebqfT7K6VU0qR7LGXFXI=";
openSha256 = "sha256-VLmh7eH0xhEu/AK+Osb9vtqAFni+lx84P/bo4ZgCqj8=";
settingsSha256 = "sha256-sX9dHEp9zH9t3RWp727lLCeJLo8QRAGhVb8iN6eX49g=";
persistencedSha256 = "sha256-qe8e1Nxla7F0U88AbnOZm6cHxo57pnLCqtjdvOvq9jk=";
}
);
package = nvidiaPackage;
nvidiaPersistenced = true;
# Disable NVIDIA's GUI settings tool.
nvidiaSettings = false;
# We don't need video acceleration on a server. Disabling this
# saves quite some disk space.
videoAcceleration = false;
};
hardware.graphics = {
enable = true;
enable32Bit = true;
};

hardware.nvidia-container-toolkit.enable = true;

# Make NVIDIA the "default" graphics driver to replace Mesa,
# which saves us another Perl dependency.
hardware.graphics.package = nvidiaPackage;
hardware.graphics.package32 = nvidiaPackage;

image.repart.partitions."10-root".contents."/usr/share/oci/hooks/prestart/nvidia-container-toolkit.sh".source =
lib.getExe pkgs.nvidia-ctk-oci-hook;

Expand Down