diff --git a/docs/docs/features-limitations.md b/docs/docs/features-limitations.md index 386b0bc0d4..69a1c028ef 100644 --- a/docs/docs/features-limitations.md +++ b/docs/docs/features-limitations.md @@ -34,3 +34,10 @@ Currently, this requires inspecting the iptables rules on startup or terminating The Contrast Coordinator is a singleton and can't be scaled to more than one instance. When this instance's pod is restarted, for example for node maintenance, it needs to be recovered manually. In a future release, we plan to support distributed Coordinator instances that can recover automatically. + +## Overriding Kata configuration + +Kata Containers supports [overriding certain configuration values via Kubernetes annotations](https://github.com/kata-containers/kata-containers/blob/b4da4b5e3b9b21048af9333b071235a57a3e9493/docs/how-to/how-to-set-sandbox-config-kata.md). + +It needs to be noted that setting these values is unsupported, and doing so may lead to unexpected +behaviour, as Contrast isn't tested against all possible configuration combinations. diff --git a/nodeinstaller/internal/constants/configuration-qemu-tdx.toml b/nodeinstaller/internal/constants/configuration-qemu-tdx.toml index c37fd9b4ed..7dd1b0b590 100644 --- a/nodeinstaller/internal/constants/configuration-qemu-tdx.toml +++ b/nodeinstaller/internal/constants/configuration-qemu-tdx.toml @@ -18,6 +18,9 @@ cpu_features="-vmx-rdseed-exit,pmu=off" default_vcpus = 1 default_maxvcpus = 0 default_bridges = 1 +# On TDX, when lowering this, the patch: +# packages/by-name/qemu-tdx-static/0004-hw-x86-load-initrd-to-static-address.patch +# needs to be updated accordingly. 
default_memory = 2048 default_maxmemory = 0 disable_block_device_use = false diff --git a/nodeinstaller/internal/constants/constants.go b/nodeinstaller/internal/constants/constants.go index 89628d918d..be75b60c20 100644 --- a/nodeinstaller/internal/constants/constants.go +++ b/nodeinstaller/internal/constants/constants.go @@ -64,17 +64,17 @@ func KataRuntimeConfig(baseDir string, platform platforms.Platform, qemuExtraKer config.Hypervisor["qemu"]["path"] = filepath.Join(baseDir, "tdx", "bin", "qemu-system-x86_64") config.Hypervisor["qemu"]["firmware"] = filepath.Join(baseDir, "tdx", "share", "OVMF.fd") config.Hypervisor["qemu"]["image"] = filepath.Join(baseDir, "share", "kata-containers.img") - config.Hypervisor["qemu"]["kernel"] = filepath.Join(baseDir, "share", "kata-kernel") config.Hypervisor["qemu"]["valid_hypervisor_paths"] = []string{filepath.Join(baseDir, "tdx", "bin", "qemu-system-x86_64")} config.Hypervisor["qemu"]["block_device_aio"] = "threads" config.Hypervisor["qemu"]["shared_fs"] = "none" - kernelParams := qemuExtraKernelParams + config.Hypervisor["qemu"]["initrd"] = filepath.Join(baseDir, "share", "kata-initrd.zst") + config.Hypervisor["qemu"]["kernel"] = filepath.Join(baseDir, "share", "kata-kernel") + // Replace the kernel params entirely (and don't append) since that's + // also what we do when calculating the launch measurement. + config.Hypervisor["qemu"]["kernel_params"] = qemuExtraKernelParams if debug { config.Hypervisor["qemu"]["enable_debug"] = true } - // Replace the kernel params entirely (and don't append) since that's - // also what we do when calculating the launch measurement. 
- config.Hypervisor["qemu"]["kernel_params"] = kernelParams case platforms.K3sQEMUSNP: if err := toml.Unmarshal([]byte(kataBareMetalQEMUSNPBaseConfig), &config); err != nil { return nil, fmt.Errorf("failed to unmarshal kata runtime configuration: %w", err) @@ -82,19 +82,18 @@ func KataRuntimeConfig(baseDir string, platform platforms.Platform, qemuExtraKer config.Hypervisor["qemu"]["path"] = filepath.Join(baseDir, "snp", "bin", "qemu-system-x86_64") config.Hypervisor["qemu"]["firmware"] = filepath.Join(baseDir, "snp", "share", "OVMF.fd") config.Hypervisor["qemu"]["image"] = filepath.Join(baseDir, "share", "kata-containers.img") - config.Hypervisor["qemu"]["kernel"] = filepath.Join(baseDir, "share", "kata-kernel") - delete(config.Hypervisor["qemu"], "initrd") config.Hypervisor["qemu"]["block_device_aio"] = "threads" config.Hypervisor["qemu"]["shared_fs"] = "none" config.Hypervisor["qemu"]["valid_hypervisor_paths"] = []string{filepath.Join(baseDir, "snp", "bin", "qemu-system-x86_64")} config.Hypervisor["qemu"]["rootfs_type"] = "erofs" - kernelParams := qemuExtraKernelParams + config.Hypervisor["qemu"]["initrd"] = filepath.Join(baseDir, "share", "kata-initrd.zst") + config.Hypervisor["qemu"]["kernel"] = filepath.Join(baseDir, "share", "kata-kernel") + // Replace the kernel params entirely (and don't append) since that's + // also what we do when calculating the launch measurement. + config.Hypervisor["qemu"]["kernel_params"] = qemuExtraKernelParams if debug { config.Hypervisor["qemu"]["enable_debug"] = true } - // Replace the kernel params entirely (and don't append) since that's - // also what we do when calculating the launch measurement. 
- config.Hypervisor["qemu"]["kernel_params"] = kernelParams default: return nil, fmt.Errorf("unsupported platform: %s", platform) } diff --git a/packages/by-name/OVMF-TDX/package.nix b/packages/by-name/OVMF-TDX/package.nix index be4941eed1..03a6930bb6 100644 --- a/packages/by-name/OVMF-TDX/package.nix +++ b/packages/by-name/OVMF-TDX/package.nix @@ -9,7 +9,7 @@ debug ? false, }: -edk2.mkDerivation "OvmfPkg/IntelTdx/IntelTdxX64.dsc" rec { +edk2.mkDerivation "OvmfPkg/IntelTdx/IntelTdxX64.dsc" { name = "OVMF-TDX"; buildFlags = lib.optionals debug [ "-D DEBUG_ON_SERIAL_PORT=TRUE" ]; diff --git a/packages/by-name/image-podvm/package.nix b/packages/by-name/image-podvm/package.nix index 26729aa10f..bf97f45e3d 100644 --- a/packages/by-name/image-podvm/package.nix +++ b/packages/by-name/image-podvm/package.nix @@ -15,5 +15,6 @@ buildVerityUKI (mkNixosConfig { debug.enable = withDebug; gpu.enable = withGPU; azure.enable = withCSP == "azure"; + peerpods.enable = true; }; }) diff --git a/packages/by-name/kata/contrast-node-installer-image/package.nix b/packages/by-name/kata/contrast-node-installer-image/package.nix index be31bc4d8b..d0e5c5c22b 100644 --- a/packages/by-name/kata/contrast-node-installer-image/package.nix +++ b/packages/by-name/kata/contrast-node-installer-image/package.nix @@ -47,6 +47,10 @@ let url = "file:///opt/edgeless/share/kata-kernel"; path = "/opt/edgeless/@@runtimeName@@/share/kata-kernel"; } + { + url = "file:///opt/edgeless/share/kata-initrd.zst"; + path = "/opt/edgeless/@@runtimeName@@/share/kata-initrd.zst"; + } { url = "file:///opt/edgeless/snp/bin/qemu-system-x86_64"; path = "/opt/edgeless/@@runtimeName@@/snp/bin/qemu-system-x86_64"; @@ -106,7 +110,7 @@ let } ]; inherit debugRuntime; - qemuExtraKernelParams = kata.snp-launch-digest.dmVerityArgs; + qemuExtraKernelParams = kata.kata-image.cmdline; }; destination = "/config/contrast-node-install.json"; } @@ -116,13 +120,17 @@ let kata-container-img = ociLayerTar { files = [ { - source = 
kata.kata-image; + source = "${kata.kata-image.image}/${kata.kata-image.imageFileName}"; destination = "/opt/edgeless/share/kata-containers.img"; } { - source = "${kata.kata-kernel-uvm}/bzImage"; + source = "${kata.kata-image.kernel}/bzImage"; destination = "/opt/edgeless/share/kata-kernel"; } + { + source = "${kata.kata-image.initialRamdisk}/initrd"; + destination = "/opt/edgeless/share/kata-initrd.zst"; + } ]; }; diff --git a/packages/by-name/kata/kata-image/buildimage.sh b/packages/by-name/kata/kata-image/buildimage.sh deleted file mode 100644 index 18fe86df5a..0000000000 --- a/packages/by-name/kata/kata-image/buildimage.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2024 Edgeless Systems GmbH -# SPDX-License-Identifier: AGPL-3.0-only - -set -euo pipefail -shopt -s inherit_errexit - -# Image layout: -# -# +---------------------------------+-------------------+-------------------------+ -# | 512B DOS MBR (padded to 1 MiB) | p0 rootfs | p1 hashtree | -# +---------------------------------+-------------------+-------------------------+ -# | | | | -# 0 1MiB 1MiB + rootfs_size 1MiB + rootfs_size + hashtree_size - -# rootfs: erofs filesystem mounted at / (read-only) -# hashtree: dm-verity hashtree without superblock - -readonly MIB=1048576 - -in=$1 -out=$2 -tmpdir=$(mktemp -d) -trap 'rm -rf $tmpdir' EXIT -rootfs=$tmpdir/01_rootfs -hashtree=$tmpdir/02_verity_hashtree -dm_verity_file=$out/dm_verity.txt -roothash=$out/roothash -raw=$out/raw.img -uuid=c1b9d5a2-f162-11cf-9ece-0020afc76f16 -salt=0102030405060708090a0b0c0d0e0f - -if [ -z "${SOURCE_DATE_EPOCH}" ]; then - echo "SOURCE_DATE_EPOCH is not set" >&2 - exit 1 -fi - -mkdir -p "$out" - -# create the rootfs and pad it to 1MiB -mkfs.erofs \ - -z lz4 \ - -b 4096 \ - -T "$SOURCE_DATE_EPOCH" \ - -U "$uuid" \ - --tar=f \ - "$rootfs" \ - "$in" -truncate -s '%1MiB' "$rootfs" - -# create the dm-verity hashtree -verity_out=$( - veritysetup format \ - "$rootfs" \ - "$hashtree" \ - --data-block-size 4096 \ - 
--hash-block-size 4096 \ - --no-superblock \ - --uuid "$uuid" \ - --salt "$salt" | tee "$dm_verity_file" -) -# pad the hashtree to multiple of 1MiB -truncate -s '%1MiB' "$hashtree" -# extract dm-verity parameters from text output to individual files -sed -i 1d "$dm_verity_file" -root_hash=$(echo "$verity_out" | grep -oP 'Root hash:\s+\K\w+' | tr -d "[:space:]") -echo -n "$root_hash" >"$roothash" -hash_type=$(echo "$verity_out" | grep -oP 'Hash type:\s+\K\w+' | tr -d "[:space:]") -echo -n "$hash_type" >"$out/hash_type" -data_blocks=$(echo "$verity_out" | grep -oP 'Data blocks:\s+\K\w+' | tr -d "[:space:]") -echo -n "$data_blocks" >"$out/data_blocks" -data_block_size=$(echo "$verity_out" | grep -oP 'Data block size:\s+\K\w+' | tr -d "[:space:]") -echo -n "$data_block_size" >"$out/data_block_size" -hash_blocks=$(echo "$verity_out" | grep -oP 'Hash blocks:\s+\K\w+' | tr -d "[:space:]") -echo -n "$hash_blocks" >"$out/hash_blocks" -hash_block_size=$(echo "$verity_out" | grep -oP 'Hash block size:\s+\K\w+' | tr -d "[:space:]") -echo -n "$hash_block_size" >"$out/hash_block_size" -hash_algorithm=$(echo "$verity_out" | grep -oP 'Hash algorithm:\s+\K\w+' | tr -d "[:space:]") -echo -n "$hash_algorithm" >"$out/hash_algorithm" -echo -n "$salt" >"$out/salt" - -rootfs_size_mib=$(($(stat -c %s "$rootfs") / "$MIB")) -# full image size is dos header + rootfs + hashtree -hashtree_size_bytes=$(stat -c %s "$hashtree") -hashtree_size_mib=$(($(stat -c %s "$hashtree") / "$MIB")) -# img_size is the size of the full image in bytes -# DOS MBR (padded to 1MiB) + rootfs + hashtree -img_size_bytes=$(("$MIB" + "$rootfs_size_mib" * "$MIB" + "$hashtree_size_bytes")) - -# Where the rootfs starts in MiB -readonly rootfs_start=1 -# hash_start is the start of the hashtree in MiB -hash_start=$((rootfs_start + rootfs_size_mib)) -hash_end=$((hash_start + hashtree_size_mib)) - -rs=$(printf "%4dMiB" "$rootfs_start") -hs=$(printf "%4dMiB" "$hash_start") -he=$(printf "%4dMiB" "$hash_end") -cat < 
/build/rootfs/etc/kata-opa/default-policy.rego < $out/milan.hex ${lib.getExe python3Packages.sev-snp-measure} \ --mode snp \ @@ -43,11 +48,8 @@ stdenvNoCC.mkDerivation { --vcpus 1 \ --vcpu-type EPYC-Genoa \ --kernel ${kernel} \ - --append '${cmdline}' \ + --initrd ${initrd} \ + --append "${cmdline}" \ --output-format hex > $out/genoa.hex ''; - - passthru = { - inherit dmVerityArgs; - }; } diff --git a/packages/by-name/kata/tdx-launch-digests/package.nix b/packages/by-name/kata/tdx-launch-digests/package.nix index 9e3bb8dbf1..1916462fc7 100644 --- a/packages/by-name/kata/tdx-launch-digests/package.nix +++ b/packages/by-name/kata/tdx-launch-digests/package.nix @@ -11,25 +11,31 @@ debug ? false, }: let - image = kata.kata-image; - inherit (image) dmVerityArgs; - cmdlineBase = "tsc=reliable no_timer_check rcupdate.rcu_expedited=1 i8042.direct=1 i8042.dumbkbd=1 i8042.nopnp=1 i8042.noaux=1 noreplace-smp reboot=k cryptomgr.notests net.ifnames=0 pci=lastbus=0 root=/dev/vda1 rootflags=ro rootfstype=erofs console=hvc0 console=hvc1 quiet systemd.show_status=false panic=1 nr_cpus=1 selinux=0 systemd.unit=kata-containers.target systemd.mask=systemd-networkd.service systemd.mask=systemd-networkd.socket scsi_mod.scan=none"; - cmdlineBaseDebug = "tsc=reliable no_timer_check rcupdate.rcu_expedited=1 i8042.direct=1 i8042.dumbkbd=1 i8042.nopnp=1 i8042.noaux=1 noreplace-smp reboot=k cryptomgr.notests net.ifnames=0 pci=lastbus=0 root=/dev/vda1 rootflags=ro rootfstype=erofs console=hvc0 console=hvc1 debug systemd.show_status=true systemd.log_level=debug panic=1 nr_cpus=1 selinux=0 systemd.unit=kata-containers.target systemd.mask=systemd-networkd.service systemd.mask=systemd-networkd.socket scsi_mod.scan=none agent.log=debug agent.debug_console agent.debug_console_vport=1026"; - cmdline = "${if debug then cmdlineBaseDebug else cmdlineBase} ${dmVerityArgs}"; + ovmf-tdx = "${OVMF-TDX}/FV/OVMF.fd"; + kernel = "${kata.kata-image}/bzImage"; + initrd = "${kata.kata-image}/initrd"; + + # Kata 
uses a base command line and then appends the command line from the kata config (i.e. also our node-installer config). + # Thus, we need to perform the same steps when calculating the digest. + baseCmdline = if debug then kata.kata-runtime.cmdline.debug else kata.kata-runtime.cmdline.default; + cmdline = lib.strings.concatStringsSep " " [ + baseCmdline + kata.kata-image.cmdline + ]; in stdenvNoCC.mkDerivation { name = "tdx-launch-digests"; - inherit (image) version; + inherit (kata.kata-image) version; dontUnpack = true; buildPhase = '' mkdir $out - ${lib.getExe tdx-measure} mrtd -f ${OVMF-TDX}/FV/OVMF.fd > $out/mrtd.hex - ${lib.getExe tdx-measure} rtmr -f ${OVMF-TDX}/FV/OVMF.fd -k ${kata.kata-kernel-uvm}/bzImage -c '${cmdline}' 0 > $out/rtmr0.hex - ${lib.getExe tdx-measure} rtmr -f ${OVMF-TDX}/FV/OVMF.fd -k ${kata.kata-kernel-uvm}/bzImage -c '${cmdline}' 1 > $out/rtmr1.hex - ${lib.getExe tdx-measure} rtmr -f ${OVMF-TDX}/FV/OVMF.fd -k ${kata.kata-kernel-uvm}/bzImage -c '${cmdline}' 2 > $out/rtmr2.hex - ${lib.getExe tdx-measure} rtmr -f ${OVMF-TDX}/FV/OVMF.fd -k ${kata.kata-kernel-uvm}/bzImage -c '${cmdline}' 3 > $out/rtmr3.hex + ${lib.getExe tdx-measure} mrtd -f ${ovmf-tdx} > $out/mrtd.hex + ${lib.getExe tdx-measure} rtmr -f ${ovmf-tdx} -k ${kernel} -i ${initrd} -c '${cmdline}' 0 > $out/rtmr0.hex + ${lib.getExe tdx-measure} rtmr -f ${ovmf-tdx} -k ${kernel} -i ${initrd} -c '${cmdline}' 1 > $out/rtmr1.hex + ${lib.getExe tdx-measure} rtmr -f ${ovmf-tdx} -k ${kernel} -i ${initrd} -c '${cmdline}' 2 > $out/rtmr2.hex + ${lib.getExe tdx-measure} rtmr -f ${ovmf-tdx} -k ${kernel} -i ${initrd} -c '${cmdline}' 3 > $out/rtmr3.hex ''; } diff --git a/packages/by-name/mkNixosConfig/package.nix b/packages/by-name/mkNixosConfig/package.nix index 767761accb..e5b90e109d 100644 --- a/packages/by-name/mkNixosConfig/package.nix +++ b/packages/by-name/mkNixosConfig/package.nix @@ -45,7 +45,11 @@ lib.makeOverridable ( nvidia-ctk-with-config tdx-tools ; - inherit (outerPkgs.kata) 
kata-agent; + inherit (outerPkgs.kata) + kata-agent + kata-runtime + kata-kernel-uvm + ; }) ]; diff --git a/packages/by-name/qemu-tdx-static/0004-hw-x86-load-initrd-to-static-address.patch b/packages/by-name/qemu-tdx-static/0004-hw-x86-load-initrd-to-static-address.patch new file mode 100644 index 0000000000..6c26926949 --- /dev/null +++ b/packages/by-name/qemu-tdx-static/0004-hw-x86-load-initrd-to-static-address.patch @@ -0,0 +1,87 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Moritz Sanft <58110325+msanft@users.noreply.github.com> +Date: Thu, 21 Nov 2024 14:36:23 +0100 +Subject: [PATCH] hw/x86: load initrd to static address + +For TDX RTMRs to be predictable regardless of VM memory size, we need to +load the initrd to a static address, so no dynamic value ends up in the +mapped kernel image. + +Without setting this to a static address, the address the initrd is mapped to +depends not only on the size of the initrd, but also on the memory space +of the guest, which is not viable for Contrast's reference-value-based attestation +approach. + +As we control the minimum VM memory size in Contrast, we just load the initrd +to the address it gets loaded to for Contrast's minimum VM memory (2Gi), regardless +of if the VM has more memory. + +QEMU, by default, does a similar thing. +Consider the below line (cited from above): + + `initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1;` + +This adds an artificial upper bound of where the initrd can be loaded to, as +the calculation is based on the VM memory (below_4g_mem_size), but capped at 4Gi. +This means, the initrd, regardless of guest memory size, will always be loaded +at address 0x100000000 (4Gi) max (minus ACPI data size). + +Essentially, overwriting this to 0x80000000 (2Gi), we create an artificial lower *and* +upper bound (set to Contrast minimum TDX VM memory size). +This means that the initrd will *always* be loaded at 0x80000000 (2Gi), minus ACPI +data size.
The difference to QEMU's setting is that we *fix* the address, rather than +setting *only* an upper bound. + +This way, we get the initrd to *always* be loaded at a static address. + +Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com> +--- + hw/i386/x86.c | 35 +++++++++++++++++++++++++++++++++++ + 1 file changed, 35 insertions(+) + +diff --git a/hw/i386/x86.c b/hw/i386/x86.c +index 504575abfa98bc25e498e219a2d58d8d31e5feaa..0763462c16f4106d0aa6a46c2b9c360e36ae3e96 100644 +--- a/hw/i386/x86.c ++++ b/hw/i386/x86.c +@@ -953,6 +953,41 @@ void x86_load_linux(X86MachineState *x86ms, + initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1; + } + ++ /* ++ * For TDX RTMRs to be predictable regardless of VM memory size, we need to ++ * load the initrd to a static address, so no dynamic value ends up in the ++ * mapped kernel image. ++ * ++ * Without setting this to a static address, the address the initrd is mapped to ++ * depends not only on the size of the initrd, but also on the memory space ++ * of the guest, which is not viable for Contrast's reference-value-based attestation ++ * approach. ++ * ++ * As we control the minimum VM memory size in Contrast, we just load the initrd ++ * to the address it gets loaded to for Contrast's minimum VM memory (2Gi), regardless ++ * of if the VM has more memory. ++ * ++ * QEMU, by default, does a similar thing. ++ * Consider the below line (cited from above): ++ * ++ * initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1; ++ * ++ * This adds an artificial upper bound of where the initrd can be loaded to, as ++ * the calculation is based on the VM memory (below_4g_mem_size), but capped at 4Gi. ++ * This means, the initrd, regardless of guest memory size, will always be loaded ++ * at address 0x100000000 (4Gi) max (minus ACPI data size). ++ * ++ * Essentially, overwriting this to 0x80000000 (2Gi), we create an artificial lower *and* ++ * upper bound (set to Contrast minimum TDX VM memory size).
++ * This means that the initrd will *always* be loaded at 0x80000000 (2Gi), minus ACPI ++ * data size. The difference to QEMU's setting is, that we *fix* the address, rather than ++ * setting *only* an upper bound. ++ * ++ * This way, we get the initrd to *always* be loaded at a static address. ++ */ ++ uint32_t contrast_min_memory = 0x80000000; // 2Gi ++ initrd_max = contrast_min_memory - acpi_data_size - 1; ++ + fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_ADDR, cmdline_addr); + fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, strlen(kernel_cmdline) + 1); + fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline); diff --git a/packages/by-name/qemu-tdx-static/package.nix b/packages/by-name/qemu-tdx-static/package.nix index 521cf11487..58af1704fa 100644 --- a/packages/by-name/qemu-tdx-static/package.nix +++ b/packages/by-name/qemu-tdx-static/package.nix @@ -61,5 +61,7 @@ in # Make the generated ACPI tables more deterministic, so that we get a # fixed hash for attestation. ./0003-i386-omit-some-unneeded-ACPI-tables.patch + # Load the initrd to a static address to make RTMRs predictable. 
+ ./0004-hw-x86-load-initrd-to-static-address.patch ]; }) diff --git a/packages/nixos/azure.nix b/packages/nixos/azure.nix index 4f2a70a94f..6da4f97e47 100644 --- a/packages/nixos/azure.nix +++ b/packages/nixos/azure.nix @@ -55,6 +55,8 @@ in }; config = lib.mkIf cfg.enable { + boot.kernelPackages = pkgs.recurseIntoAttrs (pkgs.linuxPackagesFor pkgs.kernel-podvm-azure); + boot.initrd = { kernelModules = [ "hv_storvsc" diff --git a/packages/nixos/debug.nix b/packages/nixos/debug.nix index 38958f87f8..19cfac9b0f 100644 --- a/packages/nixos/debug.nix +++ b/packages/nixos/debug.nix @@ -32,7 +32,16 @@ in services.getty.autologinUser = "root"; - boot.kernelParams = [ "console=ttyS0" ]; + # If the image is to be booted locally for testing purposes through + # .#boot-image, or if the image is booted on Peer-pods (on Azure), the + # console should be ttyS0, as this is what Azure and QEMU expose by default to the + # user for reading. However, when one builds a Kata image (e.g. for bare-metal), setting + # console=ttyS0 will break the VM logging (i.e. Kata's "reading guest console"), as this + # only listens on hvc* TTYs. As we have no indicator of whether an image should be booted locally, + # we only set console=ttyS0 when the image is a debug-image and on Peer-pods. So for local + # booting of an image, one needs to remove the `lib.optionals` condition manually. + boot.kernelParams = lib.optionals config.contrast.peerpods.enable [ "console=ttyS0" ]; + boot.initrd.systemd.emergencyAccess = true; systemd.enableEmergencyMode = true; }; diff --git a/packages/nixos/image.nix b/packages/nixos/image.nix index 192102e5cc..caf834473a 100644 --- a/packages/nixos/image.nix +++ b/packages/nixos/image.nix @@ -1,62 +1,80 @@ # Copyright 2024 Edgeless Systems GmbH # SPDX-License-Identifier: AGPL-3.0-only -{ config, pkgs, ... }: +{ + config, + pkgs, + lib, + ... 
+}: + +let + cfg = config.contrast.image; +in { - # We build the image with systemd-repart, which integrates well - # with the systemd utilities we use for dm-verity, UKI, etc. - # However, we do not use the repart unit, as we don't want - # dynamic repartitioning at run- / boot-time. - image.repart = { - name = "image-podvm-gpu"; - version = "1-rc1"; + options.contrast.image = { + microVM = lib.mkEnableOption "Build a micro VM image"; + }; - # This defines the actual partition layout. - partitions = { - # EFI System Partition, holds the UKI. - "00-esp" = { - contents = { - "/".source = pkgs.runCommand "esp-contents" { } '' - mkdir -p $out/EFI/BOOT - cp ${config.system.build.uki}/${config.system.boot.loader.ukiFile} $out/EFI/BOOT/BOOTX64.EFI - ''; - }; - repartConfig = { - Type = "esp"; - Format = "vfat"; - SizeMinBytes = "64M"; - UUID = "null"; # Fix partition UUID for reproducibility. - }; - }; + config = { + system.image.version = "1-rc1"; + + # We build the image with systemd-repart, which integrates well + # with the systemd utilities we use for dm-verity, UKI, etc. + # However, we do not use the repart unit, as we don't want + # dynamic repartitioning at run- / boot-time. + image.repart = { + name = "image-podvm-gpu"; + inherit (config.system.image) version; - # Root filesystem. - "10-root" = { - contents = { - "/pause_bundle".source = "${pkgs.pause-bundle}/pause_bundle"; + # This defines the actual partition layout. + partitions = { + # EFI System Partition, holds the UKI. + # Only build this partition if we need a bootable image (i.e. not a micro VM). + "00-esp" = lib.mkIf (!cfg.microVM) { + contents = { + "/".source = pkgs.runCommand "esp-contents" { } '' + mkdir -p $out/EFI/BOOT + cp ${config.system.build.uki}/${config.system.boot.loader.ukiFile} $out/EFI/BOOT/BOOTX64.EFI + ''; + }; + repartConfig = { + Type = "esp"; + Format = "vfat"; + SizeMinBytes = "64M"; + UUID = "null"; # Fix partition UUID for reproducibility. 
+ }; }; - storePaths = [ config.system.build.toplevel ]; - repartConfig = { - Type = "root"; - Format = "erofs"; - Label = "root"; - Verity = "data"; - VerityMatchKey = "root"; - Minimize = "best"; - # We need to ensure that mountpoints are available. - # TODO (Maybe): This could be done more elegantly with CopyFiles and a skeleton tree in the vcs. - MakeDirectories = "/bin /boot /dev /etc /home /lib /lib64 /mnt /nix /opt /proc /root /run /srv /sys /tmp /usr/bin /var"; + + # Root filesystem. + "10-root" = { + contents = { + "/pause_bundle".source = "${pkgs.pause-bundle}/pause_bundle"; + }; + storePaths = [ config.system.build.toplevel ]; + repartConfig = { + Type = "root"; + Format = "erofs"; + Label = "root"; + Verity = "data"; + VerityMatchKey = "root"; + Minimize = "best"; + # We need to ensure that mountpoints are available. + # TODO (Maybe): This could be done more elegantly with CopyFiles and a skeleton tree in the vcs. + MakeDirectories = "/bin /boot /dev /etc /home /lib /lib64 /mnt /nix /opt /proc /root /run /srv /sys /tmp /usr/bin /var"; + }; }; - }; - # Verity hashes for the root filesystem. - "20-root-verity" = { - repartConfig = { - Type = "root-verity"; - Label = "root-verity"; - Verity = "hash"; - VerityMatchKey = "root"; - Minimize = "best"; + # Verity hashes for the root filesystem. + "20-root-verity" = { + repartConfig = { + Type = "root-verity"; + Label = "root-verity"; + Verity = "hash"; + VerityMatchKey = "root"; + Minimize = "best"; + }; }; }; }; diff --git a/packages/nixos/kata.nix b/packages/nixos/kata.nix index d31681f850..fa85326bff 100644 --- a/packages/nixos/kata.nix +++ b/packages/nixos/kata.nix @@ -1,90 +1,98 @@ # Copyright 2024 Edgeless Systems GmbH # SPDX-License-Identifier: AGPL-3.0-only -{ lib, pkgs, ... 
}: - { - systemd.services.kata-agent = { - description = "Kata Containers Agent"; - documentation = [ - "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/kata-agent.service" - ]; - bindsTo = [ "netns@podns.service" ]; - wants = [ "process-user-data.service" ]; - after = [ - "netns@podns.service" - "process-user-data.service" - ]; - wantedBy = [ "multi-user.target" ]; - serviceConfig = { - Type = "exec"; # Not upstream. - ExecStartPre = [ "${pkgs.coreutils}/bin/mkdir -p /run/kata-containers" ]; - ExecStart = "${lib.getExe pkgs.kata-agent} --config /run/peerpod/agent-config.toml"; - ExecStopPost = "${lib.getExe pkgs.cloud-api-adaptor.kata-agent-clean} --config /run/peerpod/agent-config.toml"; - SyslogIdentifier = "kata-agent"; - }; - environment = { - KATA_AGENT_LOG_LEVEL = "debug"; - OCICRYPT_KEYPROVIDER_CONFIG = builtins.toFile "policy.json" ( - lib.strings.toJSON { default = [ { type = "insecureAcceptAnything"; } ]; } - ); - }; + config, + lib, + pkgs, + ... 
+}: +let + cfg = config.contrast.kata; +in +{ + options.contrast.kata = { + enable = lib.mkEnableOption "Enable Kata (non-peerpod) support"; }; - systemd.services.agent-protocol-forwarder = { - description = "Agent Protocol Forwarder"; - documentation = [ - "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/agent-protocol-forwarder.service" - ]; - wants = [ "kata-agent.service" ]; - after = [ "kata-agent.service" ]; - wantedBy = [ "multi-user.target" ]; - unitConfig = { - DefaultDependencies = false; - }; - serviceConfig = { - Type = "notify"; - ExecStart = lib.strings.concatStringsSep " " [ - "${pkgs.cloud-api-adaptor}/bin/agent-protocol-forwarder" - "-kata-agent-namespace /run/netns/podns" - "-kata-agent-socket /run/kata-containers/agent.sock" + config = lib.mkIf cfg.enable { + # https://github.com/kata-containers/kata-containers/blob/3.10.1/src/agent/kata-containers.target + systemd.targets.kata-containers = { + description = "Kata Containers Agent Target"; + requires = [ + "basic.target" + "tmp.mount" + "kata-agent.service" ]; - Restart = "on-failure"; - RestartSec = "5s"; + wantedBy = [ "basic.target" ]; + wants = [ + "chronyd.service" + # https://github.com/kata-containers/kata-containers/blob/5869046d04553c3bd2f16fa1cfb714133050e537/tools/osbuilder/rootfs-builder/rootfs.sh#L712 + "dbus.socket" + ]; + conflicts = [ + "rescue.service" + "rescue.target" + ]; + after = [ + "basic.target" + "rescue.service" + "rescue.target" + ]; + unitConfig.AllowIsolate = true; }; - }; - systemd.services.process-user-data = { - description = "Pull configuration from metadata service"; - documentation = [ - "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/process-user-data.service" - ]; - wants = [ "network-online.target" ]; - after = [ "network-online.target" ]; - wantedBy = [ "multi-user.target" ]; - unitConfig = { - 
DefaultDependencies = false; + # https://github.com/kata-containers/kata-containers/blob/3.10.1/src/agent/kata-agent.service.in + systemd.services.kata-agent = { + description = "Kata Containers Agent"; + documentation = [ "https://github.com/kata-containers/kata-containers" ]; + wants = [ "kata-containers.target" ]; + after = [ "systemd-tmpfiles-setup.service" ]; # Not upstream, but required for /etc/resolv.conf bind mount. + serviceConfig = { + Type = "exec"; # Not upstream. + StandardOutput = "tty"; + ExecStart = "${lib.getExe pkgs.kata-agent}"; + LimitNOFILE = 1048576; + ExecStop = "${pkgs.coreutils}/bin/sync ; ${config.systemd.package}/bin/systemctl --force poweroff"; + FailureAction = "poweroff"; + OOMScoreAdjust = -997; + }; + # Not upstream + environment = { + KATA_AGENT_LOG_LEVEL = "debug"; + OCICRYPT_KEYPROVIDER_CONFIG = builtins.toFile "policy.json" ( + lib.strings.toJSON { default = [ { type = "insecureAcceptAnything"; } ]; } + ); + }; }; - serviceConfig = { - Type = "oneshot"; - ExecStart = "${pkgs.cloud-api-adaptor}/bin/process-user-data provision-files"; - RemainAfterExit = true; + + fileSystems."/run" = { + fsType = "tmpfs"; + options = [ + "nodev" + "nosuid" + "size=50%" + ]; + neededForBoot = true; }; - }; - systemd.services."netns@" = { - description = "Create a network namespace for pod networking"; - documentation = [ - "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/netns%40.service" - ]; - serviceConfig = { - Type = "oneshot"; - RemainAfterExit = true; - ExecStartPre = "${pkgs.iproute2}/bin/ip netns add %I"; - ExecStart = "${pkgs.iproute2}/bin/ip netns exec %I ${pkgs.iproute2}/bin/ip link set lo up"; - ExecStop = "${pkgs.iproute2}/bin/ip netns del %I"; + # Not used directly, but required for kernel-specific driver builds. + boot.kernelPackages = pkgs.recurseIntoAttrs (pkgs.linuxPackagesFor pkgs.kata-kernel-uvm); + + boot.initrd = { + # Don't require TPM2 support. 
(additional modules) + systemd.tpm2.enable = false; + # Don't require any of the hardware modules NixOS includes by default. + includeDefaultModules = false; + }; + + networking.resolvconf.enable = false; + systemd.tmpfiles.settings."10-etc-resolvconf"."/etc/resolv.conf".f = { + group = "root"; + mode = "0755"; + user = "root"; }; - }; - environment.etc."kata-opa/default-policy.rego".source = pkgs.cloud-api-adaptor.default-policy; + environment.etc."kata-opa/default-policy.rego".source = "${pkgs.kata-runtime.src}/src/kata-opa/allow-set-policy.rego"; + }; } diff --git a/packages/nixos/peerpods.nix b/packages/nixos/peerpods.nix new file mode 100644 index 0000000000..116768e519 --- /dev/null +++ b/packages/nixos/peerpods.nix @@ -0,0 +1,103 @@ +# Copyright 2024 Edgeless Systems GmbH +# SPDX-License-Identifier: AGPL-3.0-only + +{ + config, + lib, + pkgs, + ... +}: +let + cfg = config.contrast.peerpods; +in +{ + options.contrast.peerpods = { + enable = lib.mkEnableOption "Enable peer pods support"; + }; + + config = lib.mkIf cfg.enable { + systemd.services.kata-agent = { + description = "Kata Containers Agent"; + documentation = [ + "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/kata-agent.service" + ]; + bindsTo = [ "netns@podns.service" ]; + wants = [ "process-user-data.service" ]; + after = [ + "netns@podns.service" + "process-user-data.service" + ]; + wantedBy = [ "multi-user.target" ]; + serviceConfig = { + Type = "exec"; # Not upstream. 
+ ExecStartPre = [ "${pkgs.coreutils}/bin/mkdir -p /run/kata-containers" ]; + ExecStart = "${lib.getExe pkgs.kata-agent} --config /run/peerpod/agent-config.toml"; + ExecStopPost = "${lib.getExe pkgs.cloud-api-adaptor.kata-agent-clean} --config /run/peerpod/agent-config.toml"; + SyslogIdentifier = "kata-agent"; + }; + environment = { + KATA_AGENT_LOG_LEVEL = "debug"; + OCICRYPT_KEYPROVIDER_CONFIG = builtins.toFile "policy.json" ( + lib.strings.toJSON { default = [ { type = "insecureAcceptAnything"; } ]; } + ); + }; + }; + + systemd.services.agent-protocol-forwarder = { + description = "Agent Protocol Forwarder"; + documentation = [ + "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/agent-protocol-forwarder.service" + ]; + wants = [ "kata-agent.service" ]; + after = [ "kata-agent.service" ]; + wantedBy = [ "multi-user.target" ]; + unitConfig = { + DefaultDependencies = false; + }; + serviceConfig = { + Type = "notify"; + ExecStart = lib.strings.concatStringsSep " " [ + "${pkgs.cloud-api-adaptor}/bin/agent-protocol-forwarder" + "-kata-agent-namespace /run/netns/podns" + "-kata-agent-socket /run/kata-containers/agent.sock" + ]; + Restart = "on-failure"; + RestartSec = "5s"; + }; + }; + + systemd.services.process-user-data = { + description = "Pull configuration from metadata service"; + documentation = [ + "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/process-user-data.service" + ]; + wants = [ "network-online.target" ]; + after = [ "network-online.target" ]; + wantedBy = [ "multi-user.target" ]; + unitConfig = { + DefaultDependencies = false; + }; + serviceConfig = { + Type = "oneshot"; + ExecStart = "${pkgs.cloud-api-adaptor}/bin/process-user-data provision-files"; + RemainAfterExit = true; + }; + }; + + systemd.services."netns@" = { + description = "Create a network namespace for pod networking"; + documentation = 
[ + "https://github.com/confidential-containers/cloud-api-adaptor/blob/main/src/cloud-api-adaptor/podvm/files/etc/systemd/system/netns%40.service" + ]; + serviceConfig = { + Type = "oneshot"; + RemainAfterExit = true; + ExecStartPre = "${pkgs.iproute2}/bin/ip netns add %I"; + ExecStart = "${pkgs.iproute2}/bin/ip netns exec %I ${pkgs.iproute2}/bin/ip link set lo up"; + ExecStop = "${pkgs.iproute2}/bin/ip netns del %I"; + }; + }; + + environment.etc."kata-opa/default-policy.rego".source = pkgs.cloud-api-adaptor.default-policy; + }; +} diff --git a/packages/nixos/system.nix b/packages/nixos/system.nix index d11a336acc..553accc304 100644 --- a/packages/nixos/system.nix +++ b/packages/nixos/system.nix @@ -4,20 +4,24 @@ { config, lib, - pkgs, ... }: { boot.loader.grub.enable = false; - boot.kernelPackages = pkgs.recurseIntoAttrs (pkgs.linuxPackagesFor pkgs.kernel-podvm-azure); boot.kernelParams = [ "systemd.verity=yes" "selinux=0" ]; - boot.supportedFilesystems = [ "erofs" ]; + boot.supportedFilesystems = [ + "erofs" + "vfat" + ]; boot.initrd = { - supportedFilesystems = [ "erofs" ]; + supportedFilesystems = [ + "erofs" + "vfat" + ]; availableKernelModules = [ "dm_mod" "dm_verity" diff --git a/tools/tdx-measure/main.go b/tools/tdx-measure/main.go index b43f4db9d9..50f3bdb6a5 100644 --- a/tools/tdx-measure/main.go +++ b/tools/tdx-measure/main.go @@ -106,6 +106,10 @@ func newRtMrCmd() *cobra.Command { if err := cmd.MarkFlagFilename("kernel"); err != nil { panic(err) } + cmd.Flags().StringP("initrd", "i", "initrd.zst", "path to initrd file") + if err := cmd.MarkFlagFilename("initrd"); err != nil { + panic(err) + } cmd.Flags().StringP("cmdline", "c", "", "kernel command line") return cmd } @@ -136,8 +140,15 @@ func runRtMr(cmd *cobra.Command, args []string) error { if err != nil { return fmt.Errorf("can't read kernel file: %w", err) } - - digest, err = rtmr.CalcRtmr1(kernel) + initrdPath, err := cmd.Flags().GetString("initrd") + if err != nil { + return err + } + initrd, err 
:= os.ReadFile(initrdPath) + if err != nil { + return fmt.Errorf("can't read initrd file: %w", err) + } + digest, err = rtmr.CalcRtmr1(kernel, initrd) if err != nil { return fmt.Errorf("can't calculate RTMR 1: %w", err) } @@ -146,7 +157,15 @@ func runRtMr(cmd *cobra.Command, args []string) error { if err != nil { return err } - digest, err = rtmr.CalcRtmr2(cmdLine) + initrdPath, err := cmd.Flags().GetString("initrd") + if err != nil { + return err + } + initrd, err := os.ReadFile(initrdPath) + if err != nil { + return fmt.Errorf("can't read initrd file: %w", err) + } + digest, err = rtmr.CalcRtmr2(cmdLine, initrd) if err != nil { return fmt.Errorf("can't calculate RTMR 2: %w", err) } diff --git a/tools/tdx-measure/rtmr/rtmr.go b/tools/tdx-measure/rtmr/rtmr.go index a123f29d9a..a6a3fb71a9 100644 --- a/tools/tdx-measure/rtmr/rtmr.go +++ b/tools/tdx-measure/rtmr/rtmr.go @@ -243,23 +243,33 @@ func CalcRtmr0(firmware []byte) ([48]byte, error) { } // CalcRtmr1 calculates RTMR[1] for the given kernel. 
-func CalcRtmr1(kernelFile []byte) ([48]byte, error) { +func CalcRtmr1(kernelFile, initrdFile []byte) ([48]byte, error) { var rtmr Rtmr - kernelHashContent, err := hashKernel(kernelFile) + + kernelHashContent, err := hashKernel(kernelFile, initrdFile) if err != nil { return [48]byte{}, fmt.Errorf("can't hash kernel: %w", err) } rtmr.hashAndExtend(kernelHashContent) + + // https://github.com/tianocore/edk2/blob/0f3867fa6ef0553e26c42f7d71ff6bdb98429742/OvmfPkg/Tcg/TdTcg2Dxe/TdTcg2Dxe.c#L2155 rtmr.hashAndExtend([]byte("Calling EFI Application from Boot Option")) + // https://github.com/tianocore/edk2/blob/0f3867fa6ef0553e26c42f7d71ff6bdb98429742/OvmfPkg/Tcg/TdTcg2Dxe/TdTcg2Dxe.c#L2243 rtmr.hashAndExtend([]byte("Exit Boot Services Invocation")) + // https://github.com/tianocore/edk2/blob/0f3867fa6ef0553e26c42f7d71ff6bdb98429742/OvmfPkg/Tcg/TdTcg2Dxe/TdTcg2Dxe.c#L2254 rtmr.hashAndExtend([]byte("Exit Boot Services Returned with Success")) return rtmr.Get(), nil } -// CalcRtmr2 calculates RTMR[2] for the given kernel command line. -func CalcRtmr2(cmdLine string) ([48]byte, error) { +// CalcRtmr2 calculates RTMR[2] for the given kernel command line and initrd. +func CalcRtmr2(cmdLine string, initrdFile []byte) ([48]byte, error) { var rtmr Rtmr + // TODO(msanft): find out which component silently adds this string to the commandline. 
+ // Suspects: QEMU-TDX, OVMF-TDX, Linux EFI Stub + cmdLine += " initrd=initrd" + + // https://elixir.bootlin.com/linux/v6.11.8/source/drivers/firmware/efi/libstub/efi-stub-helper.c#L342 codepoints := utf16.Encode([]rune(cmdLine)) bytes := make([]byte, (len(codepoints)+1)*2) for i, codepoint := range codepoints { @@ -267,11 +277,14 @@ func CalcRtmr2(cmdLine string) ([48]byte, error) { } rtmr.hashAndExtend(bytes) + // https://elixir.bootlin.com/linux/v6.11.8/source/drivers/firmware/efi/libstub/efi-stub-helper.c#L625 + rtmr.hashAndExtend(initrdFile) + return rtmr.Get(), nil } -func hashKernel(kernelFile []byte) ([]byte, error) { - patchKernel(kernelFile) +func hashKernel(kernelFile, initrdFile []byte) ([]byte, error) { + patchKernel(kernelFile, initrdFile) kernel, err := authenticode.Parse(bytes.NewReader(kernelFile)) if err != nil { @@ -281,7 +294,7 @@ func hashKernel(kernelFile []byte) ([]byte, error) { return kernel.HashContent.Bytes(), nil } -func patchKernel(kernelFile []byte) { +func patchKernel(kernelFile, initrdFile []byte) { // QEMU patches some header bytes in the kernel before loading it into memory. // Sources: // - https://gitlab.com/qemu-project/qemu/-/blob/28ae3179fc52d2e4d870b635c4a412aab99759e7/hw/i386/x86-common.c#L837 @@ -299,4 +312,19 @@ func patchKernel(kernelFile []byte) { kernelFile[0x229] = 0x00 kernelFile[0x22A] = 0x02 kernelFile[0x22B] = 0x00 + + // https://github.com/qemu/qemu/blob/f48c205fb42be48e2e47b7e1cd9a2802e5ca17b0/hw/i386/x86.c#L1036 + // Maximum address for the initrd as calculated by QEMU. Normally, this would be dependent on the VM + // memory size, but we have a QEMU patch that fixes this value to make RTMR1 reproducible. + // Our QEMU patch has a commented-out line to print this value upon start, so it's easy to find + // when updating QEMU, as the value might change on QEMU updates.
+ initrdMax := 0x7ffd7fff + initrdSize := len(initrdFile) + initrdAddr := (initrdMax - initrdSize) & ^4095 + + // https://github.com/qemu/qemu/blob/f48c205fb42be48e2e47b7e1cd9a2802e5ca17b0/hw/i386/x86.c#L1044 + binary.LittleEndian.PutUint32(kernelFile[0x218:][:4], uint32(initrdAddr)) + + // https://github.com/qemu/qemu/blob/f48c205fb42be48e2e47b7e1cd9a2802e5ca17b0/hw/i386/x86.c#L1045 + binary.LittleEndian.PutUint32(kernelFile[0x21C:][:4], uint32(initrdSize)) }