Skip to content

Commit

Permalink
Add K3s-QEMU-SNP-GPU platform
Browse files Browse the repository at this point in the history
This adds a new `K3s-QEMU-SNP-GPU` platform for testing GPU-specific features on our on-prem infrastructure that uses this setup. It intentionally does not yet add any platform-specific behavior for when that platform is selected.
  • Loading branch information
msanft committed Dec 19, 2024
1 parent 6c6fbaf commit e535a27
Show file tree
Hide file tree
Showing 14 changed files with 34 additions and 22 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/e2e_manual.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ jobs:
echo "runner=ubuntu-22.04" >> "$GITHUB_OUTPUT"
echo "self-hosted=false" >> "$GITHUB_OUTPUT"
;;
"K3s-QEMU-SNP")
"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU")
echo "runner=SNP" >> "$GITHUB_OUTPUT"
echo "self-hosted=true" >> "$GITHUB_OUTPUT"
;;
Expand Down
2 changes: 1 addition & 1 deletion cli/genpolicy/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ func NewConfig(platform platforms.Platform) *Config {
Settings: aksSettings,
Bin: aksGenpolicyBin,
}
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUTDX, platforms.RKE2QEMUTDX:
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU, platforms.K3sQEMUTDX, platforms.RKE2QEMUTDX:
return &Config{
Rules: kataRules,
Settings: kataSettings,
Expand Down
2 changes: 1 addition & 1 deletion cli/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ func buildVersionString() (string, error) {
switch platform {
case platforms.AKSCloudHypervisorSNP:
fmt.Fprintf(versionsWriter, "\tgenpolicy version:\t%s\n", constants.MicrosoftGenpolicyVersion)
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUTDX, platforms.RKE2QEMUTDX:
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUTDX, platforms.K3sQEMUSNPGPU, platforms.RKE2QEMUTDX:
fmt.Fprintf(versionsWriter, "\tgenpolicy version:\t%s\n", constants.KataGenpolicyVersion)
}
}
Expand Down
4 changes: 2 additions & 2 deletions e2e/internal/contrasttest/contrasttest.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ func (ct *ContrastTest) patchReferenceValues(t *testing.T, platform platforms.Pl
SNPVersion: toPtr(manifest.SVN(255)),
MicrocodeVersion: toPtr(manifest.SVN(255)),
}
case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP:
case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU:
// The generate command doesn't fill in all required fields when
// generating a manifest for baremetal SNP. Do that now.
for i, snp := range m.ReferenceValues.SNP {
Expand Down Expand Up @@ -372,7 +372,7 @@ func (ct *ContrastTest) FactorPlatformTimeout(timeout time.Duration) time.Durati
switch ct.Platform {
case platforms.AKSCloudHypervisorSNP: // AKS defined is the baseline
return timeout
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUTDX, platforms.RKE2QEMUTDX:
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU, platforms.K3sQEMUTDX, platforms.RKE2QEMUTDX:
return 2 * timeout
default:
return timeout
Expand Down
2 changes: 1 addition & 1 deletion internal/kuberesource/parts.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ func NodeInstaller(namespace string, platform platforms.Platform) (*NodeInstalle
WithType(corev1.HostPathDirectory),
))
snapshotterVolumes = nydusSnapshotterVolumes
case platforms.K3sQEMUTDX, platforms.K3sQEMUSNP, platforms.RKE2QEMUTDX:
case platforms.K3sQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU, platforms.RKE2QEMUTDX:
nodeInstallerImageURL = "ghcr.io/edgelesssys/contrast/node-installer-kata:latest"
snapshotter = nydusSnapshotter
nydusSnapshotterVolumes = append(nydusSnapshotterVolumes, Volume().
Expand Down
4 changes: 2 additions & 2 deletions internal/manifest/referencevalues.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,11 +143,11 @@ func platformFromHandler(handler string) (platforms.Platform, error) {
}

parts := strings.Split(rest, "-")
if len(parts) != 4 {
if len(parts) != 4 && len(parts) != 5 {
return platforms.Unknown, fmt.Errorf("invalid handler name: %s", handler)
}

rawPlatform := fmt.Sprintf("%s-%s-%s", parts[0], parts[1], parts[2])
rawPlatform := strings.Join(parts[:len(parts)-1], "-")

platform, err := platforms.FromString(rawPlatform)
if err != nil {
Expand Down
8 changes: 7 additions & 1 deletion internal/platforms/platforms.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,13 @@ const (
MetalQEMUSNP
// MetalQEMUTDX is the generic platform for bare-metal TDX deployments.
MetalQEMUTDX
// K3sQEMUSNPGPU represents a deployment with QEMU on bare-metal SNP K3s with GPU passthrough.
K3sQEMUSNPGPU
)

// All returns a list of all available platforms.
func All() []Platform {
return []Platform{AKSCloudHypervisorSNP, K3sQEMUTDX, K3sQEMUSNP, RKE2QEMUTDX, MetalQEMUSNP, MetalQEMUTDX}
return []Platform{AKSCloudHypervisorSNP, K3sQEMUTDX, K3sQEMUSNP, RKE2QEMUTDX, MetalQEMUSNP, MetalQEMUTDX, K3sQEMUSNPGPU}
}

// AllStrings returns a list of all available platforms as strings.
Expand All @@ -53,6 +55,8 @@ func (p Platform) String() string {
return "K3s-QEMU-TDX"
case K3sQEMUSNP:
return "K3s-QEMU-SNP"
case K3sQEMUSNPGPU:
return "K3s-QEMU-SNP-GPU"
case RKE2QEMUTDX:
return "RKE2-QEMU-TDX"
case MetalQEMUSNP:
Expand All @@ -73,6 +77,8 @@ func FromString(s string) (Platform, error) {
return K3sQEMUTDX, nil
case "k3s-qemu-snp":
return K3sQEMUSNP, nil
case "k3s-qemu-snp-gpu":
return K3sQEMUSNPGPU, nil
case "rke2-qemu-tdx":
return RKE2QEMUTDX, nil
case "metal-qemu-snp":
Expand Down
12 changes: 6 additions & 6 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ node-installer platform=default_platform:
just push "tardev-snapshotter"
just push "node-installer-microsoft"
;;
"Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"K3s-QEMU-SNP"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
"Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
just push "nydus-snapshotter"
just push "node-installer-kata"
;;
Expand Down Expand Up @@ -186,7 +186,7 @@ create-pre platform=default_platform:
# TODO(burgerdev): this should create the resource group for consistency
:
;;
"Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"K3s-QEMU-SNP"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
"Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
:
;;
"AKS-PEER-SNP")
Expand Down Expand Up @@ -215,7 +215,7 @@ create platform=default_platform:
"AKS-CLH-SNP")
nix run -L .#scripts.create-coco-aks -- --name="$azure_resource_group" --location="$azure_location"
;;
"Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"K3s-QEMU-SNP"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
"Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
:
;;
"AKS-PEER-SNP")
Expand Down Expand Up @@ -328,7 +328,7 @@ get-credentials platform=default_platform:
"K3s-QEMU-TDX")
nix run -L .#scripts.get-credentials "projects/796962942582/secrets/m50-ganondorf-kubeconf/versions/5"
;;
"K3s-QEMU-SNP")
"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU")
nix run -L .#scripts.get-credentials "projects/796962942582/secrets/discovery-kubeconf/versions/2"
;;
*)
Expand All @@ -352,7 +352,7 @@ destroy platform=default_platform:
"AKS-CLH-SNP")
nix run -L .#scripts.destroy-coco-aks -- --name="$azure_resource_group"
;;
"K3s-QEMU-SNP"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
:
;;
"AKS-PEER-SNP")
Expand All @@ -377,7 +377,7 @@ destroy-post platform=default_platform:
# TODO(burgerdev): this should destroy the resource group for consistency.
:
;;
"K3s-QEMU-SNP"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
:
;;
"AKS-PEER-SNP")
Expand Down
2 changes: 1 addition & 1 deletion nodeinstaller/internal/config/kata_runtime_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ func TestKataConfig(t *testing.T) {
assert.Contains(string(configBytes), "[Runtime]")

switch platform {
case platforms.K3sQEMUSNP, platforms.K3sQEMUTDX, platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.RKE2QEMUTDX:
case platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU, platforms.K3sQEMUTDX, platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.RKE2QEMUTDX:
assert.Contains(string(configBytes), "[Hypervisor.qemu]")
case platforms.AKSCloudHypervisorSNP:
assert.Contains(string(configBytes), "[Hypervisor.clh]")
Expand Down
4 changes: 2 additions & 2 deletions nodeinstaller/internal/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ func KataRuntimeConfig(baseDir string, platform platforms.Platform, qemuExtraKer
if debug {
config.Hypervisor["qemu"]["enable_debug"] = true
}
case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP:
case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU:
if err := toml.Unmarshal([]byte(kataBareMetalQEMUSNPBaseConfig), &config); err != nil {
return nil, fmt.Errorf("failed to unmarshal kata runtime configuration: %w", err)
}
Expand Down Expand Up @@ -133,7 +133,7 @@ func ContainerdRuntimeConfigFragment(baseDir, snapshotter string, platform platf
cfg.Options = map[string]any{
"ConfigPath": filepath.Join(baseDir, "etc", "configuration-qemu-tdx.toml"),
}
case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP:
case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU:
cfg.Options = map[string]any{
"ConfigPath": filepath.Join(baseDir, "etc", "configuration-qemu-snp.toml"),
}
Expand Down
6 changes: 3 additions & 3 deletions nodeinstaller/node-installer.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ func run(ctx context.Context, fetcher assetFetcher, platform platforms.Platform,
case platforms.MetalQEMUTDX:
kataConfigPath = filepath.Join(kataConfigPath, "configuration-qemu-tdx.toml")
containerdConfigPath = filepath.Join(hostMount, "etc", "containerd", "config.toml")
case platforms.K3sQEMUSNP:
case platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU:
kataConfigPath = filepath.Join(kataConfigPath, "configuration-qemu-snp.toml")
containerdConfigPath = filepath.Join(hostMount, "var", "lib", "rancher", "k3s", "agent", "etc", "containerd", "config.toml.tmpl")
case platforms.K3sQEMUTDX:
Expand Down Expand Up @@ -147,7 +147,7 @@ func run(ctx context.Context, fetcher assetFetcher, platform platforms.Platform,
switch platform {
case platforms.AKSCloudHypervisorSNP, platforms.MetalQEMUSNP, platforms.MetalQEMUTDX:
return restartHostContainerd(containerdConfigPath, "containerd")
case platforms.K3sQEMUTDX, platforms.K3sQEMUSNP:
case platforms.K3sQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU:
if hostServiceExists("k3s") {
return restartHostContainerd(containerdConfigPath, "k3s")
} else if hostServiceExists("k3s-agent") {
Expand Down Expand Up @@ -212,7 +212,7 @@ func patchContainerdConfig(runtimeHandler, basePath, configPath string, platform
case platforms.AKSCloudHypervisorSNP:
snapshotterName = fmt.Sprintf("tardev-%s", runtimeHandler)
socketName = fmt.Sprintf("/run/containerd/tardev-snapshotter-%s.sock", runtimeHandler)
case platforms.MetalQEMUTDX, platforms.MetalQEMUSNP, platforms.K3sQEMUTDX, platforms.K3sQEMUSNP, platforms.RKE2QEMUTDX:
case platforms.MetalQEMUTDX, platforms.MetalQEMUSNP, platforms.K3sQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU, platforms.RKE2QEMUTDX:
snapshotterName = fmt.Sprintf("nydus-%s", runtimeHandler)
socketName = fmt.Sprintf("/run/containerd/containerd-nydus-grpc-%s.sock", runtimeHandler)

Expand Down
4 changes: 4 additions & 0 deletions nodeinstaller/node-installer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ func TestPatchContainerdConfig(t *testing.T) {
platform: platforms.K3sQEMUSNP,
expected: expectedConfBareMetalQEMUSNP,
},
"BareMetalQEMUSNPGPU": {
platform: platforms.K3sQEMUSNPGPU,
expected: expectedConfBareMetalQEMUSNP,
},
"Unknown": {
platform: platforms.Unknown,
wantErr: true,
Expand Down
2 changes: 2 additions & 0 deletions packages/by-name/contrast/package.nix
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ let
rke2-qemu-tdx-handler = runtimeHandler "rke2-qemu-tdx" kata.contrast-node-installer-image.runtimeHash;
metal-qemu-snp-handler = runtimeHandler "metal-qemu-snp" kata.contrast-node-installer-image.runtimeHash;
k3s-qemu-snp-handler = runtimeHandler "k3s-qemu-snp" kata.contrast-node-installer-image.runtimeHash;
k3s-qemu-snp-gpu-handler = runtimeHandler "k3s-qemu-snp-gpu" kata.contrast-node-installer-image.runtimeHash;

aksRefVals = {
snp = [
Expand Down Expand Up @@ -135,6 +136,7 @@ let
"${rke2-qemu-tdx-handler}" = tdxRefVals;
"${metal-qemu-snp-handler}" = snpRefVals;
"${k3s-qemu-snp-handler}" = snpRefVals;
"${k3s-qemu-snp-gpu-handler}" = snpRefVals;
}
);

Expand Down
2 changes: 1 addition & 1 deletion packages/scripts.nix
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@
cp ${pkgs.microsoft.genpolicy.settings-coordinator}/genpolicy-settings.json .
${pkgs.microsoft.genpolicy}/bin/genpolicy < "$tmpdir/coordinator_base.yml"
;;
"k3s-qemu-snp"|"k3s-qemu-tdx"|"rke2-qemu-tdx")
"k3s-qemu-snp"|"k3s-qemu-snp-gpu"|"k3s-qemu-tdx"|"rke2-qemu-tdx")
cp ${pkgs.kata.genpolicy.rules-coordinator}/genpolicy-rules.rego rules.rego
cp ${pkgs.kata.genpolicy.settings-coordinator}/genpolicy-settings.json .
${pkgs.kata.genpolicy}/bin/genpolicy < "$tmpdir/coordinator_base.yml"
Expand Down

0 comments on commit e535a27

Please sign in to comment.