Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add {Metal,K3s}-QEMU-SNP-GPU platforms #1091

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/e2e_manual.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ jobs:
echo "runner=ubuntu-22.04" >> "$GITHUB_OUTPUT"
echo "self-hosted=false" >> "$GITHUB_OUTPUT"
;;
"K3s-QEMU-SNP")
"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU")
echo "runner=SNP" >> "$GITHUB_OUTPUT"
echo "self-hosted=true" >> "$GITHUB_OUTPUT"
;;
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ jobs:
- name: Create coordinator resource definitions
run: |
mkdir -p workspace
for platform in aks-clh-snp metal-qemu-tdx k3s-qemu-tdx metal-qemu-snp k3s-qemu-snp rke2-qemu-tdx; do
for platform in aks-clh-snp metal-qemu-tdx k3s-qemu-tdx metal-qemu-snp k3s-qemu-snp k3s-qemu-snp-gpu rke2-qemu-tdx metal-qemu-snp-gpu; do
nix run .#scripts.write-coordinator-yaml -- "${coordinatorImgTagged}" "${platform}" > workspace/coordinator-$platform.yml
echo -n "${platform} " >> workspace/coordinator-policy.hash
yq < workspace/coordinator-$platform.yml \
Expand Down
4 changes: 3 additions & 1 deletion cli/genpolicy/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ func NewConfig(platform platforms.Platform) *Config {
Settings: aksSettings,
Bin: aksGenpolicyBin,
}
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUTDX, platforms.RKE2QEMUTDX:
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP,
platforms.K3sQEMUSNPGPU, platforms.K3sQEMUTDX, platforms.RKE2QEMUTDX,
platforms.MetalQEMUSNPGPU:
return &Config{
Rules: kataRules,
Settings: kataSettings,
Expand Down
4 changes: 3 additions & 1 deletion cli/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,9 @@ func buildVersionString() (string, error) {
switch platform {
case platforms.AKSCloudHypervisorSNP:
fmt.Fprintf(versionsWriter, "\tgenpolicy version:\t%s\n", constants.MicrosoftGenpolicyVersion)
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUTDX, platforms.RKE2QEMUTDX:
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP,
platforms.K3sQEMUTDX, platforms.K3sQEMUSNPGPU, platforms.RKE2QEMUTDX,
platforms.MetalQEMUSNPGPU:
fmt.Fprintf(versionsWriter, "\tgenpolicy version:\t%s\n", constants.KataGenpolicyVersion)
}
}
Expand Down
6 changes: 4 additions & 2 deletions e2e/internal/contrasttest/contrasttest.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ func patchReferenceValues(platform platforms.Platform) PatchManifestFunc {
SNPVersion: toPtr(manifest.SVN(255)),
MicrocodeVersion: toPtr(manifest.SVN(255)),
}
case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP:
case platforms.MetalQEMUSNP, platforms.MetalQEMUSNPGPU, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU:
// The generate command doesn't fill in all required fields when
// generating a manifest for baremetal SNP. Do that now.
for i, snp := range m.ReferenceValues.SNP {
Expand Down Expand Up @@ -381,7 +381,9 @@ func (ct *ContrastTest) FactorPlatformTimeout(timeout time.Duration) time.Durati
switch ct.Platform {
case platforms.AKSCloudHypervisorSNP: // AKS defined is the baseline
return timeout
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUTDX, platforms.RKE2QEMUTDX:
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP,
platforms.K3sQEMUSNPGPU, platforms.K3sQEMUTDX, platforms.RKE2QEMUTDX,
platforms.MetalQEMUSNPGPU:
return 2 * timeout
default:
return timeout
Expand Down
4 changes: 2 additions & 2 deletions internal/kuberesource/parts.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ func NodeInstaller(namespace string, platform platforms.Platform) (*NodeInstalle
nodeInstallerImageURL = "ghcr.io/edgelesssys/contrast/node-installer-microsoft:latest"
snapshotter = tardevSnapshotter
snapshotterVolumes = tardevSnapshotterVolumes
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX:
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.MetalQEMUSNPGPU:
nodeInstallerImageURL = "ghcr.io/edgelesssys/contrast/node-installer-kata:latest"
snapshotter = nydusSnapshotter
nydusSnapshotterVolumes = append(nydusSnapshotterVolumes, Volume().
Expand All @@ -136,7 +136,7 @@ func NodeInstaller(namespace string, platform platforms.Platform) (*NodeInstalle
WithType(corev1.HostPathDirectory),
))
snapshotterVolumes = nydusSnapshotterVolumes
case platforms.K3sQEMUTDX, platforms.K3sQEMUSNP, platforms.RKE2QEMUTDX:
case platforms.K3sQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU, platforms.RKE2QEMUTDX:
nodeInstallerImageURL = "ghcr.io/edgelesssys/contrast/node-installer-kata:latest"
snapshotter = nydusSnapshotter
nydusSnapshotterVolumes = append(nydusSnapshotterVolumes, Volume().
Expand Down
4 changes: 2 additions & 2 deletions internal/manifest/referencevalues.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,11 +143,11 @@ func platformFromHandler(handler string) (platforms.Platform, error) {
}

parts := strings.Split(rest, "-")
if len(parts) != 4 {
if len(parts) != 4 && len(parts) != 5 {
return platforms.Unknown, fmt.Errorf("invalid handler name: %s", handler)
}

rawPlatform := fmt.Sprintf("%s-%s-%s", parts[0], parts[1], parts[2])
rawPlatform := strings.Join(parts[:len(parts)-1], "-")

platform, err := platforms.FromString(rawPlatform)
if err != nil {
Expand Down
14 changes: 13 additions & 1 deletion internal/platforms/platforms.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,15 @@ const (
MetalQEMUSNP
// MetalQEMUTDX is the generic platform for bare-metal TDX deployments.
MetalQEMUTDX
// K3sQEMUSNPGPU represents a deployment with QEMU on bare-metal SNP K3s with GPU passthrough.
K3sQEMUSNPGPU
// MetalQEMUSNPGPU is the generic platform for bare-metal SNP deployments with GPU passthrough.
MetalQEMUSNPGPU
)

// All returns a list of all available platforms.
func All() []Platform {
return []Platform{AKSCloudHypervisorSNP, K3sQEMUTDX, K3sQEMUSNP, RKE2QEMUTDX, MetalQEMUSNP, MetalQEMUTDX}
return []Platform{AKSCloudHypervisorSNP, K3sQEMUTDX, K3sQEMUSNP, RKE2QEMUTDX, MetalQEMUSNP, MetalQEMUTDX, K3sQEMUSNPGPU, MetalQEMUSNPGPU}
}

// AllStrings returns a list of all available platforms as strings.
Expand All @@ -53,10 +57,14 @@ func (p Platform) String() string {
return "K3s-QEMU-TDX"
case K3sQEMUSNP:
return "K3s-QEMU-SNP"
case K3sQEMUSNPGPU:
return "K3s-QEMU-SNP-GPU"
case RKE2QEMUTDX:
return "RKE2-QEMU-TDX"
case MetalQEMUSNP:
return "Metal-QEMU-SNP"
case MetalQEMUSNPGPU:
return "Metal-QEMU-SNP-GPU"
case MetalQEMUTDX:
return "Metal-QEMU-TDX"
default:
Expand All @@ -73,10 +81,14 @@ func FromString(s string) (Platform, error) {
return K3sQEMUTDX, nil
case "k3s-qemu-snp":
return K3sQEMUSNP, nil
case "k3s-qemu-snp-gpu":
return K3sQEMUSNPGPU, nil
case "rke2-qemu-tdx":
return RKE2QEMUTDX, nil
case "metal-qemu-snp":
return MetalQEMUSNP, nil
case "metal-qemu-snp-gpu":
return MetalQEMUSNPGPU, nil
case "metal-qemu-tdx":
return MetalQEMUTDX, nil
default:
Expand Down
14 changes: 7 additions & 7 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ node-installer platform=default_platform:
just push "tardev-snapshotter"
just push "node-installer-microsoft"
;;
"Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"K3s-QEMU-SNP"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
"Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"Metal-QEMU-SNP-GPU"|"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
just push "nydus-snapshotter"
just push "node-installer-kata"
;;
Expand Down Expand Up @@ -117,7 +117,7 @@ generate cli=default_cli platform=default_platform:

# On baremetal SNP, we don't have default values for MinimumTCB, so we need to set some here.
case {{ platform }} in
"Metal-QEMU-SNP"|"K3s-QEMU-SNP")
"Metal-QEMU-SNP"|"Metal-QEMU-SNP-GPU"|"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU")
yq --inplace \
'.ReferenceValues.snp.[].MinimumTCB = {"BootloaderVersion":0,"TEEVersion":0,"SNPVersion":0,"MicrocodeVersion":0}' \
{{ workspace_dir }}/manifest.json
Expand Down Expand Up @@ -186,7 +186,7 @@ create-pre platform=default_platform:
# TODO(burgerdev): this should create the resource group for consistency
:
;;
"Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"K3s-QEMU-SNP"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
"Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"Metal-QEMU-SNP-GPU"|"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
:
;;
"AKS-PEER-SNP")
Expand Down Expand Up @@ -215,7 +215,7 @@ create platform=default_platform:
"AKS-CLH-SNP")
nix run -L .#scripts.create-coco-aks -- --name="$azure_resource_group" --location="$azure_location"
;;
"Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"K3s-QEMU-SNP"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
"Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"Metal-QEMU-SNP-GPU"|"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
:
;;
"AKS-PEER-SNP")
Expand Down Expand Up @@ -328,7 +328,7 @@ get-credentials platform=default_platform:
"K3s-QEMU-TDX")
nix run -L .#scripts.get-credentials "projects/796962942582/secrets/m50-ganondorf-kubeconf/versions/5"
;;
"K3s-QEMU-SNP")
"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU")
nix run -L .#scripts.get-credentials "projects/796962942582/secrets/discovery-kubeconf/versions/2"
;;
*)
Expand All @@ -352,7 +352,7 @@ destroy platform=default_platform:
"AKS-CLH-SNP")
nix run -L .#scripts.destroy-coco-aks -- --name="$azure_resource_group"
;;
"K3s-QEMU-SNP"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
:
;;
"AKS-PEER-SNP")
Expand All @@ -377,7 +377,7 @@ destroy-post platform=default_platform:
# TODO(burgerdev): this should destroy the resource group for consistency.
:
;;
"K3s-QEMU-SNP"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
:
;;
"AKS-PEER-SNP")
Expand Down
4 changes: 3 additions & 1 deletion nodeinstaller/internal/config/kata_runtime_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ func TestKataConfig(t *testing.T) {
assert.Contains(string(configBytes), "[Runtime]")

switch platform {
case platforms.K3sQEMUSNP, platforms.K3sQEMUTDX, platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.RKE2QEMUTDX:
case platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU, platforms.K3sQEMUTDX,
platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.RKE2QEMUTDX,
platforms.MetalQEMUSNPGPU:
assert.Contains(string(configBytes), "[Hypervisor.qemu]")
case platforms.AKSCloudHypervisorSNP:
assert.Contains(string(configBytes), "[Hypervisor.clh]")
Expand Down
18 changes: 16 additions & 2 deletions nodeinstaller/internal/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ func KataRuntimeConfig(baseDir string, platform platforms.Platform, qemuExtraKer
if debug {
config.Hypervisor["qemu"]["enable_debug"] = true
}
case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP:
case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU,
platforms.MetalQEMUSNPGPU:
if err := toml.Unmarshal([]byte(kataBareMetalQEMUSNPBaseConfig), &config); err != nil {
return nil, fmt.Errorf("failed to unmarshal kata runtime configuration: %w", err)
}
Expand All @@ -94,6 +95,14 @@ func KataRuntimeConfig(baseDir string, platform platforms.Platform, qemuExtraKer
if debug {
config.Hypervisor["qemu"]["enable_debug"] = true
}
// GPU-specific settings
if platform == platforms.K3sQEMUSNPGPU || platform == platforms.MetalQEMUSNPGPU {
config.Hypervisor["qemu"]["guest_hook_path"] = "/usr/share/oci/hooks"
config.Hypervisor["qemu"]["cold_plug_vfio"] = "root-port"
// GPU images tend to be larger, so give a better default timeout that
// allows for pulling those.
config.Runtime["create_container_timeout"] = 600
}
default:
return nil, fmt.Errorf("unsupported platform: %s", platform)
}
Expand Down Expand Up @@ -133,10 +142,15 @@ func ContainerdRuntimeConfigFragment(baseDir, snapshotter string, platform platf
cfg.Options = map[string]any{
"ConfigPath": filepath.Join(baseDir, "etc", "configuration-qemu-tdx.toml"),
}
case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP:
case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU,
platforms.MetalQEMUSNPGPU:
cfg.Options = map[string]any{
"ConfigPath": filepath.Join(baseDir, "etc", "configuration-qemu-snp.toml"),
}
// For GPU support, we need to pass through the CDI annotations.
if platform == platforms.K3sQEMUSNPGPU || platform == platforms.MetalQEMUSNPGPU {
cfg.PodAnnotations = append(cfg.PodAnnotations, "cdi.k8s.io/*")
}
default:
return nil, fmt.Errorf("unsupported platform: %s", platform)
}
Expand Down
13 changes: 8 additions & 5 deletions nodeinstaller/node-installer.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,13 +107,13 @@ func run(ctx context.Context, fetcher assetFetcher, platform platforms.Platform,
case platforms.AKSCloudHypervisorSNP:
kataConfigPath = filepath.Join(kataConfigPath, "configuration-clh-snp.toml")
containerdConfigPath = filepath.Join(hostMount, "etc", "containerd", "config.toml")
case platforms.MetalQEMUSNP:
case platforms.MetalQEMUSNP, platforms.MetalQEMUSNPGPU:
kataConfigPath = filepath.Join(kataConfigPath, "configuration-qemu-snp.toml")
containerdConfigPath = filepath.Join(hostMount, "etc", "containerd", "config.toml")
case platforms.MetalQEMUTDX:
kataConfigPath = filepath.Join(kataConfigPath, "configuration-qemu-tdx.toml")
containerdConfigPath = filepath.Join(hostMount, "etc", "containerd", "config.toml")
case platforms.K3sQEMUSNP:
case platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU:
kataConfigPath = filepath.Join(kataConfigPath, "configuration-qemu-snp.toml")
containerdConfigPath = filepath.Join(hostMount, "var", "lib", "rancher", "k3s", "agent", "etc", "containerd", "config.toml.tmpl")
case platforms.K3sQEMUTDX:
Expand Down Expand Up @@ -145,9 +145,10 @@ func run(ctx context.Context, fetcher assetFetcher, platform platforms.Platform,
}

switch platform {
case platforms.AKSCloudHypervisorSNP, platforms.MetalQEMUSNP, platforms.MetalQEMUTDX:
case platforms.AKSCloudHypervisorSNP, platforms.MetalQEMUSNP, platforms.MetalQEMUTDX,
platforms.MetalQEMUSNPGPU:
return restartHostContainerd(containerdConfigPath, "containerd")
case platforms.K3sQEMUTDX, platforms.K3sQEMUSNP:
case platforms.K3sQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU:
if hostServiceExists("k3s") {
return restartHostContainerd(containerdConfigPath, "k3s")
} else if hostServiceExists("k3s-agent") {
Expand Down Expand Up @@ -212,7 +213,9 @@ func patchContainerdConfig(runtimeHandler, basePath, configPath string, platform
case platforms.AKSCloudHypervisorSNP:
snapshotterName = fmt.Sprintf("tardev-%s", runtimeHandler)
socketName = fmt.Sprintf("/run/containerd/tardev-snapshotter-%s.sock", runtimeHandler)
case platforms.MetalQEMUTDX, platforms.MetalQEMUSNP, platforms.K3sQEMUTDX, platforms.K3sQEMUSNP, platforms.RKE2QEMUTDX:
case platforms.MetalQEMUTDX, platforms.MetalQEMUSNP, platforms.K3sQEMUTDX,
platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU, platforms.RKE2QEMUTDX,
platforms.MetalQEMUSNPGPU:
snapshotterName = fmt.Sprintf("nydus-%s", runtimeHandler)
socketName = fmt.Sprintf("/run/containerd/containerd-nydus-grpc-%s.sock", runtimeHandler)

Expand Down
10 changes: 8 additions & 2 deletions nodeinstaller/node-installer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ var (
expectedConfBareMetalQEMUTDX []byte
//go:embed testdata/expected-bare-metal-qemu-snp.toml
expectedConfBareMetalQEMUSNP []byte
//go:embed testdata/expected-bare-metal-qemu-snp-gpu.toml
expectedConfBareMetalQEMUSNPGPU []byte
)

func TestPatchContainerdConfig(t *testing.T) {
Expand All @@ -34,14 +36,18 @@ func TestPatchContainerdConfig(t *testing.T) {
platform: platforms.AKSCloudHypervisorSNP,
expected: expectedConfAKSCLHSNP,
},
"BareMetalQEMUTDX": {
"K3sQEMUTDX": {
platform: platforms.K3sQEMUTDX,
expected: expectedConfBareMetalQEMUTDX,
},
"BareMetalQEMUSNP": {
"K3sQEMUSNP": {
platform: platforms.K3sQEMUSNP,
expected: expectedConfBareMetalQEMUSNP,
},
"K3sQEMUSNPGPU": {
platform: platforms.K3sQEMUSNPGPU,
expected: expectedConfBareMetalQEMUSNPGPU,
},
"Unknown": {
platform: platforms.Unknown,
wantErr: true,
Expand Down
Loading