Skip to content

Commit

Permalink
Add Metal-QEMU-SNP-GPU platform
Browse files Browse the repository at this point in the history
This adds a new `Metal-QEMU-SNP-GPU` platform that can be used for testing GPU-specific features on Scaleway and other non-K3s environments. It intentionally does not yet add any platform-specific actions for when that platform is selected.
  • Loading branch information
msanft committed Dec 30, 2024
1 parent 098e918 commit aa541e2
Show file tree
Hide file tree
Showing 11 changed files with 41 additions and 20 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ jobs:
- name: Create coordinator resource definitions
run: |
mkdir -p workspace
for platform in aks-clh-snp k3s-qemu-tdx k3s-qemu-snp k3s-qemu-snp-gpu rke2-qemu-tdx; do
for platform in aks-clh-snp k3s-qemu-tdx k3s-qemu-snp k3s-qemu-snp-gpu rke2-qemu-tdx metal-qemu-snp-gpu; do
nix run .#scripts.write-coordinator-yaml -- "${coordinatorImgTagged}" "${platform}" > workspace/coordinator-$platform.yml
echo -n "${platform} " >> workspace/coordinator-policy.hash
yq < workspace/coordinator-$platform.yml \
Expand Down
4 changes: 3 additions & 1 deletion cli/genpolicy/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ func NewConfig(platform platforms.Platform) *Config {
Settings: aksSettings,
Bin: aksGenpolicyBin,
}
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU, platforms.K3sQEMUTDX, platforms.RKE2QEMUTDX:
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP,
platforms.K3sQEMUSNPGPU, platforms.K3sQEMUTDX, platforms.RKE2QEMUTDX,
platforms.MetalQEMUSNPGPU:
return &Config{
Rules: kataRules,
Settings: kataSettings,
Expand Down
4 changes: 3 additions & 1 deletion cli/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,9 @@ func buildVersionString() (string, error) {
switch platform {
case platforms.AKSCloudHypervisorSNP:
fmt.Fprintf(versionsWriter, "\tgenpolicy version:\t%s\n", constants.MicrosoftGenpolicyVersion)
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUTDX, platforms.K3sQEMUSNPGPU, platforms.RKE2QEMUTDX:
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP,
platforms.K3sQEMUTDX, platforms.K3sQEMUSNPGPU, platforms.RKE2QEMUTDX,
platforms.MetalQEMUSNPGPU:
fmt.Fprintf(versionsWriter, "\tgenpolicy version:\t%s\n", constants.KataGenpolicyVersion)
}
}
Expand Down
7 changes: 5 additions & 2 deletions e2e/internal/contrasttest/contrasttest.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,8 @@ func (ct *ContrastTest) patchReferenceValues(t *testing.T, platform platforms.Pl
SNPVersion: toPtr(manifest.SVN(255)),
MicrocodeVersion: toPtr(manifest.SVN(255)),
}
case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU:
case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU,
platforms.MetalQEMUSNPGPU:
// The generate command doesn't fill in all required fields when
// generating a manifest for baremetal SNP. Do that now.
for i, snp := range m.ReferenceValues.SNP {
Expand Down Expand Up @@ -372,7 +373,9 @@ func (ct *ContrastTest) FactorPlatformTimeout(timeout time.Duration) time.Durati
switch ct.Platform {
case platforms.AKSCloudHypervisorSNP: // AKS defined is the baseline
return timeout
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU, platforms.K3sQEMUTDX, platforms.RKE2QEMUTDX:
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.K3sQEMUSNP,
platforms.K3sQEMUSNPGPU, platforms.K3sQEMUTDX, platforms.RKE2QEMUTDX,
platforms.MetalQEMUSNPGPU:
return 2 * timeout
default:
return timeout
Expand Down
2 changes: 1 addition & 1 deletion internal/kuberesource/parts.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ func NodeInstaller(namespace string, platform platforms.Platform) (*NodeInstalle
nodeInstallerImageURL = "ghcr.io/edgelesssys/contrast/node-installer-microsoft:latest"
snapshotter = tardevSnapshotter
snapshotterVolumes = tardevSnapshotterVolumes
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX:
case platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.MetalQEMUSNPGPU:
nodeInstallerImageURL = "ghcr.io/edgelesssys/contrast/node-installer-kata:latest"
snapshotter = nydusSnapshotter
nydusSnapshotterVolumes = append(nydusSnapshotterVolumes, Volume().
Expand Down
8 changes: 7 additions & 1 deletion internal/platforms/platforms.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,13 @@ const (
MetalQEMUTDX
// K3sQEMUSNPGPU represents a deployment with QEMU on bare-metal SNP K3s with GPU passthrough.
K3sQEMUSNPGPU
// MetalQEMUSNPGPU is the generic platform for bare-metal SNP deployments with GPU passthrough.
MetalQEMUSNPGPU
)

// All returns a list of all available platforms.
func All() []Platform {
return []Platform{AKSCloudHypervisorSNP, K3sQEMUTDX, K3sQEMUSNP, RKE2QEMUTDX, MetalQEMUSNP, MetalQEMUTDX, K3sQEMUSNPGPU}
return []Platform{AKSCloudHypervisorSNP, K3sQEMUTDX, K3sQEMUSNP, RKE2QEMUTDX, MetalQEMUSNP, MetalQEMUTDX, K3sQEMUSNPGPU, MetalQEMUSNPGPU}
}

// AllStrings returns a list of all available platforms as strings.
Expand All @@ -61,6 +63,8 @@ func (p Platform) String() string {
return "RKE2-QEMU-TDX"
case MetalQEMUSNP:
return "Metal-QEMU-SNP"
case MetalQEMUSNPGPU:
return "Metal-QEMU-SNP-GPU"
case MetalQEMUTDX:
return "Metal-QEMU-TDX"
default:
Expand All @@ -83,6 +87,8 @@ func FromString(s string) (Platform, error) {
return RKE2QEMUTDX, nil
case "metal-qemu-snp":
return MetalQEMUSNP, nil
case "metal-qemu-snp-gpu":
return MetalQEMUSNPGPU, nil
case "metal-qemu-tdx":
return MetalQEMUTDX, nil
default:
Expand Down
8 changes: 4 additions & 4 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ node-installer platform=default_platform:
just push "tardev-snapshotter"
just push "node-installer-microsoft"
;;
"Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
"Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"Metal-QEMU-SNP-GPU"|"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
just push "nydus-snapshotter"
just push "node-installer-kata"
;;
Expand Down Expand Up @@ -117,7 +117,7 @@ generate cli=default_cli platform=default_platform:
# On baremetal SNP, we don't have default values for MinimumTCB, so we need to set some here.
case {{ platform }} in
"Metal-QEMU-SNP"|"K3s-QEMU-SNP")
"Metal-QEMU-SNP"|"Metal-QEMU-SNP-GPU"|"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU")
yq --inplace \
'.ReferenceValues.snp.[].MinimumTCB = {"BootloaderVersion":0,"TEEVersion":0,"SNPVersion":0,"MicrocodeVersion":0}' \
{{ workspace_dir }}/manifest.json
Expand Down Expand Up @@ -186,7 +186,7 @@ create-pre platform=default_platform:
# TODO(burgerdev): this should create the resource group for consistency
:
;;
"Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
"Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"Metal-QEMU-SNP-GPU"|"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
:
;;
"AKS-PEER-SNP")
Expand Down Expand Up @@ -215,7 +215,7 @@ create platform=default_platform:
"AKS-CLH-SNP")
nix run -L .#scripts.create-coco-aks -- --name="$azure_resource_group" --location="$azure_location"
;;
"Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
"Metal-QEMU-SNP"|"Metal-QEMU-TDX"|"Metal-QEMU-SNP-GPU"|"K3s-QEMU-SNP"|"K3s-QEMU-SNP-GPU"|"K3s-QEMU-TDX"|"RKE2-QEMU-TDX")
:
;;
"AKS-PEER-SNP")
Expand Down
4 changes: 3 additions & 1 deletion nodeinstaller/internal/config/kata_runtime_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ func TestKataConfig(t *testing.T) {
assert.Contains(string(configBytes), "[Runtime]")

switch platform {
case platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU, platforms.K3sQEMUTDX, platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.RKE2QEMUTDX:
case platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU, platforms.K3sQEMUTDX,
platforms.MetalQEMUSNP, platforms.MetalQEMUTDX, platforms.RKE2QEMUTDX,
platforms.MetalQEMUSNPGPU:
assert.Contains(string(configBytes), "[Hypervisor.qemu]")
case platforms.AKSCloudHypervisorSNP:
assert.Contains(string(configBytes), "[Hypervisor.clh]")
Expand Down
10 changes: 6 additions & 4 deletions nodeinstaller/internal/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ func KataRuntimeConfig(baseDir string, platform platforms.Platform, qemuExtraKer
if debug {
config.Hypervisor["qemu"]["enable_debug"] = true
}
case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU:
case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU,
platforms.MetalQEMUSNPGPU:
if err := toml.Unmarshal([]byte(kataBareMetalQEMUSNPBaseConfig), &config); err != nil {
return nil, fmt.Errorf("failed to unmarshal kata runtime configuration: %w", err)
}
Expand All @@ -95,7 +96,7 @@ func KataRuntimeConfig(baseDir string, platform platforms.Platform, qemuExtraKer
config.Hypervisor["qemu"]["enable_debug"] = true
}
// GPU-specific settings
if platform == platforms.K3sQEMUSNPGPU {
if platform == platforms.K3sQEMUSNPGPU || platform == platforms.MetalQEMUSNPGPU {
config.Hypervisor["qemu"]["guest_hook_path"] = "/usr/share/oci/hooks"
config.Hypervisor["qemu"]["cold_plug_vfio"] = "root-port"
// GPU images tend to be larger, so give a better default timeout that
Expand Down Expand Up @@ -141,12 +142,13 @@ func ContainerdRuntimeConfigFragment(baseDir, snapshotter string, platform platf
cfg.Options = map[string]any{
"ConfigPath": filepath.Join(baseDir, "etc", "configuration-qemu-tdx.toml"),
}
case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU:
case platforms.MetalQEMUSNP, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU,
platforms.MetalQEMUSNPGPU:
cfg.Options = map[string]any{
"ConfigPath": filepath.Join(baseDir, "etc", "configuration-qemu-snp.toml"),
}
// For GPU support, we need to pass through the CDI annotations.
if platform == platforms.K3sQEMUSNPGPU {
if platform == platforms.K3sQEMUSNPGPU || platform == platforms.MetalQEMUSNPGPU {
cfg.PodAnnotations = append(cfg.PodAnnotations, "cdi.k8s.io/*")
}
default:
Expand Down
9 changes: 6 additions & 3 deletions nodeinstaller/node-installer.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ func run(ctx context.Context, fetcher assetFetcher, platform platforms.Platform,
case platforms.AKSCloudHypervisorSNP:
kataConfigPath = filepath.Join(kataConfigPath, "configuration-clh-snp.toml")
containerdConfigPath = filepath.Join(hostMount, "etc", "containerd", "config.toml")
case platforms.MetalQEMUSNP:
case platforms.MetalQEMUSNP, platforms.MetalQEMUSNPGPU:
kataConfigPath = filepath.Join(kataConfigPath, "configuration-qemu-snp.toml")
containerdConfigPath = filepath.Join(hostMount, "etc", "containerd", "config.toml")
case platforms.MetalQEMUTDX:
Expand Down Expand Up @@ -145,7 +145,8 @@ func run(ctx context.Context, fetcher assetFetcher, platform platforms.Platform,
}

switch platform {
case platforms.AKSCloudHypervisorSNP, platforms.MetalQEMUSNP, platforms.MetalQEMUTDX:
case platforms.AKSCloudHypervisorSNP, platforms.MetalQEMUSNP, platforms.MetalQEMUTDX,
platforms.MetalQEMUSNPGPU:
return restartHostContainerd(containerdConfigPath, "containerd")
case platforms.K3sQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU:
if hostServiceExists("k3s") {
Expand Down Expand Up @@ -212,7 +213,9 @@ func patchContainerdConfig(runtimeHandler, basePath, configPath string, platform
case platforms.AKSCloudHypervisorSNP:
snapshotterName = fmt.Sprintf("tardev-%s", runtimeHandler)
socketName = fmt.Sprintf("/run/containerd/tardev-snapshotter-%s.sock", runtimeHandler)
case platforms.MetalQEMUTDX, platforms.MetalQEMUSNP, platforms.K3sQEMUTDX, platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU, platforms.RKE2QEMUTDX:
case platforms.MetalQEMUTDX, platforms.MetalQEMUSNP, platforms.K3sQEMUTDX,
platforms.K3sQEMUSNP, platforms.K3sQEMUSNPGPU, platforms.RKE2QEMUTDX,
platforms.MetalQEMUSNPGPU:
snapshotterName = fmt.Sprintf("nydus-%s", runtimeHandler)
socketName = fmt.Sprintf("/run/containerd/containerd-nydus-grpc-%s.sock", runtimeHandler)

Expand Down
3 changes: 2 additions & 1 deletion packages/by-name/contrast/package.nix
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@ let
k3s-qemu-tdx-handler = runtimeHandler "k3s-qemu-tdx" kata.contrast-node-installer-image.runtimeHash;
rke2-qemu-tdx-handler = runtimeHandler "rke2-qemu-tdx" kata.contrast-node-installer-image.runtimeHash;
metal-qemu-snp-handler = runtimeHandler "metal-qemu-snp" kata.contrast-node-installer-image.runtimeHash;
metal-qemu-snp-gpu-handler = runtimeHandler "metal-qemu-snp-gpu" kata.contrast-node-installer-image.runtimeHash;
k3s-qemu-snp-handler = runtimeHandler "k3s-qemu-snp" kata.contrast-node-installer-image.runtimeHash;
k3s-qemu-snp-gpu-handler = runtimeHandler "k3s-qemu-snp-gpu" kata.contrast-node-installer-image.runtimeHash;

aksRefVals = {
snp = [
{
Expand Down Expand Up @@ -135,6 +135,7 @@ let
"${k3s-qemu-tdx-handler}" = tdxRefVals;
"${rke2-qemu-tdx-handler}" = tdxRefVals;
"${metal-qemu-snp-handler}" = snpRefVals;
"${metal-qemu-snp-gpu-handler}" = snpRefVals;
"${k3s-qemu-snp-handler}" = snpRefVals;
"${k3s-qemu-snp-gpu-handler}" = snpRefVals;
}
Expand Down

0 comments on commit aa541e2

Please sign in to comment.