Skip to content

Commit

Permalink
gpu-driver helm chart configurable node-pool name
Browse files Browse the repository at this point in the history
  • Loading branch information
tmilos77 committed Feb 20, 2025
1 parent 70832dd commit ad667fa
Show file tree
Hide file tree
Showing 14 changed files with 212 additions and 10 deletions.
31 changes: 30 additions & 1 deletion charts/gpu-driver/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,41 @@ If you had already added this repo earlier, run `helm repo update` to retrieve
the latest versions of the packages. You can then run
`helm search repo kyma-gpu-driver` to see the charts.

## Installation

To install the `gpu-driver` chart:

### 1. Find the name of the node pool where you want to install the GPU driver

```shell
helm install my-gpu-driver kyma-gpu-driver/gpu-driver
NODE_POOL="my-node-pool" # this must match your node pool name
```

You can find the node pool name in the node label `worker.gardener.cloud/pool`.
To get the list of node pool names you can run:

```shell
# sort -u is used because uniq alone only collapses adjacent duplicate lines
kubectl get nodes -o jsonpath="{range .items[*]}{.metadata.labels.worker\.gardener\.cloud/pool}{'\n'}{end}" | sort -u
```


### 2. Find the kernel version of the nodes in the chosen node pool

```shell
# Print one kernel version per line and keep only the first one;
# without the newline separator the versions of all nodes run together.
KERNEL_VERSION=$(kubectl get nodes -l "worker.gardener.cloud/pool=$NODE_POOL" \
-o jsonpath='{range .items[*]}{.status.nodeInfo.kernelVersion}{"\n"}{end}' | head -n 1)
```


### 3. Upgrade/install the helm chart for specified node pool and kernel version

```shell
helm upgrade --install my-gpu-driver kyma-gpu-driver/gpu-driver \
--set kernelVersion=$KERNEL_VERSION --set nodePool=$NODE_POOL
```

## Uninstall

To uninstall the chart:

```shell
Expand Down
34 changes: 32 additions & 2 deletions charts/gpu-driver/templates/_helpers.tpl
Original file line number Diff line number Diff line change
@@ -1,12 +1,32 @@

{{- define "gpu-driver.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}-installer
{{- default .Release.Name .Values.nameOverride | trunc 53 | trimSuffix "-" -}}-installer
{{- end -}}

{{- define "device-plugin.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 53 | trimSuffix "-" -}}-device-plugin
{{- default .Release.Name .Values.nameOverride | trunc 53 | trimSuffix "-" -}}-device-plugin
{{- end -}}

{{/*
node-labeler.name: the release name (or .Values.nameOverride when set) with the
"-node-labeler" suffix appended. The base is cut to 50 characters so that base
plus the 13-character suffix stays within the 63-character budget that the
"-installer" helper in this file is sized for (53 + 10).
*/}}
{{- define "node-labeler.name" -}}
{{- default .Release.Name .Values.nameOverride | trunc 50 | trimSuffix "-" -}}-node-labeler
{{- end -}}

{{/*
rbac.name: .Values.nameOverride when set, otherwise the release name,
limited to 53 characters with a trailing dash removed.
*/}}
{{- define "rbac.name" -}}
{{- $base := .Values.nameOverride | default .Release.Name -}}
{{- $base | trunc 53 | trimSuffix "-" -}}
{{- end -}}


{{/*
gardenlinux.version: resolves the tag of the build image.

  - .Values.kernelVersion set: the tag is looked up in .Values.kernelVersions;
    rendering fails when the kernel version is not a key of that map.
  - otherwise, .Values.gardenlinux.versionOverride set: that value is used as the tag.
  - neither set: rendering fails.
*/}}
{{- define "gardenlinux.version" -}}
{{- if .Values.kernelVersion -}}
{{- if not (hasKey .Values.kernelVersions .Values.kernelVersion) -}}
{{- fail (printf "Unknown kernel version '%s'" .Values.kernelVersion) -}}
{{- end -}}
{{- get .Values.kernelVersions .Values.kernelVersion -}}
{{- else if .Values.gardenlinux.versionOverride -}}
{{- /* this branch used to be empty, which rendered an empty image tag */ -}}
{{- .Values.gardenlinux.versionOverride -}}
{{- else -}}
{{- fail ".kernelVersion or .gardenlinux.versionOverride must be set" -}}
{{- end -}}
{{- end -}}


{{- define "image-pull-secrets" -}}
{{- with .Values.imagePullSecrets }}
{{- toYaml . | nindent 8 }}
Expand All @@ -16,3 +36,13 @@
{{- end }}
{{- end -}}

{{/*
node-selector: renders a "nodeSelector:" stanza built from .Values.nodeSelector.
When .Values.nodePool is set, the entry worker.gardener.cloud/pool=<nodePool> is
added to the map. Nothing is rendered when the resulting map is empty.
The map entries are emitted with an 8-space indent; the "nodeSelector:" line
itself takes whatever indentation precedes the include at the call site.
NOTE(review): `set` presumably modifies .Values.nodeSelector in place, so the
pool entry would also be visible to later reads of that value - confirm.
*/}}
{{- define "node-selector" -}}
{{- $dict := .Values.nodeSelector }}
{{- if .Values.nodePool -}}
{{- $dict = set $dict "worker.gardener.cloud/pool" .Values.nodePool -}}
{{- end -}}
{{- if $dict -}}
nodeSelector:
{{ toYaml $dict | indent 8 }}
{{- end -}}
{{- end -}}
1 change: 1 addition & 0 deletions charts/gpu-driver/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ apiVersion: v1
kind: ConfigMap
metadata:
name: {{ template "gpu-driver.name" . }}
namespace: {{ .Release.Namespace }}
data:
{{- range $path, $_ := .Files.Glob "files/gardenlinux-nvidia-installer/*.sh" }}
{{ base $path }}: |-
Expand Down
12 changes: 6 additions & 6 deletions charts/gpu-driver/templates/driver-installer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ kind: DaemonSet
metadata:
name: {{ template "gpu-driver.name" . }}
namespace: {{ .Release.Namespace }}
labels:
# quoted so an empty kernelVersion renders as "" instead of a YAML null
gpu.kyma-project.io/kernel-version: {{ .Values.kernelVersion | quote }}
spec:
updateStrategy:
type: OnDelete
Expand All @@ -13,6 +15,7 @@ spec:
metadata:
labels:
name: {{ template "gpu-driver.name" . }}
# quoted so an empty kernelVersion renders as "" instead of a YAML null
gpu.kyma-project.io/kernel-version: {{ .Values.kernelVersion | quote }}
spec:
priorityClassName: system-node-critical
{{- if .Values.nodeAffinity }}
Expand All @@ -21,14 +24,14 @@ spec:
{{ toYaml .Values.nodeAffinity | indent 10 }}
{{- end }}
hostPID: true
imagePullSecrets: {{ template "image-pull-secrets"}}
imagePullSecrets: {{ template "image-pull-secrets" . }}
containers:
- name: gpu-driver
{{- $imageUri := .Values.gardenlinux.image }}
{{- if .Values.gardenlinux.imageRegistry }}
{{ $imageUri = printf "%s/%s" .Values.gardenlinux.imageRegistry $imageUri }}
{{- end }}
image: {{ $imageUri }}:{{ required "gardenlinux.version" .Values.gardenlinux.version }}
image: {{ $imageUri }}:{{ template "gardenlinux.version" . }}
workingDir: "/work"
command:
- "/work/entrypoint.sh"
Expand Down Expand Up @@ -62,10 +65,7 @@ spec:
mountPath: /opt/nvidia-installer/cache
- name: module-install-dir-base
mountPath: {{ required "nvidia.installer.hostDriverPath" .Values.nvidia.installer.hostDriverPath }}
{{- if .Values.nodeSelector }}
nodeSelector:
{{ toYaml .Values.nodeSelector | indent 8 }}
{{- end }}
{{ template "node-selector" . }}
{{- if .Values.tolerations }}
tolerations:
{{ toYaml .Values.tolerations | indent 6 }}
Expand Down
19 changes: 18 additions & 1 deletion charts/gpu-driver/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ imagePullSecrets: []
# Set to true to output all shell commands (enables `set -x` in scripts)
debug: true

# required unless gardenlinux.versionOverride is set; must be a key of kernelVersions, e.g. 6.6.63-cloud-amd64
kernelVersion: ""

nodePool: ""

nodeSelector: {}
nodeAffinity: {}
tolerations:
Expand All @@ -20,10 +25,15 @@ tolerations:
- key: CriticalAddonsOnly
operator: Exists

# Registry, repository and tag of a kubectl image.
# NOTE(review): the consumer of these values is not visible here - presumably the node labeler; confirm.
# NOTE(review): `latest` is a floating tag, so the pulled kubectl can change between installs; consider pinning.
kubectl:
imageRegistry: ""
image: bitnami/kubectl
version: latest

gardenlinux:
imageRegistry: ghcr.io
image: gardenlinux/gardenlinux/kmodbuild
version: amd64-1592.4
versionOverride: "" # ie amd64-1592.4

nvidia:
driverVersion: "550.127.08"
Expand All @@ -36,3 +46,10 @@ nvidia:
version: 1.0.25-gke.56
hostDevicePluginPath: /var/lib/kubelet/device-plugins
resources: {}

# Lookup table: node kernel version -> build image tag
# (entries sorted by kernel version)
kernelVersions:
  6.6.62-cloud-amd64: amd64-1592.3
  6.6.63-cloud-amd64: amd64-1592.4
  6.6.71-cloud-amd64: amd64-1592.5
  6.6.78-cloud-amd64: amd64-1592.6
34 changes: 34 additions & 0 deletions resources/test/ds.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
---
# Test DaemonSet: runs an ubuntu container on nodes labelled host=test2 that
# echoes its own "name" label (injected through the downward API) and sleeps.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: test2
spec:
  selector:
    matchLabels:
      name: test2
  updateStrategy:
    type: OnDelete
  template:
    metadata:
      labels:
        name: test2
    spec:
      hostPID: true
      priorityClassName: system-node-critical
      nodeSelector:
        host: test2
      containers:
        - name: test2
          image: ubuntu
          imagePullPolicy: IfNotPresent
          command: ['/bin/bash', '-c', '--']
          args:
            - 'echo $NAME && sleep 864000'
          env:
            # NAME carries the value of the pod's own "name" label
            - name: NAME
              valueFrom:
                fieldRef:
                  fieldPath: "metadata.labels['name']"
3 changes: 3 additions & 0 deletions resources/workload/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Dockerfile
README.md
__pycache__
21 changes: 21 additions & 0 deletions resources/workload/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04

# Install the Python tooling needed to install and run the workload
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3-pip python3-dev git && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Make `python` resolve to python3
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1

WORKDIR /app
COPY requirements.txt /app/requirements.txt

# Install the Python dependencies listed in requirements.txt;
# --no-cache-dir keeps pip's download cache out of the image layer
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# A wildcard source needs a directory destination, hence the trailing slash
COPY ./*.py /app/

# Default command runs the CUDA check script once and exits
CMD ["python3", "main.py"]
13 changes: 13 additions & 0 deletions resources/workload/k8s.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
---
# One-shot GPU test pod: requests a single nvidia.com/gpu, runs the
# gpu-test image once and is never restarted.
apiVersion: v1
kind: Pod
metadata:
  name: test
spec:
  restartPolicy: Never
  containers:
    - name: test
      image: ghcr.io/tmilos77/gpu-test:cuda
      imagePullPolicy: Always
      resources:
        limits:
          nvidia.com/gpu: "1"
20 changes: 20 additions & 0 deletions resources/workload/log.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@

==========
== CUDA ==
==========

CUDA Version 11.8.0

Container image Copyright (c) 2016-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.

This container image and its contents are governed by the NVIDIA Deep Learning Container License.
By pulling and using the container, you accept the terms and conditions of this license:
https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license

A copy of this license is made available in this container at /NGC-DL-CONTAINER-LICENSE for your convenience.

---

cuda_available: true
device_count: 1
device_name: NVIDIA L4
14 changes: 14 additions & 0 deletions resources/workload/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import torch
import yaml

# Collect basic CUDA facts and print them as a YAML document.
data = dict()
is_available = torch.cuda.is_available()
data['cuda_available'] = is_available

# Device details are only queried when CUDA is usable.
if is_available:
    data['device_count'] = torch.cuda.device_count()
    data['device_name'] = torch.cuda.get_device_name(0)

# A '---' line and a blank line precede the YAML report.
print('---')
print('')
print(yaml.dump(data))
3 changes: 3 additions & 0 deletions resources/workload/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
torch
pyyaml
numpy
5 changes: 5 additions & 0 deletions resources/workload/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/usr/bin/env bash
#
# Applies the GPU test pod from k8s.yaml and waits until it has succeeded.
#
# Environment:
#   WAIT_TIMEOUT  how long to wait for the pod to reach Succeeded (default: 10m;
#                 kubectl's own default of 30s is short for pulling a CUDA image)

set -euo pipefail

# Resolve k8s.yaml next to this script so it works from any working directory
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

kubectl apply -f "$SCRIPT_DIR/k8s.yaml"

# The pod name must match metadata.name in k8s.yaml ("test")
kubectl wait --for=jsonpath='{.status.phase}'=Succeeded --timeout="${WAIT_TIMEOUT:-10m}" pod/test
12 changes: 12 additions & 0 deletions tools/scripts/extract_build_image_kernels.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env bash
#
# For every locally available tag of the kmodbuild image, runs
# extract_kernel_name.sh inside that image and prints one "<kernel>: <tag>" line.
# Only images already present in the local docker image store are inspected.

IMAGE=ghcr.io/gardenlinux/gardenlinux/kmodbuild

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
INSTALLER_DIR="$SCRIPT_DIR/../../charts/gpu-driver/files/gardenlinux-nvidia-installer"

# --format prints only the tag column, so no header line has to be stripped
TAGS=$(docker image ls --format '{{.Tag}}' "$IMAGE")

while IFS= read -r TAG; do
  # The here-string yields one empty line when there are no images,
  # and untagged images are listed as "<none>"; neither can be run.
  if [[ -z "$TAG" || "$TAG" == "<none>" ]]; then
    continue
  fi

  # Report a failed run on stderr instead of printing an entry with an empty kernel
  if ! KERNEL=$(docker run --platform linux/amd64 --rm -v "$INSTALLER_DIR:/mnt/scripts" "$IMAGE:$TAG" /mnt/scripts/extract_kernel_name.sh cloud); then
    echo "failed to extract kernel name from $IMAGE:$TAG" >&2
    continue
  fi

  echo " $KERNEL: $TAG"
done <<< "$TAGS"

0 comments on commit ad667fa

Please sign in to comment.