Skip to content

Commit

Permalink
gpu-driver chart 0.4.0 with multiple daemonsets
Browse files Browse the repository at this point in the history
  • Loading branch information
tmilos77 committed Feb 21, 2025
1 parent d0f7ef9 commit 461c0cd
Show file tree
Hide file tree
Showing 10 changed files with 171 additions and 52 deletions.
4 changes: 2 additions & 2 deletions charts/gpu-driver/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.3.0
version: 0.4.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "0.3.0"
appVersion: "0.4.0"
7 changes: 4 additions & 3 deletions charts/gpu-driver/files/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,14 @@ fi
log() {
local NAME="kyma-gpu-driver"
local MSG=$1
local TS=$(date '+%Y-%m-%d %H:%M:%S')
local TS
TS=$(date '+%Y-%m-%d %H:%M:%S')
echo "${TS} ${NAME}: $MSG"
}

export KERNEL_NAME=$(./extract_kernel_name.sh ${KERNEL_TYPE} ${TARGET_ARCH})
KERNEL_NAME=$(./extract_kernel_name.sh ${KERNEL_TYPE} ${TARGET_ARCH})
export KERNEL_NAME

COMPILED_FILENAME="${DRIVER_VERSION}"

log "Compiling the GPU driver"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ mkdir -p /tmp/nvidia
pushd /tmp/nvidia

# Download Fabric Manager tarball
wget -O /tmp/keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb && dpkg -i /tmp/keyring.deb
wget --retry-connrefused --waitretry=1 --read-timeout=30 --timeout=35 -t 10 -O /tmp/keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb && dpkg -i /tmp/keyring.deb
apt-get update
apt-get install -V nvidia-fabricmanager-"$DRIVER_BRANCH"="$DRIVER_VERSION"-1

13 changes: 13 additions & 0 deletions charts/gpu-driver/files/node-labeler.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env bash
#
# Labels one node with the kernel version it reports, then idles so the pod
# that runs this script stays alive.
#
# Required environment:
#   NODENAME  name of the node to read and label
#
# Requires kubectl on PATH with permission to read and label nodes.

set -euo pipefail

# Fail with an explicit message (instead of bash's generic "unbound variable")
# and also reject an empty value.
: "${NODENAME:?NODENAME must be set to the name of the node to label}"

echo "Node name: $NODENAME"

KERNEL_VERSION=$(kubectl get node "$NODENAME" -o jsonpath='{.status.nodeInfo.kernelVersion}')

# An empty result would otherwise be written as an empty label value.
if [[ -z "$KERNEL_VERSION" ]]; then
    echo "Node $NODENAME reports no kernel version; not labeling" >&2
    exit 1
fi

kubectl label node "$NODENAME" "gpu.kyma-project.io/kernel-version=$KERNEL_VERSION" --overwrite

echo "Labeled node $NODENAME with gpu.kyma-project.io/kernel-version=$KERNEL_VERSION"

# Exit promptly when asked to stop. Without a handler, a shell running as PID 1
# of a container ignores SIGTERM and is only stopped by SIGKILL once the
# termination grace period has elapsed. `wait` (unlike a foreground sleep) is
# interrupted by a trapped signal, which is why sleep runs in the background.
trap 'exit 0' TERM INT

sleep Infinity & wait
35 changes: 22 additions & 13 deletions charts/gpu-driver/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,6 @@
{{- end -}}


{{- define "gardenlinux.version" -}}
{{- if .Values.kernelVersion -}}
{{- if not (hasKey .Values.kernelVersions .Values.kernelVersion) }}{{ fail (printf "Unknown kernel version '%s'" .Values.kernelVersion) }}{{ end -}}
{{- get .Values.kernelVersions .Values.kernelVersion -}}
{{- else if .Values.gardenlinux.versionOverride -}}
{{- else -}}
{{- fail ".kernelVersion or .gardenlinux.versionOverride must be set" -}}
{{- end -}}
{{- end -}}


{{- define "image-pull-secrets" -}}
{{- with .Values.imagePullSecrets }}
{{- toYaml . | nindent 8 }}
Expand All @@ -36,13 +25,33 @@
{{- end }}
{{- end -}}

{{- define "node-selector" -}}
{{- $dict := .Values.nodeSelector }}
{{- /*
node-selector.no-kernel

Renders a pod-level `nodeSelector:` built from .Values.nodeSelector plus, when
.Values.nodePool is set, the "worker.gardener.cloud/pool" label. Any
"gpu.kyma-project.io/kernel-version" entry is removed, so the workload is not
pinned to a single kernel. Renders only a YAML comment when the resulting
selector is empty.

Expects the root context (reads .Values).

The selector is assembled in a fresh dict. Sprig's `merge` writes into its
FIRST argument, so using .Values.nodeSelector as the destination would let the
set/unset calls below modify the shared values seen by every other template.
*/ -}}
{{- define "node-selector.no-kernel" -}}
{{- $selector := merge (dict) (default (dict) .Values.nodeSelector) -}}
{{- if .Values.nodePool -}}
{{- $selector = set $selector "worker.gardener.cloud/pool" .Values.nodePool -}}
{{- end -}}
{{- $selector = unset $selector "gpu.kyma-project.io/kernel-version" -}}
{{- if $selector -}}
nodeSelector:
{{ toYaml $selector | indent 8 }}
{{- else }}
# no node selector
{{- end -}}
{{- end -}}


{{- /*
node-selector.with-kernel

Renders a pod-level `nodeSelector:` built from .Values.nodeSelector plus, when
.Values.nodePool is set, the "worker.gardener.cloud/pool" label and, when
.Values.kernel is set, the "gpu.kyma-project.io/kernel-version" label. Renders
only a YAML comment when the resulting selector is empty.

Expects a root-like context whose .Values carries `kernel`
(driver-installer.yaml builds one per entry of .Values.kernelVersions).

The selector is assembled in a fresh dict. Sprig's `merge` writes into its
FIRST argument, so using .Values.nodeSelector as the destination would leak
the per-kernel label into the shared values seen by every other template.
*/ -}}
{{- define "node-selector.with-kernel" -}}
{{- $selector := merge (dict) (default (dict) .Values.nodeSelector) -}}
{{- if .Values.nodePool -}}
{{- $selector = set $selector "worker.gardener.cloud/pool" .Values.nodePool -}}
{{- end -}}
{{- if .Values.kernel -}}
{{- $selector = set $selector "gpu.kyma-project.io/kernel-version" .Values.kernel -}}
{{- end -}}
{{- if $selector -}}
nodeSelector:
{{ toYaml $selector | indent 8 }}
{{- else }}
# no node selector
{{- end -}}
{{- end -}}
7 changes: 2 additions & 5 deletions charts/gpu-driver/templates/device-plugin.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ spec:
nodeAffinity:
{{ toYaml .Values.nodeAffinity | indent 10 }}
{{- end}}
priorityClassName: system-node-critical
priorityClassName: system-cluster-critical
volumes:
- name: device-plugin
hostPath:
Expand Down Expand Up @@ -59,10 +59,7 @@ spec:
mountPath: /dev
- mountPath: /usr/local/nvidia
name: nvidia
{{- if .Values.nodeSelector }}
nodeSelector:
{{ toYaml .Values.nodeSelector | indent 8 }}
{{- end }}
{{ template "node-selector.no-kernel" $ }}
{{- if .Values.tolerations }}
tolerations:
{{ toYaml .Values.tolerations | indent 6 }}
Expand Down
61 changes: 38 additions & 23 deletions charts/gpu-driver/templates/driver-installer.yaml
Original file line number Diff line number Diff line change
@@ -1,45 +1,58 @@
{{- range $kernel, $tag := .Values.kernelVersions }}
{{- $safeKernel := ($kernel | replace "." "-") -}}
{{- $valuesOverlay := dict "kernel" $kernel "safeKernel" $safeKernel -}}
{{- $rootOverlay := dict "Values" $valuesOverlay -}}
{{- $root := merge $rootOverlay $ }}
---
#
# kernel: {{ $kernel }}
# safeKernel {{ $safeKernel }}
#
#
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: {{ template "gpu-driver.name" . }}
namespace: {{ .Release.Namespace }}
name: {{ template "gpu-driver.name" $root }}-{{ $root.Values.safeKernel }}
namespace: {{ $root.Release.Namespace }}
labels:
gpu.kyma-project.io/kernel-version: {{ .Values.kernelVersion }}
gpu.kyma-project.io/kernel-version: "{{ $root.Values.kernel }}"
gpu.kyma-project.io/build-image-version: "{{ $tag }}"
spec:
updateStrategy:
type: OnDelete
selector:
matchLabels:
name: {{ template "gpu-driver.name" . }}
name: {{ template "gpu-driver.name" $root }}
template:
metadata:
labels:
name: {{ template "gpu-driver.name" . }}
gpu.kyma-project.io/kernel-version: {{ .Values.kernelVersion }}
name: {{ template "gpu-driver.name" $root }}
gpu.kyma-project.io/kernel-version: "{{ $root.Values.kernel }}"
gpu.kyma-project.io/build-image-version: "{{ $tag }}"
spec:
priorityClassName: system-node-critical
{{- if .Values.nodeAffinity }}
{{- if $root.Values.nodeAffinity }}
affinity:
nodeAffinity:
{{ toYaml .Values.nodeAffinity | indent 10 }}
{{ toYaml $root.Values.nodeAffinity | indent 10 }}
{{- end }}
hostPID: true
imagePullSecrets: {{ template "image-pull-secrets" . }}
imagePullSecrets: {{ template "image-pull-secrets" $ }}
containers:
- name: gpu-driver
{{- $imageUri := .Values.gardenlinux.image }}
{{- if .Values.gardenlinux.imageRegistry }}
{{ $imageUri = printf "%s/%s" .Values.gardenlinux.imageRegistry $imageUri }}
{{- $imageUri := $root.Values.gardenlinux.image }}
{{- if $root.Values.gardenlinux.imageRegistry }}
{{ $imageUri = printf "%s/%s" $root.Values.gardenlinux.imageRegistry $imageUri }}
{{- end }}
image: {{ $imageUri }}:{{ template "gardenlinux.version" . }}
image: {{ $imageUri }}:{{ $tag }}
workingDir: "/work"
command:
- "/work/entrypoint.sh"
securityContext:
privileged: true
{{- if .Values.nvidia.installer.resources }}
{{- if $root.Values.nvidia.installer.resources }}
resources:
{{ toYaml .Values.nvidia.installer.resources | indent 12 }}
{{ toYaml $root.Values.nvidia.installer.resources | indent 12 }}
{{- end }}
env:
- name: KERNEL_TYPE
Expand All @@ -51,9 +64,9 @@ spec:
- name: LD_ROOT
value: /root
- name: DEBUG
value: {{ .Values.debug | quote }}
value: {{ $root.Values.debug | quote }}
- name: HOST_DRIVER_PATH
value: {{ .Values.nvidia.installer.hostDriverPath }}
value: {{ $root.Values.nvidia.installer.hostDriverPath }}
volumeMounts:
- name: gpu-driver
mountPath: /work
Expand All @@ -64,16 +77,16 @@ spec:
- name: module-cache
mountPath: /opt/nvidia-installer/cache
- name: module-install-dir-base
mountPath: {{ required "nvidia.installer.hostDriverPath" .Values.nvidia.installer.hostDriverPath }}
{{ template "node-selector" . }}
{{- if .Values.tolerations }}
mountPath: {{ required "nvidia.installer.hostDriverPath" $root.Values.nvidia.installer.hostDriverPath }}
{{ template "node-selector.with-kernel" $root }}
{{- if $root.Values.tolerations }}
tolerations:
{{ toYaml .Values.tolerations | indent 6 }}
{{ toYaml $root.Values.tolerations | indent 6 }}
{{- end }}
volumes:
- name: gpu-driver
configMap:
name: {{ template "gpu-driver.name" . }}
name: {{ template "gpu-driver.name" $ }}
defaultMode: 0744
- name: dev
hostPath:
Expand All @@ -86,4 +99,6 @@ spec:
path: /opt/nvidia-installer/cache
- name: module-install-dir-base
hostPath:
path: {{ required "nvidia.installer.hostDriverPath" .Values.nvidia.installer.hostDriverPath }}
path: {{ required "nvidia.installer.hostDriverPath" $root.Values.nvidia.installer.hostDriverPath }}

{{- end }}
51 changes: 51 additions & 0 deletions charts/gpu-driver/templates/node-labeler.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# DaemonSet that runs node-labeler.sh in a pod on each node it is scheduled to.
# The script labels that node with gpu.kyma-project.io/kernel-version, the label
# the per-kernel driver-installer DaemonSets select on.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: {{ template "node-labeler.name" . }}
  namespace: {{ .Release.Namespace }}
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      name: {{ template "node-labeler.name" . }}
  template:
    metadata:
      labels:
        name: {{ template "node-labeler.name" . }}
    spec:
      priorityClassName: system-node-critical
      # The helper reads .Values, so it must receive the root context. Invoking
      # it without an argument passes nil and rendering fails.
      imagePullSecrets: {{ template "image-pull-secrets" . }}
      containers:
        - name: node-labeler
          {{- $imageUri := .Values.kubectl.image }}
          {{- if .Values.kubectl.imageRegistry }}
          {{- $imageUri = printf "%s/%s" .Values.kubectl.imageRegistry $imageUri }}
          {{- end }}
          image: {{ $imageUri }}:{{ required "kubectl.version" .Values.kubectl.version }}
          # Runs unprivileged: the script only talks to the API server.
          securityContext:
            runAsUser: 65534
            runAsGroup: 65534
            runAsNonRoot: true
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - "ALL"
          workingDir: "/work"
          command:
            - "/work/node-labeler.sh"
          env:
            # Name of the node this pod runs on, via the downward API.
            - name: NODENAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          volumeMounts:
            - name: gpu-driver
              mountPath: /work
      serviceAccountName: {{ template "rbac.name" . }}
      # Same chart-wide tolerations as the driver installer and device plugin.
      # Without them the labeler cannot run on tainted nodes, and those nodes
      # never receive the kernel-version label the driver installer selects on.
      {{- with .Values.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      volumes:
        - name: gpu-driver
          configMap:
            name: {{ template "gpu-driver.name" . }}
            # The script must be executable by the non-root user above.
            defaultMode: 0777

37 changes: 37 additions & 0 deletions charts/gpu-driver/templates/rbac.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# RBAC objects for the node-labeler DaemonSet, which runs under this
# ServiceAccount (it is referenced there through serviceAccountName).
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ template "rbac.name" . }}
namespace: {{ .Release.Namespace }}

---

# Node access for node-labeler.sh, which runs `kubectl get node` and
# `kubectl label node`. Nodes are cluster-scoped, hence a ClusterRole.
# NOTE(review): the script appears to need only get and patch; confirm whether
# list, watch and update are required before narrowing the verbs.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: {{ template "rbac.name" . }}
rules:
- apiGroups: [""]
resources:
- nodes
verbs:
- get
- list
- watch
- update
- patch

---

# Grants the ClusterRole above to the ServiceAccount in the release namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: {{ template "rbac.name" . }}
subjects:
- kind: ServiceAccount
name: {{ template "rbac.name" . }}
namespace: {{ .Release.Namespace }}
roleRef:
kind: ClusterRole
name: {{ template "rbac.name" . }}
apiGroup: rbac.authorization.k8s.io
6 changes: 1 addition & 5 deletions charts/gpu-driver/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@ imagePullSecrets: []
# Set to true to output all shell commands (enables `set -x` in scripts)
debug: true

# required, must be set, ie 6.6.63-cloud-amd64
kernelVersion: ""

nodePool: ""

nodeSelector: {}
Expand All @@ -33,7 +30,6 @@ kubectl:
gardenlinux:
imageRegistry: ghcr.io
image: gardenlinux/gardenlinux/kmodbuild
versionOverride: "" # ie amd64-1592.4

nvidia:
driverVersion: "550.127.08"
Expand All @@ -49,7 +45,7 @@ nvidia:

# mapping from the kernel version to the build image tag
kernelVersions:
6.6.63-cloud-amd64: amd64-1592.4
6.6.62-cloud-amd64: amd64-1592.3
6.6.63-cloud-amd64: amd64-1592.4
6.6.71-cloud-amd64: amd64-1592.5
6.6.78-cloud-amd64: amd64-1592.6

0 comments on commit 461c0cd

Please sign in to comment.