diff --git a/charts/gpu-driver/Chart.yaml b/charts/gpu-driver/Chart.yaml index caa356e..999ca51 100644 --- a/charts/gpu-driver/Chart.yaml +++ b/charts/gpu-driver/Chart.yaml @@ -15,10 +15,10 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.3.0 +version: 0.4.0 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "0.3.0" +appVersion: "0.4.0" diff --git a/charts/gpu-driver/files/entrypoint.sh b/charts/gpu-driver/files/entrypoint.sh index 1da00da..4088209 100644 --- a/charts/gpu-driver/files/entrypoint.sh +++ b/charts/gpu-driver/files/entrypoint.sh @@ -17,13 +17,14 @@ fi log() { local NAME="kyma-gpu-driver" local MSG=$1 - local TS=$(date '+%Y-%m-%d %H:%M:%S') + local TS + TS=$(date '+%Y-%m-%d %H:%M:%S') echo "${TS} ${NAME}: $MSG" } -export KERNEL_NAME=$(./extract_kernel_name.sh ${KERNEL_TYPE} ${TARGET_ARCH}) +KERNEL_NAME=$(./extract_kernel_name.sh "${KERNEL_TYPE}" "${TARGET_ARCH}") +export KERNEL_NAME -COMPILED_FILENAME="${DRIVER_VERSION}" log "Compiling the GPU driver" diff --git a/charts/gpu-driver/files/gardenlinux-nvidia-installer/download_fabricmanager.sh b/charts/gpu-driver/files/gardenlinux-nvidia-installer/download_fabricmanager.sh index f54dd7c..cf1b837 100755 --- a/charts/gpu-driver/files/gardenlinux-nvidia-installer/download_fabricmanager.sh +++ b/charts/gpu-driver/files/gardenlinux-nvidia-installer/download_fabricmanager.sh @@ -21,7 +21,7 @@ mkdir -p /tmp/nvidia pushd /tmp/nvidia # Download Fabric Manager tarball -wget -O /tmp/keyring.deb 
https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb && dpkg -i /tmp/keyring.deb +wget --retry-connrefused --waitretry=1 --read-timeout=30 --timeout=35 -t 10 -O /tmp/keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb && dpkg -i /tmp/keyring.deb apt-get update apt-get install -V nvidia-fabricmanager-"$DRIVER_BRANCH"="$DRIVER_VERSION"-1 diff --git a/charts/gpu-driver/files/node-labeler.sh b/charts/gpu-driver/files/node-labeler.sh new file mode 100755 index 0000000..ff7b2b4 --- /dev/null +++ b/charts/gpu-driver/files/node-labeler.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +set -euo pipefail + +echo "Node name: $NODENAME" + +KERNEL_VERSION=$(kubectl get node "$NODENAME" -o jsonpath='{.status.nodeInfo.kernelVersion}') + +kubectl label node "$NODENAME" "gpu.kyma-project.io/kernel-version=$KERNEL_VERSION" --overwrite + +echo "Labeled node $NODENAME with gpu.kyma-project.io/kernel-version=$KERNEL_VERSION" + +sleep Infinity & wait diff --git a/charts/gpu-driver/templates/_helpers.tpl b/charts/gpu-driver/templates/_helpers.tpl index 2a64613..24d5401 100644 --- a/charts/gpu-driver/templates/_helpers.tpl +++ b/charts/gpu-driver/templates/_helpers.tpl @@ -16,17 +16,6 @@ {{- end -}} -{{- define "gardenlinux.version" -}} -{{- if .Values.kernelVersion -}} -{{- if not (hasKey .Values.kernelVersions .Values.kernelVersion) }}{{ fail (printf "Unknown kernel version '%s'" .Values.kernelVersion) }}{{ end -}} -{{- get .Values.kernelVersions .Values.kernelVersion -}} -{{- else if .Values.gardenlinux.versionOverride -}} -{{- else -}} -{{- fail ".kernelVersion or .gardenlinux.versionOverride must be set" -}} -{{- end -}} -{{- end -}} - - {{- define "image-pull-secrets" -}} {{- with .Values.imagePullSecrets }} {{- toYaml . 
| nindent 8 }} @@ -36,13 +25,33 @@ {{- end }} {{- end -}} -{{- define "node-selector" -}} -{{- $dict := .Values.nodeSelector }} +{{- define "node-selector.no-kernel" -}} +{{- /* Copy into a fresh dict: sprig merge mutates and returns its first argument. */ -}}{{- $dict := merge (dict) (default (dict) .Values.nodeSelector) }} {{- if .Values.nodePool -}} {{- $dict = set $dict "worker.gardener.cloud/pool" .Values.nodePool -}} {{- end -}} +{{- $dict = unset $dict "gpu.kyma-project.io/kernel-version" -}} +{{- if $dict -}} + nodeSelector: +{{ toYaml $dict | indent 8 }} +{{- else }} +# no node selector +{{- end -}} +{{- end -}} + + +{{- define "node-selector.with-kernel" -}} +{{- /* Copy into a fresh dict: sprig merge mutates and returns its first argument. */ -}}{{- $dict := merge (dict) (default (dict) .Values.nodeSelector) }} +{{- if .Values.nodePool -}} +{{- $dict = set $dict "worker.gardener.cloud/pool" .Values.nodePool -}} +{{- end -}} +{{- if .Values.kernel -}} +{{- $dict = set $dict "gpu.kyma-project.io/kernel-version" .Values.kernel -}} +{{- end -}} {{- if $dict -}} nodeSelector: {{ toYaml $dict | indent 8 }} +{{- else }} +# no node selector {{- end -}} {{- end -}} diff --git a/charts/gpu-driver/templates/device-plugin.yaml b/charts/gpu-driver/templates/device-plugin.yaml index 8249240..5dae830 100644 --- a/charts/gpu-driver/templates/device-plugin.yaml +++ b/charts/gpu-driver/templates/device-plugin.yaml @@ -23,7 +23,7 @@ spec: nodeAffinity: {{ toYaml .Values.nodeAffinity | indent 10 }} {{- end}} - priorityClassName: system-node-critical + priorityClassName: system-cluster-critical volumes: - name: device-plugin hostPath: @@ -59,10 +59,7 @@ spec: mountPath: /dev - mountPath: /usr/local/nvidia name: nvidia -{{- if .Values.nodeSelector }} - nodeSelector: -{{ toYaml .Values.nodeSelector | indent 8 }} -{{- end }} + {{ template "node-selector.no-kernel" $ }} {{- if .Values.tolerations }} tolerations: {{ toYaml .Values.tolerations | indent 6 }} diff --git a/charts/gpu-driver/templates/driver-installer.yaml b/charts/gpu-driver/templates/driver-installer.yaml index 244ed61..6c15758 100644 --- a/charts/gpu-driver/templates/driver-installer.yaml +++ 
b/charts/gpu-driver/templates/driver-installer.yaml @@ -1,45 +1,59 @@ +{{- range $kernel, $tag := .Values.kernelVersions }} +{{- $safeKernel := ($kernel | replace "." "-") -}} +{{- $valuesOverlay := dict "kernel" $kernel "safeKernel" $safeKernel -}} +{{- $rootOverlay := dict "Values" $valuesOverlay -}} +{{- $root := merge $rootOverlay $ }} +--- +# +# kernel: {{ $kernel }} +# safeKernel: {{ $safeKernel }} +# +# apiVersion: apps/v1 kind: DaemonSet metadata: - name: {{ template "gpu-driver.name" . }} - namespace: {{ .Release.Namespace }} + name: {{ template "gpu-driver.name" $root }}-{{ $root.Values.safeKernel }} + namespace: {{ $root.Release.Namespace }} labels: - gpu.kyma-project.io/kernel-version: {{ .Values.kernelVersion }} + gpu.kyma-project.io/kernel-version: "{{ $root.Values.kernel }}" + gpu.kyma-project.io/build-image-version: "{{ $tag }}" spec: updateStrategy: type: OnDelete selector: matchLabels: - name: {{ template "gpu-driver.name" . }} + name: {{ template "gpu-driver.name" $root }} + gpu.kyma-project.io/kernel-version: "{{ $root.Values.kernel }}" # keeps the per-kernel DaemonSet selectors disjoint template: metadata: labels: - name: {{ template "gpu-driver.name" . }} - gpu.kyma-project.io/kernel-version: {{ .Values.kernelVersion }} + name: {{ template "gpu-driver.name" $root }} + gpu.kyma-project.io/kernel-version: "{{ $root.Values.kernel }}" + gpu.kyma-project.io/build-image-version: "{{ $tag }}" spec: priorityClassName: system-node-critical - {{- if .Values.nodeAffinity }} + {{- if $root.Values.nodeAffinity }} affinity: nodeAffinity: -{{ toYaml .Values.nodeAffinity | indent 10 }} +{{ toYaml $root.Values.nodeAffinity | indent 10 }} {{- end }} hostPID: true - imagePullSecrets: {{ template "image-pull-secrets" . 
}} + imagePullSecrets: {{ template "image-pull-secrets" $ }} containers: - name: gpu-driver - {{- $imageUri := .Values.gardenlinux.image }} - {{- if .Values.gardenlinux.imageRegistry }} - {{ $imageUri = printf "%s/%s" .Values.gardenlinux.imageRegistry $imageUri }} + {{- $imageUri := $root.Values.gardenlinux.image }} + {{- if $root.Values.gardenlinux.imageRegistry }} + {{ $imageUri = printf "%s/%s" $root.Values.gardenlinux.imageRegistry $imageUri }} {{- end }} - image: {{ $imageUri }}:{{ template "gardenlinux.version" . }} + image: {{ $imageUri }}:{{ $tag }} workingDir: "/work" command: - "/work/entrypoint.sh" securityContext: privileged: true -{{- if .Values.nvidia.installer.resources }} +{{- if $root.Values.nvidia.installer.resources }} resources: -{{ toYaml .Values.nvidia.installer.resources | indent 12 }} +{{ toYaml $root.Values.nvidia.installer.resources | indent 12 }} {{- end }} env: - name: KERNEL_TYPE @@ -51,9 +65,9 @@ spec: - name: LD_ROOT value: /root - name: DEBUG - value: {{ .Values.debug | quote }} + value: {{ $root.Values.debug | quote }} - name: HOST_DRIVER_PATH - value: {{ .Values.nvidia.installer.hostDriverPath }} + value: {{ $root.Values.nvidia.installer.hostDriverPath }} volumeMounts: - name: gpu-driver mountPath: /work @@ -64,16 +78,16 @@ spec: - name: module-cache mountPath: /opt/nvidia-installer/cache - name: module-install-dir-base - mountPath: {{ required "nvidia.installer.hostDriverPath" .Values.nvidia.installer.hostDriverPath }} - {{ template "node-selector" . }} -{{- if .Values.tolerations }} + mountPath: {{ required "nvidia.installer.hostDriverPath" $root.Values.nvidia.installer.hostDriverPath }} + {{ template "node-selector.with-kernel" $root }} +{{- if $root.Values.tolerations }} tolerations: -{{ toYaml .Values.tolerations | indent 6 }} +{{ toYaml $root.Values.tolerations | indent 6 }} {{- end }} volumes: - name: gpu-driver configMap: - name: {{ template "gpu-driver.name" . 
}} + name: {{ template "gpu-driver.name" $ }} defaultMode: 0744 - name: dev hostPath: @@ -86,4 +100,6 @@ spec: path: /opt/nvidia-installer/cache - name: module-install-dir-base hostPath: - path: {{ required "nvidia.installer.hostDriverPath" .Values.nvidia.installer.hostDriverPath }} + path: {{ required "nvidia.installer.hostDriverPath" $root.Values.nvidia.installer.hostDriverPath }} + +{{- end }} diff --git a/charts/gpu-driver/templates/node-labeler.yaml b/charts/gpu-driver/templates/node-labeler.yaml new file mode 100644 index 0000000..afdec76 --- /dev/null +++ b/charts/gpu-driver/templates/node-labeler.yaml @@ -0,0 +1,51 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{ template "node-labeler.name" . }} + namespace: {{ .Release.Namespace }} +spec: + updateStrategy: + type: RollingUpdate + selector: + matchLabels: + name: {{ template "node-labeler.name" . }} + template: + metadata: + labels: + name: {{ template "node-labeler.name" . }} + spec: + priorityClassName: system-node-critical + imagePullSecrets: {{ template "image-pull-secrets" . }} + containers: + - name: node-labeler + {{- $imageUri := .Values.kubectl.image }} + {{- if .Values.kubectl.imageRegistry }} + {{ $imageUri = printf "%s/%s" .Values.kubectl.imageRegistry $imageUri }} + {{- end }} + image: {{ $imageUri }}:{{ required "kubectl.version" .Values.kubectl.version }} + securityContext: + runAsUser: 65534 + runAsGroup: 65534 + runAsNonRoot: true + allowPrivilegeEscalation: false + capabilities: + drop: + - "ALL" + workingDir: "/work" + command: + - "/work/node-labeler.sh" + env: + - name: NODENAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: gpu-driver + mountPath: /work + serviceAccountName: {{ template "rbac.name" . }} + volumes: + - name: gpu-driver + configMap: + name: {{ template "gpu-driver.name" . 
}} + defaultMode: 0777 # octal file mode applied to every file projected from the ConfigMap + diff --git a/charts/gpu-driver/templates/rbac.yaml b/charts/gpu-driver/templates/rbac.yaml new file mode 100644 index 0000000..2465062 --- /dev/null +++ b/charts/gpu-driver/templates/rbac.yaml @@ -0,0 +1,37 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ template "rbac.name" . }} + namespace: {{ .Release.Namespace }} + +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole # cluster-scoped: the rule below targets nodes, which are not namespaced +metadata: + name: {{ template "rbac.name" . }} +rules: + - apiGroups: [""] # "" selects the core API group + resources: + - nodes + verbs: + - get + - list + - watch + - update + - patch + +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding # grants the ClusterRole above to the ServiceAccount in the release namespace +metadata: + name: {{ template "rbac.name" . }} +subjects: + - kind: ServiceAccount + name: {{ template "rbac.name" . }} + namespace: {{ .Release.Namespace }} +roleRef: + kind: ClusterRole + name: {{ template "rbac.name" . }} + apiGroup: rbac.authorization.k8s.io diff --git a/charts/gpu-driver/values.yaml b/charts/gpu-driver/values.yaml index eaf6251..e6a29fe 100644 --- a/charts/gpu-driver/values.yaml +++ b/charts/gpu-driver/values.yaml @@ -8,9 +8,6 @@ imagePullSecrets: [] # Set to true to output all shell commands (enables `set -x` in scripts) debug: true -# required, must be set, ie 6.6.63-cloud-amd64 -kernelVersion: "" - nodePool: "" nodeSelector: {} @@ -33,7 +30,6 @@ kubectl: gardenlinux: imageRegistry: ghcr.io image: gardenlinux/gardenlinux/kmodbuild - versionOverride: "" # ie amd64-1592.4 nvidia: driverVersion: "550.127.08" @@ -49,7 +45,7 @@ nvidia: # mapping from the kernel version to the build image tag kernelVersions: - 6.6.63-cloud-amd64: amd64-1592.4 6.6.62-cloud-amd64: amd64-1592.3 + 6.6.63-cloud-amd64: amd64-1592.4 6.6.71-cloud-amd64: amd64-1592.5 6.6.78-cloud-amd64: amd64-1592.6