diff --git a/charts/nvidia-vgpu/config b/charts/nvidia-vgpu/config
index 58979bd01..d71eb1168 100644
--- a/charts/nvidia-vgpu/config
+++ b/charts/nvidia-vgpu/config
@@ -4,7 +4,7 @@ export USE_OPENSOURCE_CHART=false
 export REPO_URL=https://project-hami.github.io/HAMi/
 export REPO_NAME=hami
 export CHART_NAME=hami
-export VERSION=2.4.0
+export VERSION=2.4.1
 
 # pr, issue, none
 export UPGRADE_METHOD=pr
diff --git a/charts/nvidia-vgpu/nvidia-vgpu/Chart.yaml b/charts/nvidia-vgpu/nvidia-vgpu/Chart.yaml
index 3596496c9..40195c2d8 100644
--- a/charts/nvidia-vgpu/nvidia-vgpu/Chart.yaml
+++ b/charts/nvidia-vgpu/nvidia-vgpu/Chart.yaml
@@ -1,5 +1,5 @@
 apiVersion: v2
-appVersion: 2.4.0
+appVersion: 2.4.1
 description: Heterogeneous AI Computing Virtualization Middleware
 keywords:
 - vgpu
@@ -8,9 +8,9 @@ kubeVersion: '>= 1.16.0'
 maintainers: []
 name: nvidia-vgpu
 type: application
-version: 2.4.0+1
+version: 2.4.1+1
 dependencies:
 - name: hami
-  version: "2.4.0"
+  version: "2.4.1"
   repository: "https://project-hami.github.io/HAMi/"
 icon: data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyNTAwIiBoZWlnaHQ9IjE4NDEiIHZpZXdCb3g9IjM1LjE4OCAzMS41MTIgMzUxLjQ2IDI1OC43ODUiPjx0aXRsZT5nZW5lcmF0ZWQgYnkgcHN0b2VkaXQgdmVyc2lvbjozLjQ0IGZyb20gTlZCYWRnZV8yRC5lcHM8L3RpdGxlPjxwYXRoIGQ9Ik0zODQuMTk1IDI4Mi4xMDljMCAzLjc3MS0yLjc2OSA2LjMwMi02LjA0NyA2LjMwMnYtLjAyM2MtMy4zNzEuMDIzLTYuMDg5LTIuNTA4LTYuMDg5LTYuMjc4IDAtMy43NjkgMi43MTgtNi4yOTMgNi4wODktNi4yOTMgMy4yNzktLjAwMSA2LjA0NyAyLjUyMyA2LjA0NyA2LjI5MnptMi40NTMgMGMwLTUuMTc2LTQuMDItOC4xOC04LjUtOC4xOC00LjUxMSAwLTguNTMxIDMuMDA0LTguNTMxIDguMTggMCA1LjE3MiA0LjAyMSA4LjE4OCA4LjUzMSA4LjE4OCA0LjQ4IDAgOC41LTMuMDE2IDguNS04LjE4OG0tOS45MS42OTJoLjkxbDIuMTA5IDMuNzAzaDIuMzE1bC0yLjMzNi0zLjg1OWMxLjIwNy0uMDg2IDIuMi0uNjYgMi4yLTIuMjg1IDAtMi4wMi0xLjM5My0yLjY2OC0zLjc1LTIuNjY4aC0zLjQxMXY4LjgxMmgxLjk2MWwuMDAyLTMuNzAzbTAtMS40OTJ2LTIuMTIxaDEuMzY0Yy43NDIgMCAxLjc1My4wNiAxLjc1My45NjUgMCAuOTg0LS41MjMgMS4xNTYtMS4zOTggMS4xNTZoLTEuNzE5TTMyOS40MDYgMjM3LjAyN2wxMC41OTggMjguOTkySDMxOC40OGwxMC45MjYtMjguOTkyem0tMTEuMzUtMTEuMjg5bC0yNC40MjMgNjEuODhoMTcuMjQ1bDMuODYzLTEwLjkzNWgyOC45MDNsMy42NTYgMTAuOTM1aDE4LjcyMmwtMjQuNjA1LTYxLjg4OC0yMy4zNjEuMDA4em0tNDkuMDMzIDYxLjkwM2gxNy40OTd2LTYxLjkyMmwtMTcuNS0uMDA0LjAwMyA2MS45MjZ6bS0xMjEuNDY3LTYxLjkyNmwtMTQuNTk4IDQ5LjA3OC0xMy45ODQtNDkuMDc0LTE4Ljg3OS0uMDA0IDE5Ljk3MiA2MS45MjZoMjUuMjA3bDIwLjEzMy02MS45MjZoLTE3Ljg1MXptNzAuNzI1IDEzLjQ4NGg3LjUyMWMxMC45MDkgMCAxNy45NjYgNC44OTggMTcuOTY2IDE3LjYwOSAwIDEyLjcxMy03LjA1NyAxNy42MTItMTcuOTY2IDE3LjYxMmgtNy41MjF2LTM1LjIyMXptLTE3LjM1LTEzLjQ4NHY2MS45MjZoMjguMzY1YzE1LjExMyAwIDIwLjA0OS0yLjUxMiAyNS4zODUtOC4xNDcgMy43NjktMy45NTcgNi4yMDctMTIuNjQyIDYuMjA3LTIyLjEzNCAwLTguNzA3LTIuMDYzLTE2LjQ2OS01LjY2LTIxLjMwNS02LjQ4LTguNjQ4LTE1LjgxNi0xMC4zNC0yOS43NS0xMC4zNGgtMjQuNTQ3em0tMTY1Ljc0My0uMDg2djYyLjAxMmgxNy42NDV2LTQ3LjA4NmwxMy42NzIuMDA0YzQuNTI3IDAgNy43NTQgMS4xMjkgOS45MzQgMy40NTcgMi43NjUgMi45NDUgMy44OTQgNy42OTkgMy44OTQgMTYuMzk2djI3LjIyOWgxNy4wOTh2LTM0LjI2MmMwLTI0LjQ1My0xNS41ODYtMjcuNzUtMzAuODM2LTI3Ljc1SDM1LjE4OHptMTM3LjU4My4wODZsLjAwNyA2MS45MjZoMTcuNDg5di02MS45MjZoLTE3LjQ5NnoiLz48cGF0aCBkPSJNODIuMjExIDEwMi40MTRzMjIuNTA0LTMzLjIwMyA2Ny40MzctMzYuNjM4VjUzLjczYy00OS43NjkgMy45OTctOTIuODY3IDQ2LjE0OS05Mi44NjcgNDYuMTQ5czI0LjQxIDcwLjU2NCA5Mi44NjcgNzcuMDI2di0xMi44MDRjLTUwLjIzNy02LjMyLTY3LjQzNy02MS42ODctNjcuNDM3LTYxLjY4N3ptNjcuNDM3IDM2LjIyM3YxMS43MjdjLTM3Ljk2OC02Ljc3LTQ4LjUwNy00Ni4yMzctNDguNTA3LTQ2LjIzN3MxOC4yMy0yMC4xOTUgNDguNTA3LTIzLjQ3djEyLjg2N2MtLjAyMyAwLS4wMzktLjAwNy0uMDU4LS4wMDctMTUuODkxLTEuOTA3LTI4LjMwNSAxMi45MzgtMjguMzA1IDEyLjkzOHM2Ljk1OCAyNC45OSAyOC4zNjMgMzIuMTgybTAtMTA3LjEyNVY1My43M2MxLjQ2MS0uMTEyIDIuOTIyLS4yMDcgNC4zOTEtLjI1NyA1Ni41ODItMS45MDcgOTMuNDQ5IDQ2LjQwNiA5My40NDkgNDYuNDA2cy00Mi4zNDMgNTEuNDg4LTg2LjQ1NyA1MS40ODhjLTQuMDQzIDAtNy44MjgtLjM3NS0xMS4zODMtMS4wMDV2MTMuNzM5YTc1LjA0IDc1LjA0IDAgMCAwIDkuNDgxLjYxMmM0MS4wNTEgMCA3MC43MzgtMjAuOTY1IDk5LjQ4NC00NS43NzggNC43NjYgMy44MTcgMjQuMjc4IDEzLjEwMyAyOC4yODkgMTcuMTY3LTI3LjMzMiAyMi44ODQtOTEuMDMxIDQxLjMzLTEyNy4xNDQgNDEuMzMtMy40ODEgMC02LjgyNC0uMjExLTEwLjExLS41Mjh2MTkuMzA2SDMwNS42OFYzMS41MTJIMTQ5LjY0OHptMCA0OS4xNDRWNjUuNzc3YzEuNDQ2LS4xMDEgMi45MDMtLjE3OSA0LjM5MS0uMjI2IDQwLjY4OC0xLjI3OCA2Ny4zODIgMzQuOTY1IDY3LjM4MiAzNC45NjVzLTI4LjgzMiA0MC4wNDItNTkuNzQ2IDQwLjA0MmMtNC40NDkgMC04LjQzOC0uNzE1LTEyLjAyOC0xLjkyMlY5My41MjNjMTUuODQgMS45MTQgMTkuMDI4IDguOTExIDI4LjU1MSAyNC43ODZsMjEuMTgxLTE3Ljg1OXMtMTUuNDYxLTIwLjI3Ny00MS41MjQtMjAuMjc3Yy0yLjgzNC0uMDAxLTUuNTQ1LjE5OC04LjIwNy40ODMiIGZpbGw9IiM3N2I5MDAiLz48L3N2Zz4=
diff --git a/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/Chart.yaml b/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/Chart.yaml
index f5e5a7050..e1b1e216c 100644
--- a/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/Chart.yaml
+++ b/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/Chart.yaml
@@ -1,5 +1,5 @@
 apiVersion: v2
-appVersion: 2.4.0
+appVersion: 2.4.1
 description: Heterogeneous AI Computing Virtualization Middleware
 keywords:
 - vgpu
@@ -8,5 +8,5 @@ kubeVersion: '>= 1.16.0'
 maintainers: []
 name: hami
 type: application
-version: 2.4.0
+version: 2.4.1
 icon: data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyNTAwIiBoZWlnaHQ9IjE4NDEiIHZpZXdCb3g9IjM1LjE4OCAzMS41MTIgMzUxLjQ2IDI1OC43ODUiPjx0aXRsZT5nZW5lcmF0ZWQgYnkgcHN0b2VkaXQgdmVyc2lvbjozLjQ0IGZyb20gTlZCYWRnZV8yRC5lcHM8L3RpdGxlPjxwYXRoIGQ9Ik0zODQuMTk1IDI4Mi4xMDljMCAzLjc3MS0yLjc2OSA2LjMwMi02LjA0NyA2LjMwMnYtLjAyM2MtMy4zNzEuMDIzLTYuMDg5LTIuNTA4LTYuMDg5LTYuMjc4IDAtMy43NjkgMi43MTgtNi4yOTMgNi4wODktNi4yOTMgMy4yNzktLjAwMSA2LjA0NyAyLjUyMyA2LjA0NyA2LjI5MnptMi40NTMgMGMwLTUuMTc2LTQuMDItOC4xOC04LjUtOC4xOC00LjUxMSAwLTguNTMxIDMuMDA0LTguNTMxIDguMTggMCA1LjE3MiA0LjAyMSA4LjE4OCA4LjUzMSA4LjE4OCA0LjQ4IDAgOC41LTMuMDE2IDguNS04LjE4OG0tOS45MS42OTJoLjkxbDIuMTA5IDMuNzAzaDIuMzE1bC0yLjMzNi0zLjg1OWMxLjIwNy0uMDg2IDIuMi0uNjYgMi4yLTIuMjg1IDAtMi4wMi0xLjM5My0yLjY2OC0zLjc1LTIuNjY4aC0zLjQxMXY4LjgxMmgxLjk2MWwuMDAyLTMuNzAzbTAtMS40OTJ2LTIuMTIxaDEuMzY0Yy43NDIgMCAxLjc1My4wNiAxLjc1My45NjUgMCAuOTg0LS41MjMgMS4xNTYtMS4zOTggMS4xNTZoLTEuNzE5TTMyOS40MDYgMjM3LjAyN2wxMC41OTggMjguOTkySDMxOC40OGwxMC45MjYtMjguOTkyem0tMTEuMzUtMTEuMjg5bC0yNC40MjMgNjEuODhoMTcuMjQ1bDMuODYzLTEwLjkzNWgyOC45MDNsMy42NTYgMTAuOTM1aDE4LjcyMmwtMjQuNjA1LTYxLjg4OC0yMy4zNjEuMDA4em0tNDkuMDMzIDYxLjkwM2gxNy40OTd2LTYxLjkyMmwtMTcuNS0uMDA0LjAwMyA2MS45MjZ6bS0xMjEuNDY3LTYxLjkyNmwtMTQuNTk4IDQ5LjA3OC0xMy45ODQtNDkuMDc0LTE4Ljg3OS0uMDA0IDE5Ljk3MiA2MS45MjZoMjUuMjA3bDIwLjEzMy02MS45MjZoLTE3Ljg1MXptNzAuNzI1IDEzLjQ4NGg3LjUyMWMxMC45MDkgMCAxNy45NjYgNC44OTggMTcuOTY2IDE3LjYwOSAwIDEyLjcxMy03LjA1NyAxNy42MTItMTcuOTY2IDE3LjYxMmgtNy41MjF2LTM1LjIyMXptLTE3LjM1LTEzLjQ4NHY2MS45MjZoMjguMzY1YzE1LjExMyAwIDIwLjA0OS0yLjUxMiAyNS4zODUtOC4xNDcgMy43NjktMy45NTcgNi4yMDctMTIuNjQyIDYuMjA3LTIyLjEzNCAwLTguNzA3LTIuMDYzLTE2LjQ2OS01LjY2LTIxLjMwNS02LjQ4LTguNjQ4LTE1LjgxNi0xMC4zNC0yOS43NS0xMC4zNGgtMjQuNTQ3em0tMTY1Ljc0My0uMDg2djYyLjAxMmgxNy42NDV2LTQ3LjA4NmwxMy42NzIuMDA0YzQuNTI3IDAgNy43NTQgMS4xMjkgOS45MzQgMy40NTcgMi43NjUgMi45NDUgMy44OTQgNy42OTkgMy44OTQgMTYuMzk2djI3LjIyOWgxNy4wOTh2LTM0LjI2MmMwLTI0LjQ1My0xNS41ODYtMjcuNzUtMzAuODM2LTI3Ljc1SDM1LjE4OHptMTM3LjU4My4wODZsLjAwNyA2MS45MjZoMTcuNDg5di02MS45MjZoLTE3LjQ5NnoiLz48cGF0aCBkPSJNODIuMjExIDEwMi40MTRzMjIuNTA0LTMzLjIwMyA2Ny40MzctMzYuNjM4VjUzLjczYy00OS43NjkgMy45OTctOTIuODY3IDQ2LjE0OS05Mi44NjcgNDYuMTQ5czI0LjQxIDcwLjU2NCA5Mi44NjcgNzcuMDI2di0xMi44MDRjLTUwLjIzNy02LjMyLTY3LjQzNy02MS42ODctNjcuNDM3LTYxLjY4N3ptNjcuNDM3IDM2LjIyM3YxMS43MjdjLTM3Ljk2OC02Ljc3LTQ4LjUwNy00Ni4yMzctNDguNTA3LTQ2LjIzN3MxOC4yMy0yMC4xOTUgNDguNTA3LTIzLjQ3djEyLjg2N2MtLjAyMyAwLS4wMzktLjAwNy0uMDU4LS4wMDctMTUuODkxLTEuOTA3LTI4LjMwNSAxMi45MzgtMjguMzA1IDEyLjkzOHM2Ljk1OCAyNC45OSAyOC4zNjMgMzIuMTgybTAtMTA3LjEyNVY1My43M2MxLjQ2MS0uMTEyIDIuOTIyLS4yMDcgNC4zOTEtLjI1NyA1Ni41ODItMS45MDcgOTMuNDQ5IDQ2LjQwNiA5My40NDkgNDYuNDA2cy00Mi4zNDMgNTEuNDg4LTg2LjQ1NyA1MS40ODhjLTQuMDQzIDAtNy44MjgtLjM3NS0xMS4zODMtMS4wMDV2MTMuNzM5YTc1LjA0IDc1LjA0IDAgMCAwIDkuNDgxLjYxMmM0MS4wNTEgMCA3MC43MzgtMjAuOTY1IDk5LjQ4NC00NS43NzggNC43NjYgMy44MTcgMjQuMjc4IDEzLjEwMyAyOC4yODkgMTcuMTY3LTI3LjMzMiAyMi44ODQtOTEuMDMxIDQxLjMzLTEyNy4xNDQgNDEuMzMtMy40ODEgMC02LjgyNC0uMjExLTEwLjExLS41Mjh2MTkuMzA2SDMwNS42OFYzMS41MTJIMTQ5LjY0OHptMCA0OS4xNDRWNjUuNzc3YzEuNDQ2LS4xMDEgMi45MDMtLjE3OSA0LjM5MS0uMjI2IDQwLjY4OC0xLjI3OCA2Ny4zODIgMzQuOTY1IDY3LjM4MiAzNC45NjVzLTI4LjgzMiA0MC4wNDItNTkuNzQ2IDQwLjA0MmMtNC40NDkgMC04LjQzOC0uNzE1LTEyLjAyOC0xLjkyMlY5My41MjNjMTUuODQgMS45MTQgMTkuMDI4IDguOTExIDI4LjU1MSAyNC43ODZsMjEuMTgxLTE3Ljg1OXMtMTUuNDYxLTIwLjI3Ny00MS41MjQtMjAuMjc3Yy0yLjgzNC0uMDAxLTUuNTQ1LjE5OC04LjIwNy40ODMiIGZpbGw9IiM3N2I5MDAiLz48L3N2Zz4=
diff --git a/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/device-plugin/daemonsetnvidia.yaml b/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/device-plugin/daemonsetnvidia.yaml
index 042b8355b..e5d348855 100644
--- a/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/device-plugin/daemonsetnvidia.yaml
+++ b/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/device-plugin/daemonsetnvidia.yaml
@@ -30,7 +30,6 @@ spec:
       runtimeClassName: {{ .Values.devicePlugin.runtimeClassName }}
       {{- end }}
       {{- include "hami-vgpu.imagePullSecrets" . | nindent 6}}
-      # serviceAccountName:
       serviceAccountName: {{ include "hami-vgpu.device-plugin" . }}
       priorityClassName: system-node-critical
       hostPID: true
@@ -45,11 +44,8 @@ spec:
            command: ["/bin/sh","-c", {{ printf "cp -f /k8s-vgpu/lib/nvidia/* %s/vgpu/" .Values.global.gpuHookPath | quote }}]
          command:
            - nvidia-device-plugin
-           - --resource-name={{ .Values.resourceName }}
+           - --config-file=/device-config.yaml
            - --mig-strategy={{ .Values.devicePlugin.migStrategy }}
-           - --device-memory-scaling={{ .Values.devicePlugin.deviceMemoryScaling }}
-           - --device-cores-scaling={{ .Values.devicePlugin.deviceCoreScaling }}
-           - --device-split-count={{ .Values.devicePlugin.deviceSplitCount }}
            - --disable-core-limit={{ .Values.devicePlugin.disablecorelimit }}
            {{- range .Values.devicePlugin.extraArgs }}
            - {{ . }}
@@ -80,6 +76,9 @@ spec:
            - name: device-plugin
              mountPath: /config
            - name: hosttmp
              mountPath: /tmp
+           - name: device-config
+             mountPath: /device-config.yaml
+             subPath: device-config.yaml
        - name: vgpu-monitor
          image: "{{ .Values.devicePlugin.registry }}/{{ .Values.devicePlugin.repository }}:{{ .Values.version }}"
          imagePullPolicy: {{ .Values.devicePlugin.imagePullPolicy | quote }}
@@ -138,6 +137,9 @@ spec:
        - name: deviceconfig
          configMap:
            name: {{ template "hami-vgpu.device-plugin" . }}
+       - name: device-config
+         configMap:
+           name: {{ include "hami-vgpu.scheduler" . }}-device
      {{- if .Values.devicePlugin.nvidianodeSelector }}
      nodeSelector: {{ toYaml .Values.devicePlugin.nvidianodeSelector | nindent 8 }}
      {{- end }}
diff --git a/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/configmap.yaml b/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/configmap.yaml
index f3380d1ce..b69ee15e3 100644
--- a/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/configmap.yaml
+++ b/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/configmap.yaml
@@ -25,7 +25,15 @@ data:
       },
       "managedResources": [
       {{- if .Values.devices.ascend.enabled }}
-      {{- range .Values.devices.ascend.resources }}
+      {{- range .Values.devices.ascend.customresources }}
+      {
+        "name": "{{ . }}",
+        "ignoredByScheduler": true
+      },
+      {{- end }}
+      {{- end }}
+      {{- if .Values.devices.mthreads.enabled }}
+      {{- range .Values.devices.mthreads.customresources }}
       {
         "name": "{{ . }}",
         "ignoredByScheduler": true
diff --git a/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/configmapnew.yaml b/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/configmapnew.yaml
index e9badb76f..acc507690 100644
--- a/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/configmapnew.yaml
+++ b/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/configmapnew.yaml
@@ -8,7 +8,7 @@ metadata:
     {{- include "hami-vgpu.labels" . | nindent 4 }}
 data:
   config.yaml: |
-    {{- if gt (.Values.scheduler.kubeScheduler.imageTag | substr 3 5| atoi) 25}}
+    {{- if gt (.Capabilities.KubeVersion.Minor | int) 25}}
     apiVersion: kubescheduler.config.k8s.io/v1
     {{- else }}
     apiVersion: kubescheduler.config.k8s.io/v1beta2
@@ -50,7 +50,13 @@ data:
       - name: {{ .Values.iluvatarResourceName }}
         ignoredByScheduler: true
       {{- if .Values.devices.ascend.enabled }}
-      {{- range .Values.devices.ascend.resources }}
+      {{- range .Values.devices.ascend.customresources }}
+      - name: {{ . }}
+        ignoredByScheduler: true
+      {{- end }}
+      {{- end }}
+      {{- if .Values.devices.mthreads.enabled }}
+      {{- range .Values.devices.mthreads.customresources }}
       - name: {{ . }}
         ignoredByScheduler: true
       {{- end }}
diff --git a/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/deployment.yaml b/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/deployment.yaml
index 61fd45cb5..2d23580e5 100644
--- a/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/deployment.yaml
+++ b/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/deployment.yaml
@@ -33,11 +33,11 @@ spec:
       containers:
         {{- if .Values.scheduler.kubeScheduler.enabled }}
         - name: kube-scheduler
-          image: "{{ .Values.scheduler.kubeScheduler.registry }}/{{ .Values.scheduler.kubeScheduler.repository }}:{{ .Values.scheduler.kubeScheduler.imageTag }}"
+          image: {{ .Values.scheduler.kubeScheduler.image }}:{{ .Capabilities.KubeVersion.Version }}
          imagePullPolicy: {{ .Values.scheduler.kubeScheduler.imagePullPolicy | quote }}
          command:
          - kube-scheduler
-          {{- if ge (.Values.scheduler.kubeScheduler.imageTag | substr 3 5| atoi) 22}}
+          {{- if ge (.Capabilities.KubeVersion.Minor | int) 22 }}
          {{- range .Values.scheduler.kubeScheduler.extraNewArgs }}
          - {{ . }}
          {{- end }}
@@ -79,29 +79,15 @@ spec:
        {{- end }}
          command:
            - scheduler
-           - --resource-name={{ .Values.resourceName }}
-           - --resource-mem={{ .Values.resourceMem }}
-           - --resource-cores={{ .Values.resourceCores }}
-           - --resource-mem-percentage={{ .Values.resourceMemPercentage }}
-           - --resource-priority={{ .Values.resourcePriority }}
            - --http_bind=0.0.0.0:443
            - --cert_file=/tls/tls.crt
            - --key_file=/tls/tls.key
            - --scheduler-name={{ .Values.schedulerName }}
            - --metrics-bind-address={{ .Values.scheduler.metricsBindAddress }}
-           - --default-mem={{ .Values.scheduler.defaultMem }}
-           - --default-gpu={{ .Values.scheduler.defaultGPUNum }}
-           - --default-cores={{ .Values.scheduler.defaultCores }}
-           - --iluvatar-memory={{ .Values.iluvatarResourceMem }}
-           - --iluvatar-cores={{ .Values.iluvatarResourceCore }}
-           - --cambricon-mlu-name={{ .Values.mluResourceName }}
-           - --cambricon-mlu-memory={{ .Values.mluResourceMem }}
-           - --cambricon-mlu-cores={{ .Values.mluResourceCores }}
-           - --overwrite-env={{ .Values.scheduler.overwriteEnv }}
            - --node-scheduler-policy={{ .Values.scheduler.defaultSchedulerPolicy.nodeSchedulerPolicy }}
            - --gpu-scheduler-policy={{ .Values.scheduler.defaultSchedulerPolicy.gpuSchedulerPolicy }}
+           - --device-config-file=/device-config.yaml
            {{- if .Values.devices.ascend.enabled }}
-           - --ascend-config-file=/ascend-config.yaml
            - --enable-ascend=true
            {{- end }}
            {{- if .Values.scheduler.nodeLabelSelector }}
@@ -123,11 +109,9 @@ spec:
          volumeMounts:
            - name: tls-config
              mountPath: /tls
-           {{- if .Values.devices.ascend.enabled }}
            - name: device-config
-             mountPath: /ascend-config.yaml
-             subPath: ascend-config.yaml
-           {{- end }}
+             mountPath: /device-config.yaml
+             subPath: device-config.yaml
          {{- if .Values.scheduler.livenessProbe }}
          livenessProbe:
            httpGet:
@@ -146,17 +130,15 @@ spec:
      {{- if .Values.scheduler.kubeScheduler.enabled }}
      - name: scheduler-config
        configMap:
-         {{- if ge (.Values.scheduler.kubeScheduler.imageTag | substr 3 5| atoi) 22 }}
+         {{- if ge (.Capabilities.KubeVersion.Minor | int) 22 }}
          name: {{ template "hami-vgpu.scheduler" . }}-newversion
          {{- else }}
          name: {{ template "hami-vgpu.scheduler" . }}
          {{- end }}
      {{- end }}
-     {{- if .Values.devices.ascend.enabled }}
      - name: device-config
        configMap:
          name: {{ include "hami-vgpu.scheduler" . }}-device
-     {{- end }}
      {{- if .Values.scheduler.nodeSelector }}
      nodeSelector: {{ toYaml .Values.scheduler.nodeSelector | nindent 8 }}
      {{- end }}
diff --git a/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/device-configmap.yaml b/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/device-configmap.yaml
index 85afedea5..7e132e152 100644
--- a/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/device-configmap.yaml
+++ b/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/device-configmap.yaml
@@ -1,4 +1,3 @@
-{{- if .Values.devices.ascend.enabled }}
 apiVersion: v1
 kind: ConfigMap
 metadata:
@@ -7,10 +6,41 @@ metadata:
     app.kubernetes.io/component: hami-scheduler
     {{- include "hami-vgpu.labels" . | nindent 4 }}
 data:
-  ascend-config.yaml: |-
-    {{- if .Files.Glob "files/ascend-config.yaml" }}
-    {{- .Files.Get "files/ascend-config.yaml" | nindent 4}}
+  device-config.yaml: |-
+    {{- if .Files.Glob "files/device-config.yaml" }}
+    {{- .Files.Get "files/device-config.yaml" | nindent 4}}
     {{- else }}
+    nvidia:
+      resourceCountName: {{ .Values.resourceName }}
+      resourceMemoryName: {{ .Values.resourceMem }}
+      resourceMemoryPercentageName: {{ .Values.resourceMemPercentage }}
+      resourceCoreName: {{ .Values.resourceCores }}
+      resourcePriorityName: {{ .Values.resourcePriority }}
+      overwriteEnv: false
+      defaultMemory: 0
+      defaultCores: 0
+      defaultGPUNum: 1
+      deviceSplitCount: 10
+      deviceMemoryScaling: 1
+      deviceCoreScaling: 1
+    cambricon:
+      resourceCountName: {{ .Values.mluResourceName }}
+      resourceMemoryName: {{ .Values.mluResourceMem }}
+      resourceCoreName: {{ .Values.mluResourceCores }}
+    hygon:
+      resourceCountName: {{ .Values.dcuResourceName }}
+      resourceMemoryName: {{ .Values.dcuResourceMem }}
+      resourceCoreName: {{ .Values.dcuResourceCores }}
+    metax:
+      resourceCountName: "metax-tech.com/gpu"
+    mthreads:
+      resourceCountName: "mthreads.com/vgpu"
+      resourceMemoryName: "mthreads.com/sgpu-memory"
+      resourceCoreName: "mthreads.com/sgpu-core"
+    iluvatar:
+      resourceCountName: {{ .Values.iluvatarResourceName }}
+      resourceMemoryName: {{ .Values.iluvatarResourceMem }}
+      resourceCoreName: {{ .Values.iluvatarResourceCore }}
     vnpus:
     - chipName: 910B
      commonWord: Ascend910A
@@ -71,5 +101,3 @@ data:
          aiCore: 4
          aiCPU: 4
     {{ end }}
-
-{{- end }}
diff --git a/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/job-patch/job-createSecret.yaml b/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/job-patch/job-createSecret.yaml
index a44dc76d7..8470dc5ef 100644
--- a/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/job-patch/job-createSecret.yaml
+++ b/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/job-patch/job-createSecret.yaml
@@ -30,7 +30,7 @@ spec:
      {{- end }}
      containers:
        - name: create
-         {{- if ge (.Values.scheduler.kubeScheduler.imageTag | substr 3 5| atoi) 22 }}
+         {{- if ge (.Capabilities.KubeVersion.Minor | int) 22 }}
          image: "{{ .Values.scheduler.patch.registry }}/{{ .Values.scheduler.patch.newRepository }}:{{ .Values.scheduler.patch.newTag }}"
          {{- else }}
          image: "{{ .Values.scheduler.patch.registry }}/{{ .Values.scheduler.patch.repository }}:{{ .Values.scheduler.patch.tag }}"
@@ -40,8 +40,8 @@ spec:
            - create
            - --cert-name=tls.crt
            - --key-name=tls.key
-           {{- if .Values.scheduler.customWebhook.enabled }}
-           - --host={{ printf "%s.%s.svc,127.0.0.1,%s" (include "hami-vgpu.scheduler" .) .Release.Namespace .Values.scheduler.customWebhook.host}}
+           {{- if .Values.scheduler.admissionWebhook.customURL.enabled }}
+           - --host={{ printf "%s.%s.svc,127.0.0.1,%s" (include "hami-vgpu.scheduler" .) .Release.Namespace .Values.scheduler.admissionWebhook.customURL.host}}
            {{- else }}
            - --host={{ printf "%s.%s.svc,127.0.0.1" (include "hami-vgpu.scheduler" .) .Release.Namespace }}
            {{- end }}
diff --git a/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/job-patch/job-patchWebhook.yaml b/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/job-patch/job-patchWebhook.yaml
index 1f30f1433..398535b8c 100644
--- a/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/job-patch/job-patchWebhook.yaml
+++ b/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/job-patch/job-patchWebhook.yaml
@@ -30,7 +30,7 @@ spec:
      {{- end }}
      containers:
        - name: patch
-         {{- if ge (.Values.scheduler.kubeScheduler.imageTag | substr 3 5| atoi) 22 }}
+         {{- if ge (.Capabilities.KubeVersion.Minor | int) 22 }}
          image: "{{ .Values.scheduler.patch.registry }}/{{ .Values.scheduler.patch.newRepository }}:{{ .Values.scheduler.patch.newTag }}"
          {{- else }}
          image: "{{ .Values.scheduler.patch.registry }}/{{ .Values.scheduler.patch.repository }}:{{ .Values.scheduler.patch.tag }}"
diff --git a/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/webhook.yaml b/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/webhook.yaml
index 314b9255c..d39f23673 100644
--- a/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/webhook.yaml
+++ b/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/templates/scheduler/webhook.yaml
@@ -6,8 +6,8 @@ webhooks:
   - admissionReviewVersions:
     - v1beta1
     clientConfig:
-      {{- if .Values.scheduler.customWebhook.enabled }}
-      url: https://{{ .Values.scheduler.customWebhook.host}}:{{.Values.scheduler.customWebhook.port}}{{.Values.scheduler.customWebhook.path}}
+      {{- if .Values.scheduler.admissionWebhook.customURL.enabled }}
+      url: https://{{ .Values.scheduler.admissionWebhook.customURL.host}}:{{.Values.scheduler.admissionWebhook.customURL.port}}{{.Values.scheduler.admissionWebhook.customURL.path}}
      {{- else }}
      service:
        name: {{ include "hami-vgpu.scheduler" . }}
@@ -15,7 +15,7 @@ webhooks:
        path: /webhook
        port: {{ .Values.scheduler.service.httpPort }}
      {{- end }}
-    failurePolicy: {{ .Values.scheduler.mutatingWebhookConfiguration.failurePolicy }}
+    failurePolicy: {{ .Values.scheduler.admissionWebhook.failurePolicy }}
    matchPolicy: Equivalent
    name: vgpu.hami.io
    namespaceSelector:
@@ -24,11 +24,11 @@ webhooks:
        operator: NotIn
        values:
        - ignore
-      {{- if .Values.scheduler.customWebhook.whitelistNamespaces }}
+      {{- if .Values.scheduler.admissionWebhook.whitelistNamespaces }}
      - key: kubernetes.io/metadata.name
        operator: NotIn
        values:
-        {{- toYaml .Values.scheduler.customWebhook.whitelistNamespaces | nindent 10 }}
+        {{- toYaml .Values.scheduler.admissionWebhook.whitelistNamespaces | nindent 10 }}
      {{- end }}
    objectSelector:
      matchExpressions:
@@ -36,7 +36,7 @@ webhooks:
        operator: NotIn
        values:
        - ignore
-  reinvocationPolicy: Never
+  reinvocationPolicy: {{ .Values.scheduler.admissionWebhook.reinvocationPolicy }}
   rules:
   - apiGroups:
     - ""
diff --git a/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/values.yaml b/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/values.yaml
index dd2e174c0..6c80b40d7 100644
--- a/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/values.yaml
+++ b/charts/nvidia-vgpu/nvidia-vgpu/charts/hami/values.yaml
@@ -3,7 +3,7 @@ nameOverride: ""
 fullnameOverride: ""
 imagePullSecrets: []
-version: "v2.4.0"
+version: "v2.4.1"
 
 #Nvidia GPU Parameters
 resourceName: "nvidia.com/vgpu"
 resourceMem: "nvidia.com/gpumem"
@@ -37,11 +37,8 @@ scheduler:
   # if we install the nvidia-vgpu-scheduler-scheduler as default scheduler, we need to remove the k8s default
   # scheduler pod from the cluster first, we must specify node name to skip the schedule workflow.
   nodeName: ""
-  # nodeLabelSelector:
-  #   "gpu": "on"
-  defaultMem: 0
-  defaultCores: 0
-  defaultGPUNum: 1
+  #nodeLabelSelector:
+  #  "gpu": "on"
   overwriteEnv: "false"
   defaultSchedulerPolicy:
     nodeSchedulerPolicy: binpack
@@ -52,7 +49,6 @@ scheduler:
   kubeScheduler:
     # @param enabled indicate whether to run kube-scheduler container in the scheduler pod, it's true by default.
     enabled: true
-    imageTag: "v1.28.0"
    registry: k8s-gcr.m.daocloud.io
    repository: kube-scheduler
    imagePullPolicy: IfNotPresent
@@ -72,18 +68,21 @@ scheduler:
   podAnnotations: {}
   tolerations: []
   #serviceAccountName: "hami-vgpu-scheduler-sa"
-  customWebhook:
-    enabled: false
-    # must be an endpoint using https.
-    # should generate host certs here
-    host: 127.0.0.1 # hostname or ip, can be your node'IP if you want to use https://:/
-    port: 31998
-    path: /webhook
+  admissionWebhook:
+    customURL:
+      enabled: false
+      # must be an endpoint using https.
+      # should generate host certs here
+      host: 127.0.0.1 # hostname or ip, can be your node'IP if you want to use https://:/
+      port: 31998
+      path: /webhook
    whitelistNamespaces:
      # Specify the namespaces that the webhook will not be applied to.
      # - default
      # - kube-system
      # - istio-system
+    reinvocationPolicy: Never
+    failurePolicy: Ignore
   patch:
     registry: docker.m.daocloud.io
     repository: jettech/kube-webhook-certgen
@@ -96,8 +95,6 @@ scheduler:
     nodeSelector: {}
     tolerations: []
     runAsUser: 2000
-  mutatingWebhookConfiguration:
-    failurePolicy: Ignore
   service:
     httpPort: 443
     schedulerPort: 31998
@@ -114,9 +111,6 @@ devicePlugin:
   monitorimage: "projecthami/hami"
   monitorctrPath: /usr/local/vgpu/containers
   imagePullPolicy: IfNotPresent
-  deviceSplitCount: 10
-  deviceMemoryScaling: 1.0
-  deviceCoreScaling: 1.0
   runtimeClassName: ""
   migStrategy: "none"
   disablecorelimit: "false"
@@ -133,7 +127,13 @@ devicePlugin:
   tolerations: []
   hygonImageRepository: 4pdosc/vdcu-device-plugin
   hygonImageTag: v1.0
+  deviceCoreScaling: 1.0
+  deviceMemoryScaling: 1.0
 devices:
+  mthreads:
+    enabled: false
+    customresources:
+      - mthreads.com/vgpu
   ascend:
     enabled: false
     image: ""
@@ -142,7 +142,7 @@ devices:
    nodeSelector:
      ascend: "on"
    tolerations: []
-    resources:
+    customresources:
      - huawei.com/Ascend910A
      - huawei.com/Ascend910A-memory
      - huawei.com/Ascend910B
diff --git a/charts/nvidia-vgpu/nvidia-vgpu/values.yaml b/charts/nvidia-vgpu/nvidia-vgpu/values.yaml
index 2561ae544..23dd68924 100644
--- a/charts/nvidia-vgpu/nvidia-vgpu/values.yaml
+++ b/charts/nvidia-vgpu/nvidia-vgpu/values.yaml
@@ -4,7 +4,7 @@ hami:
   nameOverride: ""
   fullnameOverride: ""
   imagePullSecrets: []
-  version: "v2.4.0"
+  version: "v2.4.1"
   #Nvidia GPU Parameters
   resourceName: "nvidia.com/vgpu"
   resourceMem: "nvidia.com/gpumem"
@@ -38,11 +38,8 @@ hami:
    # if we install the nvidia-vgpu-scheduler-scheduler as default scheduler, we need to remove the k8s default
    # scheduler pod from the cluster first, we must specify node name to skip the schedule workflow.
    nodeName: ""
-    # nodeLabelSelector:
-    #   "gpu": "on"
-    defaultMem: 0
-    defaultCores: 0
-    defaultGPUNum: 1
+    #nodeLabelSelector:
+    #  "gpu": "on"
    overwriteEnv: "false"
    defaultSchedulerPolicy:
      nodeSchedulerPolicy: binpack
@@ -53,7 +50,6 @@ hami:
    kubeScheduler:
      # @param enabled indicate whether to run kube-scheduler container in the scheduler pod, it's true by default.
      enabled: true
-      imageTag: "v1.28.0"
      registry: k8s-gcr.m.daocloud.io
      repository: kubernetes/kube-scheduler
      imagePullPolicy: IfNotPresent
@@ -73,18 +69,21 @@ hami:
    podAnnotations: {}
    tolerations: []
    #serviceAccountName: "hami-vgpu-scheduler-sa"
-    customWebhook:
-      enabled: false
-      # must be an endpoint using https.
-      # should generate host certs here
-      host: 127.0.0.1 # hostname or ip, can be your node'IP if you want to use https://:/
-      port: 31998
-      path: /webhook
+    admissionWebhook:
+      customURL:
+        enabled: false
+        # must be an endpoint using https.
+        # should generate host certs here
+        host: 127.0.0.1 # hostname or ip, can be your node'IP if you want to use https://:/
+        port: 31998
+        path: /webhook
      whitelistNamespaces:
        # Specify the namespaces that the webhook will not be applied to.
        # - default
        # - kube-system
        # - istio-system
+      reinvocationPolicy: Never
+      failurePolicy: Ignore
    patch:
      registry: docker.m.daocloud.io
      repository: jettech/kube-webhook-certgen
@@ -97,8 +96,6 @@ hami:
      nodeSelector: {}
      tolerations: []
      runAsUser: 2000
-    mutatingWebhookConfiguration:
-      failurePolicy: Ignore
    service:
      httpPort: 443
      schedulerPort: 31998
@@ -115,9 +112,6 @@ hami:
    monitorimage: "projecthami/hami"
    monitorctrPath: /usr/local/vgpu/containers
    imagePullPolicy: IfNotPresent
-    deviceSplitCount: 10
-    deviceMemoryScaling: 1.0
-    deviceCoreScaling: 1.0
    runtimeClassName: ""
    migStrategy: "none"
    disablecorelimit: "false"
@@ -134,7 +128,13 @@ hami:
    tolerations: []
    hygonImageRepository: 4pdosc/vdcu-device-plugin
    hygonImageTag: v1.0
+    deviceCoreScaling: 1.0
+    deviceMemoryScaling: 1.0
   devices:
+    mthreads:
+      enabled: false
+      customresources:
+        - mthreads.com/vgpu
     ascend:
       enabled: false
       image: ""
@@ -143,7 +143,7 @@ devices:
      nodeSelector:
        ascend: "on"
      tolerations: []
-      resources:
+      customresources:
        - huawei.com/Ascend910A
        - huawei.com/Ascend910A-memory
        - huawei.com/Ascend910B