From c9b52c9e57faf51ca8d2ea8432e1340aa8149e51 Mon Sep 17 00:00:00 2001 From: Tigran Grigoryan Date: Sat, 19 Oct 2024 11:22:49 +0400 Subject: [PATCH] Add support for setting libcuda verbosity Resolves: #544 Signed-off-by: Tigran Grigoryan --- .../hami/templates/scheduler/deployment.yaml | 1 + charts/hami/values.yaml | 2 + pkg/device/nvidia/device.go | 44 ++++++++++++------- 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/charts/hami/templates/scheduler/deployment.yaml b/charts/hami/templates/scheduler/deployment.yaml index 92b924deb..bd4bb8418 100644 --- a/charts/hami/templates/scheduler/deployment.yaml +++ b/charts/hami/templates/scheduler/deployment.yaml @@ -83,6 +83,7 @@ spec: - --resource-cores={{ .Values.resourceCores }} - --resource-mem-percentage={{ .Values.resourceMemPercentage }} - --resource-priority={{ .Values.resourcePriority }} + - --libcuda-log-verbosity-level={{ .Values.libcudaLogVerbosity }} - --http_bind=0.0.0.0:443 - --cert_file=/tls/tls.crt - --key_file=/tls/tls.key diff --git a/charts/hami/values.yaml b/charts/hami/values.yaml index 97d6e1e02..b4478d2c7 100644 --- a/charts/hami/values.yaml +++ b/charts/hami/values.yaml @@ -11,6 +11,8 @@ resourceMem: "nvidia.com/gpumem" resourceMemPercentage: "nvidia.com/gpumem-percentage" resourceCores: "nvidia.com/gpucores" resourcePriority: "nvidia.com/priority" +## Value of the LIBCUDA_LOG_LEVEL environment variable set in GPU containers; controls the verbosity of libcuda logs +libcudaLogVerbosity: "2" #MLU Parameters mluResourceName: "cambricon.com/vmlu" diff --git a/pkg/device/nvidia/device.go b/pkg/device/nvidia/device.go index 903f495cc..7146cdf55 100644 --- a/pkg/device/nvidia/device.go +++ b/pkg/device/nvidia/device.go @@ -49,13 +49,14 @@ const ( ) var ( - ResourceName string - ResourceMem string - ResourceCores string - ResourceMemPercentage string - ResourcePriority string - DebugMode bool - OverwriteEnv bool + ResourceName string + ResourceMem string + ResourceCores string + ResourceMemPercentage string + 
ResourcePriority string + LIBCUDALogVerbosityLevel string + DebugMode bool + OverwriteEnv bool ) type NvidiaGPUDevices struct { @@ -78,6 +79,7 @@ func ParseConfig(fs *flag.FlagSet) { fs.StringVar(&ResourceMemPercentage, "resource-mem-percentage", "nvidia.com/gpumem-percentage", "gpu memory fraction to allocate") fs.StringVar(&ResourceCores, "resource-cores", "nvidia.com/gpucores", "cores percentage to use") fs.StringVar(&ResourcePriority, "resource-priority", "vgputaskpriority", "vgpu task priority 0 for high and 1 for low") + fs.StringVar(&LIBCUDALogVerbosityLevel, "libcuda-log-verbosity-level", "2", "verbosity level of LIBCUDA") fs.BoolVar(&OverwriteEnv, "overwrite-env", false, "If set NVIDIA_VISIBLE_DEVICES=none to pods with no-gpu allocation") } @@ -136,18 +138,32 @@ func (dev *NvidiaGPUDevices) GetNodeDevices(n corev1.Node) ([]*api.DeviceInfo, e return nodedevices, nil } +func setOrUpdateEnvVar(ctr *corev1.Container, name string, value string) { + // Check if the env var already exists + for i, envVar := range ctr.Env { + if envVar.Name == name { + // If found, update the value + ctr.Env[i].Value = value + return + } + } + // If not found, append it as a new env var + ctr.Env = append(ctr.Env, corev1.EnvVar{ + Name: name, + Value: value, + }) +} + func (dev *NvidiaGPUDevices) MutateAdmission(ctr *corev1.Container, p *corev1.Pod) (bool, error) { /*gpu related */ priority, ok := ctr.Resources.Limits[corev1.ResourceName(ResourcePriority)] if ok { - ctr.Env = append(ctr.Env, corev1.EnvVar{ - Name: api.TaskPriority, - Value: fmt.Sprint(priority.Value()), - }) + setOrUpdateEnvVar(ctr, api.TaskPriority, fmt.Sprint(priority.Value())) } _, resourceNameOK := ctr.Resources.Limits[corev1.ResourceName(ResourceName)] if resourceNameOK { + setOrUpdateEnvVar(ctr, "LIBCUDA_LOG_LEVEL", LIBCUDALogVerbosityLevel) return resourceNameOK, nil } @@ -159,14 +175,12 @@ func (dev *NvidiaGPUDevices) MutateAdmission(ctr *corev1.Container, p *corev1.Po if config.DefaultResourceNum > 0 { 
ctr.Resources.Limits[corev1.ResourceName(ResourceName)] = *resource.NewQuantity(int64(config.DefaultResourceNum), resource.BinarySI) resourceNameOK = true + setOrUpdateEnvVar(ctr, "LIBCUDA_LOG_LEVEL", LIBCUDALogVerbosityLevel) } } if !resourceNameOK && OverwriteEnv { - ctr.Env = append(ctr.Env, corev1.EnvVar{ - Name: "NVIDIA_VISIBLE_DEVICES", - Value: "none", - }) + setOrUpdateEnvVar(ctr, "NVIDIA_VISIBLE_DEVICES", "none") } return resourceNameOK, nil }