diff --git a/test/gpu/nvidia_test.go b/test/gpu/nvidia_test.go index ced990b36..f51e73b64 100644 --- a/test/gpu/nvidia_test.go +++ b/test/gpu/nvidia_test.go @@ -17,24 +17,28 @@ import ( const ( gpuMetricIndicator = "_gpu_" - containerMemTotal = "container_gpu_memory_total" - containerMemUsed = "container_gpu_memory_used" - containerPower = "container_gpu_power_draw" - containerTemp = "container_gpu_temperature" - containerUtil = "container_gpu_utilization" - containerMemUtil = "container_gpu_memory_utilization" - podMemTotal = "pod_gpu_memory_total" - podMemUsed = "pod_gpu_memory_used" - podPower = "pod_gpu_power_draw" - podTemp = "pod_gpu_temperature" - podUtil = "pod_gpu_utilization" - podMemUtil = "pod_gpu_memory_utilization" - nodeMemTotal = "node_gpu_memory_total" - nodeMemUsed = "node_gpu_memory_used" - nodePower = "node_gpu_power_draw" - nodeTemp = "node_gpu_temperature" - nodeUtil = "node_gpu_utilization" - nodeMemUtil = "node_gpu_memory_utilization" + containerMemTotal = "container_gpu_memory_total" + containerMemUsed = "container_gpu_memory_used" + containerPower = "container_gpu_power_draw" + containerTemp = "container_gpu_temperature" + containerUtil = "container_gpu_utilization" + containerMemUtil = "container_gpu_memory_utilization" + podMemTotal = "pod_gpu_memory_total" + podMemUsed = "pod_gpu_memory_used" + podPower = "pod_gpu_power_draw" + podTemp = "pod_gpu_temperature" + podUtil = "pod_gpu_utilization" + podMemUtil = "pod_gpu_memory_utilization" + podLimit = "pod_gpu_limit" + podRequest = "pod_gpu_request" + podTotal = "pod_gpu_total" + nodeMemTotal = "node_gpu_memory_total" + nodeMemUsed = "node_gpu_memory_used" + nodePower = "node_gpu_power_draw" + nodeTemp = "node_gpu_temperature" + nodeUtil = "node_gpu_utilization" + nodeMemUtil = "node_gpu_memory_utilization" + nodeCountTotal = "node_gpu_total" nodeCountRequest = "node_gpu_request" nodeCountLimit = "node_gpu_limit" @@ -46,18 +50,16 @@ var expectedDimsToMetrics = map[string][]string{ "ClusterName": { containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil, podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, - nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil, - //nodeCountTotal, nodeCountRequest, nodeCountLimit, - //clusterCountTotal, clusterCountRequest, + nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil, podLimit, podTotal, podRequest, nodeCountTotal, nodeCountRequest, nodeCountLimit, clusterCountTotal, clusterCountRequest, }, "ClusterName-Namespace": { - podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, podLimit, podTotal, podRequest, }, //"ClusterName-Namespace-Service": { // podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, //}, "ClusterName-Namespace-PodName": { - podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, podLimit, podTotal, podRequest, }, "ClusterName-ContainerName-Namespace-PodName": { containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil, @@ -69,7 +71,7 @@ var expectedDimsToMetrics = map[string][]string{ containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil, }, "ClusterName-FullPodName-Namespace-PodName": { - podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, podLimit, podTotal, podRequest, }, "ClusterName-FullPodName-GpuDevice-Namespace-PodName": { podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, diff --git a/test/metric/container_insights_util.go b/test/metric/container_insights_util.go index a3ddff03d..6c474339d 100644 --- a/test/metric/container_insights_util.go +++ b/test/metric/container_insights_util.go @@ -57,6 +57,7 @@ func ValidateMetrics(env *environment.MetaData, metricFilter string, expectedDim continue } results = append(results, validateMetricsAvailability(dims, metrics, actual)) + for _, m := range metrics { // this is to prevent panic with rand.Intn when metrics are not yet ready in a cluster if _, ok := actual[m]; !ok { diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/pod_gpu.json b/test/metric_value_benchmark/eks_resources/test_schemas/pod_gpu.json index 4b532094f..9e3124e3b 100644 --- a/test/metric_value_benchmark/eks_resources/test_schemas/pod_gpu.json +++ b/test/metric_value_benchmark/eks_resources/test_schemas/pod_gpu.json @@ -34,7 +34,6 @@ "required": [ "ClusterName", "FullPodName", - "GpuDevice", "InstanceId", "Namespace", "NodeName",