Skip to content

Commit

Permalink
Test works locally; needed to add some metrics to dims
Browse files (browse the repository at this point in the history)
  • Loading branch information
Paramadon committed Jun 10, 2024
1 parent 5855c50 commit db41f67
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 25 deletions.
50 changes: 26 additions & 24 deletions test/gpu/nvidia_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,24 +17,28 @@ import (
const (
gpuMetricIndicator = "_gpu_"

containerMemTotal = "container_gpu_memory_total"
containerMemUsed = "container_gpu_memory_used"
containerPower = "container_gpu_power_draw"
containerTemp = "container_gpu_temperature"
containerUtil = "container_gpu_utilization"
containerMemUtil = "container_gpu_memory_utilization"
podMemTotal = "pod_gpu_memory_total"
podMemUsed = "pod_gpu_memory_used"
podPower = "pod_gpu_power_draw"
podTemp = "pod_gpu_temperature"
podUtil = "pod_gpu_utilization"
podMemUtil = "pod_gpu_memory_utilization"
nodeMemTotal = "node_gpu_memory_total"
nodeMemUsed = "node_gpu_memory_used"
nodePower = "node_gpu_power_draw"
nodeTemp = "node_gpu_temperature"
nodeUtil = "node_gpu_utilization"
nodeMemUtil = "node_gpu_memory_utilization"
containerMemTotal = "container_gpu_memory_total"
containerMemUsed = "container_gpu_memory_used"
containerPower = "container_gpu_power_draw"
containerTemp = "container_gpu_temperature"
containerUtil = "container_gpu_utilization"
containerMemUtil = "container_gpu_memory_utilization"
podMemTotal = "pod_gpu_memory_total"
podMemUsed = "pod_gpu_memory_used"
podPower = "pod_gpu_power_draw"
podTemp = "pod_gpu_temperature"
podUtil = "pod_gpu_utilization"
podMemUtil = "pod_gpu_memory_utilization"
podLimit = "pod_gpu_limit"
podRequest = "pod_gpu_request"
podTotal = "pod_gpu_total"
nodeMemTotal = "node_gpu_memory_total"
nodeMemUsed = "node_gpu_memory_used"
nodePower = "node_gpu_power_draw"
nodeTemp = "node_gpu_temperature"
nodeUtil = "node_gpu_utilization"
nodeMemUtil = "node_gpu_memory_utilization"

nodeCountTotal = "node_gpu_total"
nodeCountRequest = "node_gpu_request"
nodeCountLimit = "node_gpu_limit"
Expand All @@ -46,18 +50,16 @@ var expectedDimsToMetrics = map[string][]string{
"ClusterName": {
containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil,
//nodeCountTotal, nodeCountRequest, nodeCountLimit,
//clusterCountTotal, clusterCountRequest,
nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil, podLimit, podTotal, podRequest, nodeCountTotal, nodeCountRequest, nodeCountLimit, clusterCountTotal, clusterCountRequest,
},
"ClusterName-Namespace": {
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, podLimit, podTotal, podRequest,
},
//"ClusterName-Namespace-Service": {
// podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
//},
"ClusterName-Namespace-PodName": {
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, podLimit, podTotal, podRequest,
},
"ClusterName-ContainerName-Namespace-PodName": {
containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
Expand All @@ -69,7 +71,7 @@ var expectedDimsToMetrics = map[string][]string{
containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
},
"ClusterName-FullPodName-Namespace-PodName": {
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, podLimit, podTotal, podRequest,
},
"ClusterName-FullPodName-GpuDevice-Namespace-PodName": {
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
Expand Down
1 change: 1 addition & 0 deletions test/metric/container_insights_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ func ValidateMetrics(env *environment.MetaData, metricFilter string, expectedDim
continue
}
results = append(results, validateMetricsAvailability(dims, metrics, actual))

for _, m := range metrics {
// this is to prevent panic with rand.Intn when metrics are not yet ready in a cluster
if _, ok := actual[m]; !ok {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
"required": [
"ClusterName",
"FullPodName",
"GpuDevice",
"InstanceId",
"Namespace",
"NodeName",
Expand Down

0 comments on commit db41f67

Please sign in to comment.