Skip to content

Commit

Permalink
Fix GPU E2E integ test (#437)
Browse files Browse the repository at this point in the history
  • Loading branch information
movence authored and musa-asad committed Dec 5, 2024
1 parent 834411d commit f0f0735
Show file tree
Hide file tree
Showing 8 changed files with 33 additions and 71 deletions.
9 changes: 5 additions & 4 deletions generator/resources/eks_addon_test_matrix.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
[
{
"k8s_version": "1.29",
"k8sVersion": "1.31",
"addon_name":"amazon-cloudwatch-observability",
"addon_version":"v1.6.0-eksbuild.1",
"ami_type": "AL2_x86_64_GPU",
"addon_version":"v2.5.0-eksbuild.1",
"ami": "AL2_x86_64_GPU",
"terraform_dir": "terraform/eks/addon/gpu",
"test_dir": "../../../../test/gpu"
"test_dir": "../../../../test/gpu",
"instanceType":"g4dn.xlarge"
}
]
4 changes: 2 additions & 2 deletions generator/resources/eks_daemon_test_matrix.json
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
[
{
"k8s_version": "1.24",
"k8sVersion": "1.31",
"ami": "AL2_x86_64",
"instanceType":"t3.medium",
"arc": "amd64"
},
{
"k8s_version": "1.24",
"k8sVersion": "1.31",
"ami": "AL2_ARM_64",
"instanceType":"m6g.large",
"arc": "arm64"
Expand Down
2 changes: 1 addition & 1 deletion generator/resources/eks_deployment_test_matrix.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[
{
"k8sVersion": "1.24"
"k8sVersion": "1.31"
}
]
2 changes: 1 addition & 1 deletion generator/test_case_generator.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ type matrixRow struct {
AgentStartCommand string `json:"agentStartCommand"`
CaCertPath string `json:"caCertPath"`
ValuesPerMinute int `json:"values_per_minute"` // Number of metrics to be sent or number of log lines to write
K8sVersion string `json:"k8s_version"`
K8sVersion string `json:"k8sVersion"`
TerraformDir string `json:"terraform_dir"`
UseSSM bool `json:"useSSM"`
ExcludedTests string `json:"excludedTests"`
Expand Down
17 changes: 7 additions & 10 deletions terraform/eks/addon/gpu/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ locals {
}

resource "aws_eks_cluster" "this" {
name = "cwagent-operator-eks-integ-${module.common.testing_id}"
name = "cwagent-addon-eks-integ-${module.common.testing_id}"
role_arn = local.role_arn
version = var.k8s_version
enabled_cluster_log_types = [
Expand All @@ -40,17 +40,17 @@ resource "aws_eks_cluster" "this" {
# EKS Node Groups
resource "aws_eks_node_group" "this" {
cluster_name = aws_eks_cluster.this.name
node_group_name = "cwagent-operator-eks-integ-node"
node_group_name = "cwagent-addon-eks-integ-node"
node_role_arn = aws_iam_role.node_role.arn
subnet_ids = module.basic_components.public_subnet_ids

scaling_config {
desired_size = 2
max_size = 2
min_size = 2
desired_size = 1
max_size = 1
min_size = 1
}

ami_type = "AL2_x86_64_GPU"
ami_type = var.ami_type
capacity_type = "ON_DEMAND"
disk_size = 20
instance_types = [var.instance_type]
Expand All @@ -65,7 +65,7 @@ resource "aws_eks_node_group" "this" {

# EKS Node IAM Role
resource "aws_iam_role" "node_role" {
name = "cwagent-operator-eks-Worker-Role-${module.common.testing_id}"
name = "cwagent-addon-eks-Worker-Role-${module.common.testing_id}"

assume_role_policy = <<POLICY
{
Expand Down Expand Up @@ -129,6 +129,3 @@ resource "aws_eks_addon" "this" {
output "eks_cluster_name" {
value = aws_eks_cluster.this.name
}



4 changes: 2 additions & 2 deletions terraform/eks/addon/gpu/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@ variable "addon_name" {

variable "addon_version" {
type = string
default = "v1.6.0-eksbuild.1"
default = "v2.5.0-eksbuild.1"
}

variable "k8s_version" {
type = string
default = "1.29"
default = "1.31"
}

variable "ami_type" {
Expand Down
63 changes: 12 additions & 51 deletions test/gpu/nvidia_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@ import (
"github.com/aws/amazon-cloudwatch-agent-test/test/test_runner"
)

var useE2EMetrics = flag.Bool("useE2EMetrics", false, "Use E2E metrics mapping which uses latest build CWA")

const (
gpuMetricIndicator = "_gpu_"

Expand All @@ -34,28 +32,26 @@ const (
podMemUtil = "pod_gpu_memory_utilization"
podLimit = "pod_gpu_limit"
podRequest = "pod_gpu_request"
podTotal = "pod_gpu_total"
podCountTotal = "pod_gpu_usage_total"
podReserved = "pod_gpu_reserved_capacity"
nodeMemTotal = "node_gpu_memory_total"
nodeMemUsed = "node_gpu_memory_used"
nodePower = "node_gpu_power_draw"
nodeTemp = "node_gpu_temperature"
nodeUtil = "node_gpu_utilization"
nodeMemUtil = "node_gpu_memory_utilization"

nodeCountTotal = "node_gpu_total"
nodeCountRequest = "node_gpu_request"
nodeCountLimit = "node_gpu_limit"
clusterCountTotal = "cluster_gpu_total"
clusterCountRequest = "cluster_gpu_request"
nodeCountTotal = "node_gpu_usage_total"
nodeCountLimit = "node_gpu_limit"
nodeReserved = "node_gpu_reserved_capacity"
)

// useE2EMetrics is a boolean command-line flag (default false). Validate
// dereferences it to decide whether the E2E set of expected GPU metrics
// should be checked instead of the plain integration-test set.
var useE2EMetrics = flag.Bool("useE2EMetrics", false, "Use E2E metrics mapping which uses latest build CWA")

var expectedDimsToMetricsIntegTest = map[string][]string{
"ClusterName": {
containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil,
//nodeCountTotal, nodeCountRequest, nodeCountLimit,
//clusterCountTotal, clusterCountRequest,
},
"ClusterName-Namespace": {
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
Expand Down Expand Up @@ -90,45 +86,6 @@ var expectedDimsToMetricsIntegTest = map[string][]string{
},
}

// expectedDimsToMetricsE2E maps a dimension-set key to the GPU metric names
// expected to be published under that dimension set when the E2E metrics
// mapping is in use (see the useE2EMetrics flag).
//
// Keys look like dimension names joined with "-" (e.g. "ClusterName-Namespace");
// NOTE(review): the exact key format is produced elsewhere — confirm against
// the code that builds the fetched dimension strings.
var expectedDimsToMetricsE2E = map[string][]string{
// Cluster-wide set: container, pod and node GPU metrics plus the pod/node/cluster GPU count metrics.
"ClusterName": {
containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil, podLimit, podTotal, podRequest, nodeCountTotal, nodeCountRequest, nodeCountLimit, clusterCountTotal, clusterCountRequest,
},
// Pod metrics, including the pod GPU limit/total/request counts.
"ClusterName-Namespace": {
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, podLimit, podTotal, podRequest,
},
// NOTE(review): the Service-level dimension set below is disabled; the reason is not visible here.
//"ClusterName-Namespace-Service": {
// podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
//},
"ClusterName-Namespace-PodName": {
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, podLimit, podTotal, podRequest,
},
// Container-level dimension sets carry only the container GPU metrics.
"ClusterName-ContainerName-Namespace-PodName": {
containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
},
"ClusterName-ContainerName-FullPodName-Namespace-PodName": {
containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
},
"ClusterName-ContainerName-FullPodName-GpuDevice-Namespace-PodName": {
containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
},
"ClusterName-FullPodName-Namespace-PodName": {
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, podLimit, podTotal, podRequest,
},
// The per-GpuDevice pod set omits the pod GPU count metrics.
"ClusterName-FullPodName-GpuDevice-Namespace-PodName": {
podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
},
"ClusterName-InstanceId-NodeName": {
nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil,
// Node GPU count metrics are intentionally left out of this set for now (commented out).
//nodeCountTotal, nodeCountRequest, nodeCountLimit,
},
"ClusterName-GpuDevice-InstanceId-InstanceType-NodeName": {
nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil,
},
}

type NvidiaTestRunner struct {
test_runner.BaseTestRunner
testName string
Expand All @@ -141,7 +98,11 @@ func (t *NvidiaTestRunner) Validate() status.TestGroupResult {
var testResults []status.TestResult
expectedDimsToMetrics := expectedDimsToMetricsIntegTest
if *useE2EMetrics {
expectedDimsToMetrics = expectedDimsToMetricsE2E
// add GPU count metrics
expectedDimsToMetricsIntegTest["ClusterName"] = append(expectedDimsToMetricsIntegTest["ClusterName"], podReserved, podRequest, podCountTotal, podLimit, nodeCountTotal, nodeCountLimit, nodeReserved)
expectedDimsToMetricsIntegTest["ClusterName-Namespace-PodName"] = append(expectedDimsToMetricsIntegTest["ClusterName-Namespace-PodName"], podCountTotal, podRequest, podReserved, podLimit)
expectedDimsToMetricsIntegTest["ClusterName-FullPodName-Namespace-PodName"] = append(expectedDimsToMetricsIntegTest["ClusterName-FullPodName-Namespace-PodName"], podCountTotal, podRequest, podReserved, podLimit)
expectedDimsToMetricsIntegTest["ClusterName-InstanceId-NodeName"] = append(expectedDimsToMetricsIntegTest["ClusterName-InstanceId-NodeName"], nodeCountLimit, nodeCountTotal, nodeReserved)
}
testResults = append(testResults, metric.ValidateMetrics(t.env, gpuMetricIndicator, expectedDimsToMetrics)...)
testResults = append(testResults, metric.ValidateLogs(t.env))
Expand Down
3 changes: 3 additions & 0 deletions test/metric/container_insights_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,14 @@ func ValidateMetrics(env *environment.MetaData, metricFilter string, expectedDim
dimsToMetrics := getMetricsInClusterDimension(env, metricFilter)
for dims, metrics := range expectedDimsToMetrics {
var actual map[string][][]types.Dimension
// find matching dim set from fetched and processed metric-dims groups
for _, dtm := range dimsToMetrics {
if dtm.dimStr == dims {
actual = dtm.metrics
break
}
}
// expected dim set doesn't exist
if len(actual) < 1 {
results = append(results, status.TestResult{
Name: dims,
Expand Down Expand Up @@ -142,6 +144,7 @@ func validateMetricsAvailability(dims string, expected []string, actual map[stri

func compareMetrics(expected []string, actual map[string][][]types.Dimension) bool {
if len(expected) != len(actual) {
log.Printf("the count of fetched metrics do not match with expected count: expected-%v, actual-%v\n", len(expected), len(actual))
return false
}

Expand Down

0 comments on commit f0f0735

Please sign in to comment.