From c638bfa49b51b75d909ceb1fe301b39703e9b99e Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim <884273+movence@users.noreply.github.com> Date: Mon, 8 Apr 2024 13:56:15 -0400 Subject: [PATCH] add nvidia gpu integ test (#399) --- environment/metadata.go | 5 + generator/test_case_generator.go | 4 + terraform/eks/daemon/gpu/main.tf | 719 ++++++++++++++++++ terraform/eks/daemon/gpu/providers.tf | 17 + terraform/eks/daemon/gpu/variables.tf | 37 + test/gpu/gpu_test.go | 79 ++ test/gpu/nvidia_test.go | 118 +++ test/gpu/resources/config.json | 16 + test/gpu/resources/httpd-ssl.conf | 43 ++ test/gpu/resources/httpd.conf | 101 +++ test/metric/container_insights_util.go | 222 ++++++ test/metric/metric_list_query.go | 30 +- test/metric/stat.go | 3 +- .../eks_daemonset_test.go | 231 ++---- .../test_schemas/cluster_gpu.json | 21 + .../test_schemas/container_gpu.json | 45 ++ .../eks_resources/test_schemas/node_gpu.json | 44 ++ .../eks_resources/test_schemas/pod_gpu.json | 47 ++ .../eks_resources/util.go | 12 + util/awsservice/cloudwatchmetrics.go | 2 +- 20 files changed, 1577 insertions(+), 219 deletions(-) create mode 100644 terraform/eks/daemon/gpu/main.tf create mode 100644 terraform/eks/daemon/gpu/providers.tf create mode 100644 terraform/eks/daemon/gpu/variables.tf create mode 100644 test/gpu/gpu_test.go create mode 100644 test/gpu/nvidia_test.go create mode 100644 test/gpu/resources/config.json create mode 100644 test/gpu/resources/httpd-ssl.conf create mode 100644 test/gpu/resources/httpd.conf create mode 100644 test/metric/container_insights_util.go create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/cluster_gpu.json create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/container_gpu.json create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/node_gpu.json create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/pod_gpu.json diff --git a/environment/metadata.go b/environment/metadata.go index c9c0162ab..7a0e1e965 100644 --- a/environment/metadata.go +++ b/environment/metadata.go @@ -43,6 +43,7 @@ type MetaData struct { InstanceId string InstancePlatform string AgentStartCommand string + EksGpuType string } type MetaDataStrings struct { @@ -65,6 +66,7 @@ type MetaDataStrings struct { InstanceId string InstancePlatform string AgentStartCommand string + EksGpuType string } func registerComputeType(dataString *MetaDataStrings) { @@ -94,6 +96,7 @@ func registerECSData(dataString *MetaDataStrings) { func registerEKSData(d *MetaDataStrings) { flag.StringVar(&(d.EKSClusterName), "eksClusterName", "", "EKS cluster name") flag.StringVar(&(d.EksDeploymentStrategy), "eksDeploymentStrategy", "", "Daemon/Replica/Sidecar") + flag.StringVar(&(d.EksGpuType), "eksGpuType", "", "nvidia/inferentia") } func registerPluginTestsToExecute(dataString *MetaDataStrings) { @@ -210,6 +213,7 @@ func fillEKSData(e *MetaData, data *MetaDataStrings) { } e.EKSClusterName = data.EKSClusterName + e.EksGpuType = data.EksGpuType } func RegisterEnvironmentMetaDataFlags() *MetaDataStrings { registerComputeType(registeredMetaDataStrings) @@ -250,6 +254,7 @@ func GetEnvironmentMetaData() *MetaData { metaDataStorage.InstanceId = registeredMetaDataStrings.InstanceId metaDataStorage.InstancePlatform = registeredMetaDataStrings.InstancePlatform metaDataStorage.AgentStartCommand = registeredMetaDataStrings.AgentStartCommand + metaDataStorage.EksGpuType = registeredMetaDataStrings.EksGpuType return metaDataStorage } diff --git a/generator/test_case_generator.go 
b/generator/test_case_generator.go index e17b63e0d..056670d43 100644 --- a/generator/test_case_generator.go +++ b/generator/test_case_generator.go @@ -214,6 +214,10 @@ var testTypeToTestConfig = map[string][]testConfig{ targets: map[string]map[string]struct{}{"arc": {"amd64": {}}}, }, {testDir: "./test/fluent", terraformDir: "terraform/eks/daemon/fluent/windows/2022"}, + { + testDir: "./test/gpu", terraformDir: "terraform/eks/daemon/gpu", + targets: map[string]map[string]struct{}{"arc": {"amd64": {}}}, + }, }, "eks_deployment": { {testDir: "./test/metric_value_benchmark"}, diff --git a/terraform/eks/daemon/gpu/main.tf b/terraform/eks/daemon/gpu/main.tf new file mode 100644 index 000000000..7952ef45a --- /dev/null +++ b/terraform/eks/daemon/gpu/main.tf @@ -0,0 +1,719 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +module "common" { + source = "../../../common" + cwagent_image_repo = var.cwagent_image_repo + cwagent_image_tag = var.cwagent_image_tag +} + +module "basic_components" { + source = "../../../basic_components" + + region = var.region +} + +data "aws_eks_cluster_auth" "this" { + name = aws_eks_cluster.this.name +} + +resource "aws_eks_cluster" "this" { + name = "cwagent-eks-integ-${module.common.testing_id}" + role_arn = module.basic_components.role_arn + version = var.k8s_version + enabled_cluster_log_types = [ + "api", + "audit", + "authenticator", + "controllerManager", + "scheduler" + ] + vpc_config { + subnet_ids = module.basic_components.public_subnet_ids + security_group_ids = [module.basic_components.security_group] + } +} + +# EKS Node Groups +resource "aws_eks_node_group" "this" { + cluster_name = aws_eks_cluster.this.name + node_group_name = "cwagent-eks-integ-node" + node_role_arn = aws_iam_role.node_role.arn + subnet_ids = module.basic_components.public_subnet_ids + + scaling_config { + desired_size = 1 + max_size = 1 + min_size = 1 + } + + ami_type = "AL2_x86_64" + capacity_type = "ON_DEMAND" + disk_size = 20 + instance_types = ["t3.medium"] + + depends_on = [ + aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly, + aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy, + aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy, + aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy + ] +} + +# EKS Node IAM Role +resource "aws_iam_role" "node_role" { + name = "cwagent-eks-Worker-Role-${module.common.testing_id}" + assume_role_policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Effect = "Allow", + Principal = { + Service = "ec2.amazonaws.com" + }, + Action = "sts:AssumeRole" + } + ] + }) + +} + +resource "aws_iam_role_policy_attachment" "node_AmazonEKSWorkerNodePolicy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" + role = aws_iam_role.node_role.name +} + +resource "aws_iam_role_policy_attachment" "node_AmazonEKS_CNI_Policy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy" + role = aws_iam_role.node_role.name +} + +resource "aws_iam_role_policy_attachment" "node_AmazonEC2ContainerRegistryReadOnly" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" + role = aws_iam_role.node_role.name +} + +resource "aws_iam_role_policy_attachment" "node_CloudWatchAgentServerPolicy" { + policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy" + role = aws_iam_role.node_role.name +} + +# TODO: these security groups be created once and then reused +# EKS Cluster Security Group +resource 
"aws_security_group" "eks_cluster_sg" { + name = "cwagent-eks-cluster-sg-${module.common.testing_id}" + description = "Cluster communication with worker nodes" + vpc_id = module.basic_components.vpc_id +} + +resource "aws_security_group_rule" "cluster_inbound" { + description = "Allow worker nodes to communicate with the cluster API Server" + from_port = 443 + protocol = "tcp" + security_group_id = aws_security_group.eks_cluster_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 443 + type = "ingress" +} + +resource "aws_security_group_rule" "cluster_outbound" { + description = "Allow cluster API Server to communicate with the worker nodes" + from_port = 1024 + protocol = "tcp" + security_group_id = aws_security_group.eks_cluster_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 65535 + type = "egress" +} + + +# EKS Node Security Group +resource "aws_security_group" "eks_nodes_sg" { + name = "cwagent-eks-node-sg-${module.common.testing_id}" + description = "Security group for all nodes in the cluster" + vpc_id = module.basic_components.vpc_id + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_security_group_rule" "nodes_internal" { + description = "Allow nodes to communicate with each other" + from_port = 0 + protocol = "-1" + security_group_id = aws_security_group.eks_nodes_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 65535 + type = "ingress" +} + +resource "aws_security_group_rule" "nodes_cluster_inbound" { + description = "Allow worker Kubelets and pods to receive communication from the cluster control plane" + from_port = 1025 + protocol = "tcp" + security_group_id = aws_security_group.eks_nodes_sg.id + source_security_group_id = aws_security_group.eks_cluster_sg.id + to_port = 65535 + type = "ingress" +} + + +# create cert for communication between agent and dcgm +resource "tls_private_key" "private_key" { + algorithm = "RSA" +} + +resource "local_file" "ca_key" { + content = tls_private_key.private_key.private_key_pem + filename = "${path.module}/certs/ca.key" +} + +resource "tls_self_signed_cert" "ca_cert" { + private_key_pem = tls_private_key.private_key.private_key_pem + is_ca_certificate = true + subject { + common_name = "dcgm-exporter-service.amazon-cloudwatch.svc" + organization = "Amazon CloudWatch Agent" + } + validity_period_hours = 24 + allowed_uses = [ + "digital_signature", + "key_encipherment", + "cert_signing", + "crl_signing", + "server_auth", + "client_auth", + ] +} + +resource "local_file" "ca_cert_file" { + content = tls_self_signed_cert.ca_cert.cert_pem + filename = "${path.module}/certs/ca.cert" +} + +resource "tls_private_key" "server_private_key" { + algorithm = "RSA" +} + +resource "local_file" "server_key" { + content = tls_private_key.server_private_key.private_key_pem + filename = "${path.module}/certs/server.key" +} + +resource "tls_cert_request" "local_csr" { + private_key_pem = tls_private_key.server_private_key.private_key_pem + dns_names = ["localhost", "127.0.0.1", "dcgm-exporter-service.amazon-cloudwatch.svc"] + subject { + common_name = "dcgm-exporter-service.amazon-cloudwatch.svc" + organization = "Amazon CloudWatch Agent" + } +} + +resource "tls_locally_signed_cert" "server_cert" { + cert_request_pem = tls_cert_request.local_csr.cert_request_pem + ca_private_key_pem = tls_private_key.private_key.private_key_pem + ca_cert_pem = tls_self_signed_cert.ca_cert.cert_pem + 
validity_period_hours = 12 + allowed_uses = [ + "digital_signature", + "key_encipherment", + "server_auth", + "client_auth", + ] +} + +resource "local_file" "server_cert_file" { + content = tls_locally_signed_cert.server_cert.cert_pem + filename = "${path.module}/certs/server.cert" +} + +resource "kubernetes_secret" "agent_cert" { + metadata { + name = "amazon-cloudwatch-observability-agent-cert" + namespace = "amazon-cloudwatch" + } + data = { + "ca.crt" = tls_self_signed_cert.ca_cert.cert_pem #filebase64(local_file.ca_cert_file.filename) + "tls.crt" = tls_locally_signed_cert.server_cert.cert_pem #filebase64(local_file.server_cert_file.filename) + "tls.key" = tls_private_key.server_private_key.private_key_pem #filebase64(local_file.server_key.filename) + } +} + + +resource "kubernetes_namespace" "namespace" { + metadata { + name = "amazon-cloudwatch" + } +} + +# dummy daemonset that simulates dcgm-exporter assuming there is only 1 GPU available +resource "kubernetes_daemonset" "exporter" { + depends_on = [ + kubernetes_namespace.namespace, + kubernetes_service_account.cwagentservice, + aws_eks_node_group.this, + kubernetes_config_map.httpdconfig, + ] + metadata { + name = "dcgm-exporter" + namespace = "amazon-cloudwatch" + labels = { + k8s-app = "dcgm-exporter" + } + } + spec { + selector { + match_labels = { + "k8s-app" = "dcgm-exporter" + } + } + template { + metadata { + labels = { + "name" : "dcgm-exporter" + "k8s-app" : "dcgm-exporter" + } + } + spec { + node_selector = { + "kubernetes.io/os" : "linux" + } + container { + name = "dcgm-exporter" + image = "httpd:2.4-alpine" + resources { + limits = { + "cpu" : "50m", + "memory" : "50Mi" + } + requests = { + "cpu" : "50m", + "memory" : "50Mi" + } + } + port { + name = "metrics" + container_port = 9400 + host_port = 9400 + protocol = "TCP" + } + command = [ + "/bin/sh", + "-c", + ] + args = [ + "/bin/echo 'DCGM_FI_DEV_GPU_UTIL{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_FREE{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_USED{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_TOTAL{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_USED_PERCENT{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_GPU_TEMP{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_POWER_USAGE{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1' >> /usr/local/apache2/htdocs/metrics && sed -i -e \"s/hostname1/$HOST_NAME/g\" /usr/local/apache2/htdocs/metrics && httpd-foreground -k restart" + ] + volume_mount { + mount_path = "/etc/amazon-cloudwatch-observability-dcgm-cert" + name = 
"dcgmtls" + read_only = true + } + volume_mount { + mount_path = "/usr/local/apache2/conf/httpd.conf" + sub_path = "httpd.conf" + name = "httpdconfig" + read_only = true + } + volume_mount { + mount_path = "/usr/local/apache2/conf/extra/httpd-ssl.conf" + sub_path = "httpd-ssl.conf" + name = "httpdconfig" + read_only = true + } + env { + name = "HOST_IP" + value_from { + field_ref { + field_path = "status.hostIP" + } + } + } + env { + name = "HOST_NAME" + value_from { + field_ref { + field_path = "spec.nodeName" + } + } + } + env { + name = "K8S_NAMESPACE" + value_from { + field_ref { + field_path = "metadata.namespace" + } + } + } + } + volume { + name = "dcgmtls" + secret { + secret_name = "amazon-cloudwatch-observability-agent-cert" + items { + key = "tls.crt" + path = "server.crt" + } + items { + key = "tls.key" + path = "server.key" + } + } + } + volume { + name = "httpdconfig" + config_map { + name = "httpdconfig" + } + } + service_account_name = "cloudwatch-agent" + termination_grace_period_seconds = 60 + } + } + } +} + +resource "kubernetes_service" "exporter" { + depends_on = [ + kubernetes_namespace.namespace, + kubernetes_service_account.cwagentservice, + aws_eks_node_group.this, + kubernetes_daemonset.exporter + ] + metadata { + name = "dcgm-exporter-service" + namespace = "amazon-cloudwatch" + labels = { + "k8s-app" : "dcgm-exporter-service" + } + annotations = { + "prometheus.io/scrape" : "true" + } + } + spec { + type = "ClusterIP" + selector = { + k8s-app = "dcgm-exporter" + } + port { + name = "metrics" + port = 9400 + target_port = 9400 + protocol = "TCP" + } + } +} + +resource "kubernetes_daemonset" "service" { + depends_on = [ + kubernetes_namespace.namespace, + kubernetes_service_account.cwagentservice, + aws_eks_node_group.this, + kubernetes_service.exporter + ] + metadata { + name = "cloudwatch-agent" + namespace = "amazon-cloudwatch" + } + spec { + selector { + match_labels = { + "name" : "cloudwatch-agent" + } + } + template { + metadata { + labels = { + "name" : "cloudwatch-agent" + } + } + spec { + node_selector = { + "kubernetes.io/os" : "linux" + } + container { + name = "cwagent" + image = "${var.cwagent_image_repo}:${var.cwagent_image_tag}" + image_pull_policy = "Always" + resources { + limits = { + "cpu" : "200m", + "memory" : "200Mi" + } + requests = { + "cpu" : "200m", + "memory" : "200Mi" + } + } + port { + container_port = 25888 + host_port = 25888 + protocol = "UDP" + } + env { + name = "HOST_IP" + value_from { + field_ref { + field_path = "status.hostIP" + } + } + } + env { + name = "HOST_NAME" + value_from { + field_ref { + field_path = "spec.nodeName" + } + } + } + env { + name = "K8S_NAMESPACE" + value_from { + field_ref { + field_path = "metadata.namespace" + } + } + } + volume_mount { + mount_path = "/etc/cwagentconfig" + name = "cwagentconfig" + } + volume_mount { + mount_path = "/rootfs" + name = "rootfs" + read_only = true + } + volume_mount { + mount_path = "/var/run/docker.sock" + name = "dockersock" + read_only = true + } + volume_mount { + mount_path = "/var/lib/docker" + name = "varlibdocker" + read_only = true + } + volume_mount { + mount_path = "/run/containerd/containerd.sock" + name = "containerdsock" + read_only = true + } + volume_mount { + mount_path = "/sys" + name = "sys" + read_only = true + } + volume_mount { + mount_path = "/dev/disk" + name = "devdisk" + read_only = true + } + volume_mount { + mount_path = "/etc/amazon-cloudwatch-observability-agent-cert" + name = "agenttls" + read_only = true + } + } + volume { + name = 
"cwagentconfig" + config_map { + name = "cwagentconfig" + } + } + volume { + name = "rootfs" + host_path { + path = "/" + } + } + volume { + name = "dockersock" + host_path { + path = "/var/run/docker.sock" + } + } + volume { + name = "varlibdocker" + host_path { + path = "/var/lib/docker" + } + } + volume { + name = "containerdsock" + host_path { + path = "/run/containerd/containerd.sock" + } + } + volume { + name = "sys" + host_path { + path = "/sys" + } + } + volume { + name = "devdisk" + host_path { + path = "/dev/disk" + } + } + volume { + name = "agenttls" + secret { + secret_name = "amazon-cloudwatch-observability-agent-cert" + items { + key = "ca.crt" + path = "tls-ca.crt" + } + } + } + service_account_name = "cloudwatch-agent" + termination_grace_period_seconds = 60 + } + } + } +} + +########################################## +# Template Files +########################################## +locals { + httpd_config = "../../../../${var.test_dir}/resources/httpd.conf" + httpd_ssl_config = "../../../../${var.test_dir}/resources/httpd-ssl.conf" + cwagent_config = fileexists("../../../../${var.test_dir}/resources/config.json") ? "../../../../${var.test_dir}/resources/config.json" : "../default_resources/default_amazon_cloudwatch_agent.json" +} + +data "template_file" "cwagent_config" { + template = file(local.cwagent_config) + vars = { + } +} + +resource "kubernetes_config_map" "cwagentconfig" { + depends_on = [ + kubernetes_namespace.namespace, + kubernetes_service_account.cwagentservice + ] + metadata { + name = "cwagentconfig" + namespace = "amazon-cloudwatch" + } + data = { + "cwagentconfig.json" : data.template_file.cwagent_config.rendered + } +} + +data "template_file" "httpd_config" { + template = file(local.httpd_config) + vars = {} +} +data "template_file" "httpd_ssl_config" { + template = file(local.httpd_ssl_config) + vars = {} +} + +resource "kubernetes_config_map" "httpdconfig" { + depends_on = [ + kubernetes_namespace.namespace, + kubernetes_service_account.cwagentservice + ] + metadata { + name = "httpdconfig" + namespace = "amazon-cloudwatch" + } + data = { + "httpd.conf" : data.template_file.httpd_config.rendered + "httpd-ssl.conf" : data.template_file.httpd_ssl_config.rendered + } +} + +resource "kubernetes_service_account" "cwagentservice" { + depends_on = [kubernetes_namespace.namespace] + metadata { + name = "cloudwatch-agent" + namespace = "amazon-cloudwatch" + } +} + +resource "kubernetes_cluster_role" "clusterrole" { + depends_on = [kubernetes_namespace.namespace] + metadata { + name = "cloudwatch-agent-role" + } + rule { + verbs = ["get", "list", "watch"] + resources = ["pods", "pods/logs", "nodes", "nodes/proxy", "namespaces", "endpoints"] + api_groups = [""] + } + rule { + verbs = ["list", "watch"] + resources = ["replicasets"] + api_groups = ["apps"] + } + rule { + verbs = ["list", "watch"] + resources = ["jobs"] + api_groups = ["batch"] + } + rule { + verbs = ["get"] + resources = ["nodes/proxy"] + api_groups = [""] + } + rule { + verbs = ["create"] + resources = ["nodes/stats", "configmaps", "events"] + api_groups = [""] + } + rule { + verbs = ["get", "update"] + resource_names = ["cwagent-clusterleader"] + resources = ["configmaps"] + api_groups = [""] + } + rule { + verbs = ["list", "watch"] + resources = ["services"] + api_groups = [""] + } + rule { + non_resource_urls = ["/metrics"] + verbs = ["get", "list", "watch"] + } +} + +resource "kubernetes_cluster_role_binding" "rolebinding" { + depends_on = [kubernetes_namespace.namespace] + metadata { + name = 
"cloudwatch-agent-role-binding" + } + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = "cloudwatch-agent-role" + } + subject { + kind = "ServiceAccount" + name = "cloudwatch-agent" + namespace = "amazon-cloudwatch" + } +} + +resource "null_resource" "validator" { + depends_on = [ + aws_eks_node_group.this, + kubernetes_daemonset.service, + kubernetes_cluster_role_binding.rolebinding, + kubernetes_service_account.cwagentservice, + ] + provisioner "local-exec" { + command = <<-EOT + echo "Validating EKS metrics/logs for EMF" + cd ../../../.. + go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia + EOT + } +} diff --git a/terraform/eks/daemon/gpu/providers.tf b/terraform/eks/daemon/gpu/providers.tf new file mode 100644 index 000000000..9bd2885f5 --- /dev/null +++ b/terraform/eks/daemon/gpu/providers.tf @@ -0,0 +1,17 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +provider "aws" { + region = var.region +} + +provider "kubernetes" { + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + args = ["eks", "get-token", "--cluster-name", aws_eks_cluster.this.name] + } + host = aws_eks_cluster.this.endpoint + cluster_ca_certificate = base64decode(aws_eks_cluster.this.certificate_authority.0.data) + token = data.aws_eks_cluster_auth.this.token +} \ No newline at end of file diff --git a/terraform/eks/daemon/gpu/variables.tf b/terraform/eks/daemon/gpu/variables.tf new file mode 100644 index 000000000..26a0e6cd0 --- /dev/null +++ b/terraform/eks/daemon/gpu/variables.tf @@ -0,0 +1,37 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +variable "region" { + type = string + default = "us-west-2" +} + +variable "test_dir" { + type = string + default = "./test/gpu" +} + +variable "cwagent_image_repo" { + type = string + default = "public.ecr.aws/cloudwatch-agent/cloudwatch-agent" +} + +variable "cwagent_image_tag" { + type = string + default = "latest" +} + +variable "k8s_version" { + type = string + default = "1.28" +} + +variable "ami_type" { + type = string + default = "AL2_x86_64" +} + +variable "instance_type" { + type = string + default = "g4dn.xlarge" +} \ No newline at end of file diff --git a/test/gpu/gpu_test.go b/test/gpu/gpu_test.go new file mode 100644 index 000000000..352f9cad1 --- /dev/null +++ b/test/gpu/gpu_test.go @@ -0,0 +1,79 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT + +//go:build !windows + +package emf + +import ( + "fmt" + "log" + "testing" + + "github.com/stretchr/testify/suite" + + "github.com/aws/amazon-cloudwatch-agent-test/environment" + "github.com/aws/amazon-cloudwatch-agent-test/environment/computetype" + "github.com/aws/amazon-cloudwatch-agent-test/test/metric/dimension" + "github.com/aws/amazon-cloudwatch-agent-test/test/status" + "github.com/aws/amazon-cloudwatch-agent-test/test/test_runner" +) + +type GPUTestSuite struct { + suite.Suite + test_runner.TestSuite +} + +func (suite *GPUTestSuite) SetupSuite() { + fmt.Println(">>>> Starting GPU Container Insights TestSuite") +} + +func (suite *GPUTestSuite) TearDownSuite() { + suite.Result.Print() + fmt.Println(">>>> Finished GPU Container Insights TestSuite") +} + +func init() { + environment.RegisterEnvironmentMetaDataFlags() +} + +var ( + eksTestRunners []*test_runner.EKSTestRunner +) + +func getEksTestRunners(env *environment.MetaData) []*test_runner.EKSTestRunner { + if eksTestRunners == nil { + factory := dimension.GetDimensionFactory(*env) + + eksTestRunners = []*test_runner.EKSTestRunner{ + { + Runner: &NvidiaTestRunner{test_runner.BaseTestRunner{DimensionFactory: factory}, "EKS_GPU_NVIDIA", env}, + Env: *env, + }, + } + } + return eksTestRunners +} + +func (suite *GPUTestSuite) TestAllInSuite() { + env := environment.GetEnvironmentMetaData() + switch env.ComputeType { + case computetype.EKS: + log.Println("Environment compute type is EKS") + for _, testRunner := range getEksTestRunners(env) { + testRunner.Run(suite, env) + } + default: + return + } + + suite.Assert().Equal(status.SUCCESSFUL, suite.Result.GetStatus(), "GPU Container Test Suite Failed") +} + +func (suite *GPUTestSuite) AddToSuiteResult(r status.TestGroupResult) { + suite.Result.TestGroupResults = append(suite.Result.TestGroupResults, r) +} + +func TestGPUSuite(t *testing.T) { + suite.Run(t, new(GPUTestSuite)) +} diff --git a/test/gpu/nvidia_test.go b/test/gpu/nvidia_test.go new file mode 100644 index 000000000..ced990b36 --- /dev/null +++ b/test/gpu/nvidia_test.go @@ -0,0 +1,118 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT + +//go:build !windows + +package emf + +import ( + "time" + + "github.com/aws/amazon-cloudwatch-agent-test/environment" + "github.com/aws/amazon-cloudwatch-agent-test/test/metric" + "github.com/aws/amazon-cloudwatch-agent-test/test/status" + "github.com/aws/amazon-cloudwatch-agent-test/test/test_runner" +) + +const ( + gpuMetricIndicator = "_gpu_" + + containerMemTotal = "container_gpu_memory_total" + containerMemUsed = "container_gpu_memory_used" + containerPower = "container_gpu_power_draw" + containerTemp = "container_gpu_temperature" + containerUtil = "container_gpu_utilization" + containerMemUtil = "container_gpu_memory_utilization" + podMemTotal = "pod_gpu_memory_total" + podMemUsed = "pod_gpu_memory_used" + podPower = "pod_gpu_power_draw" + podTemp = "pod_gpu_temperature" + podUtil = "pod_gpu_utilization" + podMemUtil = "pod_gpu_memory_utilization" + nodeMemTotal = "node_gpu_memory_total" + nodeMemUsed = "node_gpu_memory_used" + nodePower = "node_gpu_power_draw" + nodeTemp = "node_gpu_temperature" + nodeUtil = "node_gpu_utilization" + nodeMemUtil = "node_gpu_memory_utilization" + nodeCountTotal = "node_gpu_total" + nodeCountRequest = "node_gpu_request" + nodeCountLimit = "node_gpu_limit" + clusterCountTotal = "cluster_gpu_total" + clusterCountRequest = "cluster_gpu_request" +) + +var expectedDimsToMetrics = map[string][]string{ + "ClusterName": { + containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil, + podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil, + //nodeCountTotal, nodeCountRequest, nodeCountLimit, + //clusterCountTotal, clusterCountRequest, + }, + "ClusterName-Namespace": { + podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + }, + //"ClusterName-Namespace-Service": { + // podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + //}, + "ClusterName-Namespace-PodName": { + podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + }, + "ClusterName-ContainerName-Namespace-PodName": { + containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil, + }, + "ClusterName-ContainerName-FullPodName-Namespace-PodName": { + containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil, + }, + "ClusterName-ContainerName-FullPodName-GpuDevice-Namespace-PodName": { + containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil, + }, + "ClusterName-FullPodName-Namespace-PodName": { + podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + }, + "ClusterName-FullPodName-GpuDevice-Namespace-PodName": { + podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + }, + "ClusterName-InstanceId-NodeName": { + nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil, + //nodeCountTotal, nodeCountRequest, nodeCountLimit, + }, + "ClusterName-GpuDevice-InstanceId-InstanceType-NodeName": { + nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil, + }, +} + +type NvidiaTestRunner struct { + test_runner.BaseTestRunner + testName string + env *environment.MetaData +} + +var _ test_runner.ITestRunner = (*NvidiaTestRunner)(nil) + +func (t *NvidiaTestRunner) Validate() status.TestGroupResult { + var testResults []status.TestResult + testResults = append(testResults, metric.ValidateMetrics(t.env, gpuMetricIndicator, expectedDimsToMetrics)...) 
+ testResults = append(testResults, metric.ValidateLogs(t.env)) + return status.TestGroupResult{ + Name: t.GetTestName(), + TestResults: testResults, + } +} + +func (t *NvidiaTestRunner) GetTestName() string { + return t.testName +} + +func (t *NvidiaTestRunner) GetAgentConfigFileName() string { + return "" +} + +func (t *NvidiaTestRunner) GetAgentRunDuration() time.Duration { + return 3 * time.Minute +} + +func (t *NvidiaTestRunner) GetMeasuredMetrics() []string { + return nil +} diff --git a/test/gpu/resources/config.json b/test/gpu/resources/config.json new file mode 100644 index 000000000..6f37e43ed --- /dev/null +++ b/test/gpu/resources/config.json @@ -0,0 +1,16 @@ +{ + "agent": { + "metrics_collection_interval": 15, + "run_as_user": "root", + "debug": true, + "logfile": "" + }, + "logs": { + "metrics_collected": { + "kubernetes": { + "enhanced_container_insights": true + } + }, + "force_flush_interval": 5 + } +} \ No newline at end of file diff --git a/test/gpu/resources/httpd-ssl.conf b/test/gpu/resources/httpd-ssl.conf new file mode 100644 index 000000000..8e441a2cd --- /dev/null +++ b/test/gpu/resources/httpd-ssl.conf @@ -0,0 +1,43 @@ +Listen 9400 + +SSLCipherSuite HIGH:MEDIUM:!MD5:!RC4:!3DES +SSLProxyCipherSuite HIGH:MEDIUM:!MD5:!RC4:!3DES + +SSLHonorCipherOrder on + +SSLProtocol all -SSLv3 +SSLProxyProtocol all -SSLv3 + +SSLPassPhraseDialog builtin + +SSLSessionCache "shmcb:/usr/local/apache2/logs/ssl_scache(512000)" +SSLSessionCacheTimeout 300 + + + + +DocumentRoot "/usr/local/apache2/htdocs" +ServerName dcgm-exporter-service.amazon-cloudwatch.svc:9400 +ServerAdmin you@example.com +ErrorLog /proc/self/fd/2 +TransferLog /proc/self/fd/1 + +SSLEngine on +SSLCertificateFile "/etc/amazon-cloudwatch-observability-dcgm-cert/server.crt" +SSLCertificateKeyFile "/etc/amazon-cloudwatch-observability-dcgm-cert/server.key" + + + SSLOptions +StdEnvVars + + + SSLOptions +StdEnvVars + + +BrowserMatch "MSIE [2-5]" \ + nokeepalive ssl-unclean-shutdown \ + downgrade-1.0 force-response-1.0 + +CustomLog /proc/self/fd/1 \ + "%t %h %%{SSL_PROTOCOL}x $%{SSL_CIPHER}x \"%r\" %b" + + \ No newline at end of file diff --git a/test/gpu/resources/httpd.conf b/test/gpu/resources/httpd.conf new file mode 100644 index 000000000..058db5063 --- /dev/null +++ b/test/gpu/resources/httpd.conf @@ -0,0 +1,101 @@ + +ServerRoot "/usr/local/apache2" + +#Listen 9400 + +LoadModule mpm_event_module modules/mod_mpm_event.so +LoadModule authn_file_module modules/mod_authn_file.so +LoadModule authn_core_module modules/mod_authn_core.so +LoadModule authz_host_module modules/mod_authz_host.so +LoadModule authz_groupfile_module modules/mod_authz_groupfile.so +LoadModule authz_user_module modules/mod_authz_user.so +LoadModule authz_core_module modules/mod_authz_core.so +LoadModule access_compat_module modules/mod_access_compat.so +LoadModule auth_basic_module modules/mod_auth_basic.so +LoadModule socache_shmcb_module modules/mod_socache_shmcb.so +LoadModule reqtimeout_module modules/mod_reqtimeout.so +LoadModule filter_module modules/mod_filter.so +LoadModule mime_module modules/mod_mime.so +LoadModule log_config_module modules/mod_log_config.so +LoadModule env_module modules/mod_env.so +LoadModule headers_module modules/mod_headers.so +LoadModule setenvif_module modules/mod_setenvif.so +LoadModule version_module modules/mod_version.so +LoadModule ssl_module modules/mod_ssl.so +LoadModule unixd_module modules/mod_unixd.so +LoadModule status_module modules/mod_status.so +LoadModule autoindex_module modules/mod_autoindex.so 
+LoadModule dir_module modules/mod_dir.so +LoadModule alias_module modules/mod_alias.so + + +User www-data +Group www-data + + + + AllowOverride none + Require all denied + + +DocumentRoot "/usr/local/apache2/htdocs" + + Options Indexes FollowSymLinks + AllowOverride None + Require all granted + + + + DirectoryIndex index.html + + + + Require all denied + + +ErrorLog /proc/self/fd/2 + +LogLevel warn + + + LogFormat "%h %l %u %t \"%r\" %>s %b \"%%{Referer}i\" \"%%{User-Agent}i\"" combined + LogFormat "%h %l %u %t \"%r\" %>s %b" common + + + # You need to enable mod_logio.c to use %I and %O + LogFormat "%h %l %u %t \"%r\" %>s %b \"%%{Referer}i\" \"%%{User-Agent}i\" %I %O" combinedio + + + CustomLog /proc/self/fd/1 common + + + + ScriptAlias /cgi-bin/ "/usr/local/apache2/cgi-bin/" + + + + AllowOverride None + Options None + Require all granted + + + + RequestHeader unset Proxy early + + + + TypesConfig conf/mime.types + AddType application/x-compress .Z + AddType application/x-gzip .gz .tgz + + + +Include conf/extra/proxy-html.conf + + +# Secure (SSL/TLS) connections +Include conf/extra/httpd-ssl.conf + +SSLRandomSeed startup builtin +SSLRandomSeed connect builtin + \ No newline at end of file diff --git a/test/metric/container_insights_util.go b/test/metric/container_insights_util.go new file mode 100644 index 000000000..140c839c1 --- /dev/null +++ b/test/metric/container_insights_util.go @@ -0,0 +1,222 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +//go:build !windows + +package metric + +import ( + "encoding/json" + "errors" + "fmt" + "log" + "math/rand" + "sort" + "strings" + "time" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types" + + "github.com/aws/amazon-cloudwatch-agent-test/environment" + "github.com/aws/amazon-cloudwatch-agent-test/test/metric_value_benchmark/eks_resources" + "github.com/aws/amazon-cloudwatch-agent-test/test/status" + "github.com/aws/amazon-cloudwatch-agent-test/util/awsservice" +) + +const ( + dimDelimiter = "-" + ContainerInsightsNamespace = "ContainerInsights" +) + +type dimToMetrics struct { + // dim keys as string with dimDelimiter(-) eg. ClusterName-Namespace + dimStr string + // metric names to their dimensions with values. Dimension sets will be used for metric data validations + metrics map[string][][]types.Dimension +} + +func ValidateMetrics(env *environment.MetaData, metricFilter string, expectedDimsToMetrics map[string][]string) []status.TestResult { + var results []status.TestResult + dimsToMetrics := getMetricsInClusterDimension(env, metricFilter) + for dims, metrics := range expectedDimsToMetrics { + var actual map[string][][]types.Dimension + for _, dtm := range dimsToMetrics { + if dtm.dimStr == dims { + actual = dtm.metrics + break + } + } + if len(actual) < 1 { + results = append(results, status.TestResult{ + Name: dims, + Status: status.FAILED, + }) + log.Printf("ValidateMetrics failed with missing dimension set: %s", dims) + // keep testing other dims or fail early? 
+ continue + } + results = append(results, validateMetricsAvailability(dims, metrics, actual)) + for _, m := range metrics { + // pick a random dimension set to test metric data OR test all dimension sets which might be overkill + randIdx := rand.Intn(len(actual[m])) + results = append(results, validateMetricValue(m, actual[m][randIdx])) + } + } + return results +} + +func getMetricsInClusterDimension(env *environment.MetaData, metricFilter string) []dimToMetrics { //map[string]map[string]interface{} { + listFetcher := Fetcher{} + log.Printf("Fetching by cluster dimension") + dims := []types.Dimension{ + { + Name: aws.String("ClusterName"), + Value: aws.String(env.EKSClusterName), + }, + } + metrics, err := listFetcher.Fetch(ContainerInsightsNamespace, "", dims) + if err != nil { + log.Println("failed to fetch metric list", err) + return nil + } + if len(metrics) < 1 { + log.Println("cloudwatch metric list is empty") + return nil + } + + var results []dimToMetrics + for _, m := range metrics { + // filter by metric name filter + if metricFilter != "" && !strings.Contains(*m.MetricName, metricFilter) { + continue + } + var dims []string + for _, d := range m.Dimensions { + dims = append(dims, *d.Name) + } + sort.Sort(sort.StringSlice(dims)) + dimsKey := strings.Join(dims, dimDelimiter) + log.Printf("processing dims: %s", dimsKey) + + var dtm dimToMetrics + for _, ele := range results { + if ele.dimStr == dimsKey { + dtm = ele + break + } + } + if dtm.dimStr == "" { + dtm = dimToMetrics{ + dimStr: dimsKey, + metrics: make(map[string][][]types.Dimension), + } + results = append(results, dtm) + } + dtm.metrics[*m.MetricName] = append(dtm.metrics[*m.MetricName], m.Dimensions) + } + return results +} + +func validateMetricsAvailability(dims string, expected []string, actual map[string][][]types.Dimension) status.TestResult { + testResult := status.TestResult{ + Name: dims, + Status: status.FAILED, + } + log.Printf("expected metrics: %d, actual metrics: %d", len(expected), len(actual)) + if compareMetrics(expected, actual) { + testResult.Status = status.SUCCESSFUL + } else { + log.Printf("validateMetricsAvailability failed for %s", dims) + } + return testResult +} + +func compareMetrics(expected []string, actual map[string][][]types.Dimension) bool { + if len(expected) != len(actual) { + return false + } + + for _, key := range expected { + if _, ok := actual[key]; !ok { + return false + } + } + return true +} + +func validateMetricValue(name string, dims []types.Dimension) status.TestResult { + log.Printf("validateMetricValue with metric: %s", name) + testResult := status.TestResult{ + Name: name, + Status: status.FAILED, + } + valueFetcher := MetricValueFetcher{} + values, err := valueFetcher.Fetch(ContainerInsightsNamespace, name, dims, SAMPLE_COUNT, MinuteStatPeriod) + if err != nil { + log.Println("failed to fetch metrics", err) + return testResult + } + + if !IsAllValuesGreaterThanOrEqualToExpectedValue(name, values, 0) { + return testResult + } + + testResult.Status = status.SUCCESSFUL + return testResult +} + +func ValidateLogs(env *environment.MetaData) status.TestResult { + testResult := status.TestResult{ + Name: "emf-logs", + Status: status.FAILED, + } + + end := time.Now() + start := end.Add(time.Duration(-3) * time.Minute) + group := fmt.Sprintf("/aws/containerinsights/%s/performance", env.EKSClusterName) + + // need to get the instances used for the EKS cluster + eKSInstances, err := awsservice.GetEKSInstances(env.EKSClusterName) + if err != nil { + log.Println("failed to get EKS 
instances", err) + return testResult + } + + for _, instance := range eKSInstances { + stream := *instance.InstanceName + err = awsservice.ValidateLogs( + group, + stream, + &start, + &end, + awsservice.AssertLogsNotEmpty(), + //awsservice.AssertNoDuplicateLogs(), + awsservice.AssertPerLog( + awsservice.AssertLogSchema(func(message string) (string, error) { + var eksClusterType awsservice.EKSClusterType + innerErr := json.Unmarshal([]byte(message), &eksClusterType) + if innerErr != nil { + return "", fmt.Errorf("failed to unmarshal log file: %w", innerErr) + } + + //log.Printf("eksClusterType is: %s", eksClusterType.Type) + jsonSchema, ok := eks_resources.EksClusterValidationMap[eksClusterType.Type] + if !ok { + return "", errors.New("invalid cluster type provided") + } + return jsonSchema, nil + }), + awsservice.AssertLogContainsSubstring(fmt.Sprintf("\"ClusterName\":\"%s\"", env.EKSClusterName)), + ), + ) + + if err != nil { + log.Printf("log validation (%s/%s) failed: %v", group, stream, err) + return testResult + } + } + + testResult.Status = status.SUCCESSFUL + return testResult +} diff --git a/test/metric/metric_list_query.go b/test/metric/metric_list_query.go index 8a3804efe..afe07f6c1 100644 --- a/test/metric/metric_list_query.go +++ b/test/metric/metric_list_query.go @@ -17,10 +17,10 @@ import ( "github.com/aws/amazon-cloudwatch-agent-test/util/awsservice" ) -type MetricListFetcher struct { +type Fetcher struct { } -func (n *MetricListFetcher) Fetch(namespace, metricName string, dimensions []types.Dimension) ([]types.Metric, error) { +func (n *Fetcher) Fetch(namespace, metricName string, dimensions []types.Dimension) ([]types.Metric, error) { var dims []types.DimensionFilter for _, dim := range dimensions { dims = append(dims, types.DimensionFilter{ @@ -56,29 +56,3 @@ func (n *MetricListFetcher) Fetch(namespace, metricName string, dimensions []typ log.Printf("total number of metrics fetched: %v", len(metrics)) return metrics, nil } - -func (n *MetricListFetcher) FetchByDimension(namespace string, dimensions []types.Dimension) ([]types.Metric, error) { - var dims []types.DimensionFilter - for _, dim := range dimensions { - dims = append(dims, types.DimensionFilter{ - Name: dim.Name, - Value: dim.Value, - }) - } - - listMetricInput := cloudwatch.ListMetricsInput{ - Namespace: aws.String(namespace), - Dimensions: dims, - } - - log.Printf("Metric data input: namespace %v, dimensions %v", namespace, fmt.Sprint(&dims)) - - output, err := awsservice.CwmClient.ListMetrics(context.Background(), &listMetricInput) - if err != nil { - return nil, fmt.Errorf("Error getting metric data %v", err) - } - - log.Printf("Metrics fetched : %v", output.Metrics) - - return output.Metrics, nil -} diff --git a/test/metric/stat.go b/test/metric/stat.go index d633985d3..a6dc5d25e 100644 --- a/test/metric/stat.go +++ b/test/metric/stat.go @@ -13,6 +13,5 @@ const ( MAXUMUM Statistics = "Maxmimum" SUM Statistics = "Sum" HighResolutionStatPeriod = 10 - - MinuteStatPeriod = 60 + MinuteStatPeriod = 60 ) diff --git a/test/metric_value_benchmark/eks_daemonset_test.go b/test/metric_value_benchmark/eks_daemonset_test.go index 9572bd8bd..d7eb996f2 100644 --- a/test/metric_value_benchmark/eks_daemonset_test.go +++ b/test/metric_value_benchmark/eks_daemonset_test.go @@ -6,28 +6,22 @@ package metric_value_benchmark import ( - "encoding/json" - "errors" - "fmt" "log" - "math/rand" - "sort" - "strings" "time" - "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types" + 
"golang.org/x/exp/slices" "github.com/aws/amazon-cloudwatch-agent-test/environment" "github.com/aws/amazon-cloudwatch-agent-test/test/metric" + "github.com/aws/amazon-cloudwatch-agent-test/test/metric/dimension" "github.com/aws/amazon-cloudwatch-agent-test/test/metric_value_benchmark/eks_resources" "github.com/aws/amazon-cloudwatch-agent-test/test/status" "github.com/aws/amazon-cloudwatch-agent-test/test/test_runner" - "github.com/aws/amazon-cloudwatch-agent-test/util/awsservice" ) -const containerInsightsNamespace = "ContainerInsights" -const gpuMetricIndicator = "_gpu_" +// list of metrics with more dimensions e.g. PodName and Namespace +var metricsWithMoreDimensions = []string{"pod_number_of_container_restarts"} type EKSDaemonTestRunner struct { test_runner.BaseTestRunner @@ -37,151 +31,66 @@ type EKSDaemonTestRunner struct { func (e *EKSDaemonTestRunner) Validate() status.TestGroupResult { var testResults []status.TestResult - testResults = append(testResults, validateMetrics(e.env, gpuMetricIndicator, eks_resources.GetExpectedDimsToMetrics(e.env))...) - testResults = append(testResults, e.validateLogs(e.env)) + testResults = append(testResults, metric.ValidateMetrics(e.env, "", eks_resources.GetExpectedDimsToMetrics(e.env))...) + metrics := e.GetMeasuredMetrics() + for _, name := range metrics { + testResults = append(testResults, e.validateInstanceMetrics(name)) + } + testResults = append(testResults, metric.ValidateLogs(e.env)) return status.TestGroupResult{ Name: e.GetTestName(), TestResults: testResults, } } -const ( - dimDelimiter = "-" - ContainerInsightsNamespace = "ContainerInsights" -) - -type dimToMetrics struct { - // dim keys as string with dimDelimiter(-) eg. ClusterName-Namespace - dimStr string - // metric names to their dimensions with values. Dimension sets will be used for metric data validations - metrics map[string][][]types.Dimension -} - -func validateMetrics(env *environment.MetaData, metricFilter string, expectedDimsToMetrics map[string][]string) []status.TestResult { - var results []status.TestResult - dimsToMetrics := getMetricsInClusterDimension(env, metricFilter) - //loops through each dimension set and checks if they exit in the cluster(fails if it doesn't) - for dims, metrics := range expectedDimsToMetrics { - var actual map[string][][]types.Dimension - //looping through dtms until we find the dimension string equal to the one in the hard coded map - for _, dtm := range dimsToMetrics { - log.Printf("dtm: %s vs dims %s", dtm.dimStr, dims) //testing purposes - if dtm.dimStr == dims { - actual = dtm.metrics - break - } - } - //if there are no metrics for the dimension set, we fail the test - if len(actual) < 1 { - results = append(results, status.TestResult{ - Name: dims, - Status: status.FAILED, - }) - log.Printf("ValidateMetrics failed with missing dimension set: %s", dims) - // keep testing other dims or fail early? 
- continue - } - //verifies length of metrics for dimension set - results = append(results, validateMetricsAvailability(dims, metrics, actual)) - for _, m := range metrics { - // picking a random dimension set to test metric data so we don't have to test every dimension set - randIdx := rand.Intn(len(actual[m])) - //verifys values of metrics - results = append(results, validateMetricValue(m, actual[m][randIdx])) - } +func (e *EKSDaemonTestRunner) validateInstanceMetrics(name string) status.TestResult { + testResult := status.TestResult{ + Name: name, + Status: status.FAILED, } - return results -} -// Fetches all metrics in cluster -func getMetricsInClusterDimension(env *environment.MetaData, metricFilter string) []dimToMetrics { //map[string]map[string]interface{} { - listFetcher := metric.MetricListFetcher{} - log.Printf("Fetching by cluster dimension") - dims := []types.Dimension{ + dims, failed := e.DimensionFactory.GetDimensions([]dimension.Instruction{ { - Name: aws.String("ClusterName"), - Value: aws.String(env.EKSClusterName), + Key: "ClusterName", + Value: dimension.UnknownDimensionValue(), }, - } - metrics, err := listFetcher.Fetch(ContainerInsightsNamespace, "", dims) - if err != nil { - log.Println("failed to fetch metric list", err) - return nil - } - if len(metrics) < 1 { - log.Println("cloudwatch metric list is empty") - return nil + }) + if len(failed) > 0 { + log.Println("failed to get dimensions") + return testResult } - var results []dimToMetrics - for _, m := range metrics { - // filter by metric name filter(skip gpu validation) - if metricFilter != "" && strings.Contains(*m.MetricName, metricFilter) { - continue - } - var dims []string - for _, d := range m.Dimensions { - dims = append(dims, *d.Name) + // get list of metrics that has more dimensions for container insights + // this is to avoid adding more dimension provider for non-trivial dimensions e.g. PodName + listFetcher := metric.Fetcher{} + if slices.Contains(metricsWithMoreDimensions, name) { + metrics, err := listFetcher.Fetch(metric.ContainerInsightsNamespace, name, dims) + if err != nil { + log.Println("failed to fetch metric list", err) + return testResult } - sort.Sort(sort.StringSlice(dims)) //what's the point of sorting? 
- dimsKey := strings.Join(dims, dimDelimiter) - log.Printf("processing dims: %s", dimsKey) - var dtm dimToMetrics - for _, ele := range results { - if ele.dimStr == dimsKey { - dtm = ele - break - } - } - if dtm.dimStr == "" { - dtm = dimToMetrics{ - dimStr: dimsKey, - metrics: make(map[string][][]types.Dimension), - } - results = append(results, dtm) + if len(metrics) < 1 { + log.Println("metric list is empty") + return testResult } - dtm.metrics[*m.MetricName] = append(dtm.metrics[*m.MetricName], m.Dimensions) - } - return results -} - -// Check if all metrics from cluster matches hard coded map -func validateMetricsAvailability(dims string, expected []string, actual map[string][][]types.Dimension) status.TestResult { - testResult := status.TestResult{ - Name: dims, - Status: status.FAILED, - } - log.Printf("expected metrics: %d, actual metrics: %d", len(expected), len(actual)) - if compareMetrics(expected, actual) { - testResult.Status = status.SUCCESSFUL - } else { - log.Printf("validateMetricsAvailability failed for %s", dims) - } - return testResult -} -func compareMetrics(expected []string, actual map[string][][]types.Dimension) bool { - if len(expected) != len(actual) { - return false - } + // just verify 1 of returned metrics for values + for _, dim := range metrics[0].Dimensions { + // skip since it's provided by dimension provider + if *dim.Name == "ClusterName" { + continue + } - for _, key := range expected { - if _, ok := actual[key]; !ok { - return false + dims = append(dims, types.Dimension{ + Name: dim.Name, + Value: dim.Value, + }) } } - return true -} -func validateMetricValue(name string, dims []types.Dimension) status.TestResult { - log.Printf("validateMetricValue with metric: %s", name) - testResult := status.TestResult{ - Name: name, - Status: status.FAILED, - } valueFetcher := metric.MetricValueFetcher{} - values, err := valueFetcher.Fetch(containerInsightsNamespace, name, dims, metric.SAMPLE_COUNT, metric.MinuteStatPeriod) + values, err := valueFetcher.Fetch(metric.ContainerInsightsNamespace, name, dims, metric.AVERAGE, metric.HighResolutionStatPeriod) if err != nil { log.Println("failed to fetch metrics", err) return testResult @@ -195,60 +104,6 @@ func validateMetricValue(name string, dims []types.Dimension) status.TestResult return testResult } -func (e *EKSDaemonTestRunner) validateLogs(env *environment.MetaData) status.TestResult { - testResult := status.TestResult{ - Name: "emf-logs", - Status: status.FAILED, - } - - now := time.Now() - group := fmt.Sprintf("/aws/containerinsights/%s/performance", env.EKSClusterName) - - // need to get the instances used for the EKS cluster - eKSInstances, err := awsservice.GetEKSInstances(env.EKSClusterName) - if err != nil { - log.Println("failed to get EKS instances", err) - return testResult - } - - for _, instance := range eKSInstances { - stream := *instance.InstanceName - err = awsservice.ValidateLogs( - group, - stream, - nil, - &now, - awsservice.AssertLogsNotEmpty(), - awsservice.AssertNoDuplicateLogs(), - awsservice.AssertPerLog( - awsservice.AssertLogSchema(func(message string) (string, error) { - var eksClusterType awsservice.EKSClusterType - innerErr := json.Unmarshal([]byte(message), &eksClusterType) - if innerErr != nil { - return "", fmt.Errorf("failed to unmarshal log file: %w", innerErr) - } - - log.Printf("eksClusterType is: %s", eksClusterType.Type) - jsonSchema, ok := eks_resources.EksClusterValidationMap[eksClusterType.Type] - if !ok { - return "", errors.New("invalid cluster type provided") - } - 
return jsonSchema, nil - }), - awsservice.AssertLogContainsSubstring(fmt.Sprintf("\"ClusterName\":\"%s\"", env.EKSClusterName)), - ), - ) - - if err != nil { - log.Printf("log validation (%s/%s) failed: %v", group, stream, err) - return testResult - } - } - - testResult.Status = status.SUCCESSFUL - return testResult -} - func (e *EKSDaemonTestRunner) GetTestName() string { return "EKSContainerInstance" } diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/cluster_gpu.json b/test/metric_value_benchmark/eks_resources/test_schemas/cluster_gpu.json new file mode 100644 index 000000000..5b14e3fb1 --- /dev/null +++ b/test/metric_value_benchmark/eks_resources/test_schemas/cluster_gpu.json @@ -0,0 +1,21 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "structured log schema", + "description": "json schema for the cloudwatch agent k8s structured log", + "type": "object", + "properties": { + "CloudWatchMetrics": {}, + "ClusterName": {}, + "Timestamp": {}, + "Type": {}, + "Version": {}, + "cluster_gpu_total": {}, + "cluster_gpu_request": {}, + }, + "required": [ + "ClusterName", + "Type", + "Version", + "CloudWatchMetrics" + ] +} \ No newline at end of file diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/container_gpu.json b/test/metric_value_benchmark/eks_resources/test_schemas/container_gpu.json new file mode 100644 index 000000000..99c56e87f --- /dev/null +++ b/test/metric_value_benchmark/eks_resources/test_schemas/container_gpu.json @@ -0,0 +1,45 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "structured log schema", + "description": "json schema for the cloudwatch agent k8s structured log", + "type": "object", + "properties": { + "CloudWatchMetrics": {}, + "ClusterName": {}, + "ContainerName": {}, + "FullPodName": {}, + "GpuDevice": {}, + "Hostname": {}, + "InstanceId": {}, + "K8sPodName": {}, + "Namespace": {}, + "NodeName": {}, + "OTelLib": {}, + "PodName": {}, + "Timestamp": {}, + "Type": {}, + "UUID": {}, + "Version": {}, + "container_gpu_memory_total": {}, + "container_gpu_memory_used": {}, + "container_gpu_power_draw": {}, + "container_gpu_temperature": {}, + "container_gpu_utilization": {}, + "container_gpu_memory_utilization": {}, + "Service":{} + }, + "required": [ + "ClusterName", + "ContainerName", + "FullPodName", + "GpuDevice", + "InstanceId", + "Namespace", + "NodeName", + "PodName", + "Timestamp", + "Type", + "Version", + "CloudWatchMetrics" + ] +} \ No newline at end of file diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/node_gpu.json b/test/metric_value_benchmark/eks_resources/test_schemas/node_gpu.json new file mode 100644 index 000000000..85df0952b --- /dev/null +++ b/test/metric_value_benchmark/eks_resources/test_schemas/node_gpu.json @@ -0,0 +1,44 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "structured log schema", + "description": "json schema for the cloudwatch agent k8s structured log", + "type": "object", + "properties": { + "CloudWatchMetrics": {}, + "ClusterName": {}, + "ContainerName": {}, + "FullPodName": {}, + "GpuDevice": {}, + "Hostname": {}, + "InstanceId": {}, + "K8sPodName": {}, + "Namespace": {}, + "NodeName": {}, + "OTelLib": {}, + "PodName": {}, + "Timestamp": {}, + "Type": {}, + "UUID": {}, + "Version": {}, + "node_gpu_memory_total": {}, + "node_gpu_memory_used": {}, + "node_gpu_power_draw": {}, + "node_gpu_temperature": {}, + "node_gpu_utilization": {}, + "node_gpu_memory_utilization": {}, + "node_gpu_total": {}, + 
"node_gpu_request": {}, + "node_gpu_list": {}, + "Service":{} + }, + "required": [ + "ClusterName", + "GpuDevice", + "InstanceId", + "NodeName", + "Timestamp", + "Type", + "Version", + "CloudWatchMetrics" + ] +} \ No newline at end of file diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/pod_gpu.json b/test/metric_value_benchmark/eks_resources/test_schemas/pod_gpu.json new file mode 100644 index 000000000..4b532094f --- /dev/null +++ b/test/metric_value_benchmark/eks_resources/test_schemas/pod_gpu.json @@ -0,0 +1,47 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "structured log schema", + "description": "json schema for the cloudwatch agent k8s structured log", + "type": "object", + "properties": { + "CloudWatchMetrics": {}, + "ClusterName": {}, + "ContainerName": {}, + "FullPodName": {}, + "GpuDevice": {}, + "Hostname": {}, + "InstanceId": {}, + "K8sPodName": {}, + "Namespace": {}, + "NodeName": {}, + "OTelLib": {}, + "PodName": {}, + "Timestamp": {}, + "Type": {}, + "UUID": {}, + "Version": {}, + "pod_gpu_memory_total": {}, + "pod_gpu_memory_used": {}, + "pod_gpu_power_draw": {}, + "pod_gpu_temperature": {}, + "pod_gpu_utilization": {}, + "pod_gpu_memory_utilization": {}, + "pod_gpu_total": {}, + "pod_gpu_request": {}, + "pod_gpu_list": {}, + "Service":{} + }, + "required": [ + "ClusterName", + "FullPodName", + "GpuDevice", + "InstanceId", + "Namespace", + "NodeName", + "PodName", + "Timestamp", + "Type", + "Version", + "CloudWatchMetrics" + ] +} \ No newline at end of file diff --git a/test/metric_value_benchmark/eks_resources/util.go b/test/metric_value_benchmark/eks_resources/util.go index 96e832aac..d398b81e1 100644 --- a/test/metric_value_benchmark/eks_resources/util.go +++ b/test/metric_value_benchmark/eks_resources/util.go @@ -38,6 +38,14 @@ var ( eksPodSchema string //go:embed test_schemas/pod_net.json eksPodNetSchema string + //go:embed test_schemas/container_gpu.json + eksContainerGpuSchema string + //go:embed test_schemas/pod_gpu.json + eksPodGpuSchema string + //go:embed test_schemas/node_gpu.json + eksNodeGpuSchema string + //go:embed test_schemas/cluster_gpu.json + eksClusterGpuSchema string EksClusterValidationMap = map[string]string{ "Cluster": eksClusterSchema, @@ -54,6 +62,10 @@ var ( "NodeNet": eksNodeNetSchema, "Pod": eksPodSchema, "PodNet": eksPodNetSchema, + "ContainerGPU": eksContainerGpuSchema, + "PodGPU": eksPodGpuSchema, + "NodeGPU": eksNodeGpuSchema, + "ClusterGPU": eksClusterGpuSchema, } ) diff --git a/util/awsservice/cloudwatchmetrics.go b/util/awsservice/cloudwatchmetrics.go index c2fab7cac..59ef886b2 100644 --- a/util/awsservice/cloudwatchmetrics.go +++ b/util/awsservice/cloudwatchmetrics.go @@ -56,7 +56,7 @@ func ValidateMetric(metricName, namespace string, dimensionsFilter []types.Dimen return nil } -// ValidateMetrics takes the metric name, metric dimension and corresponding namespace that contains the metric +// ValidateMetricWithTest takes the metric name, metric dimension and corresponding namespace that contains the metric func ValidateMetricWithTest(t *testing.T, metricName, namespace string, dimensionsFilter []types.DimensionFilter, retries int, retryTime time.Duration) { var err error for i := 0; i < retries; i++ {