From c638bfa49b51b75d909ceb1fe301b39703e9b99e Mon Sep 17 00:00:00 2001
From: Hyunsoo Kim <884273+movence@users.noreply.github.com>
Date: Mon, 8 Apr 2024 13:56:15 -0400
Subject: [PATCH] add nvidia gpu integ test (#399)
---
environment/metadata.go | 5 +
generator/test_case_generator.go | 4 +
terraform/eks/daemon/gpu/main.tf | 719 ++++++++++++++++++
terraform/eks/daemon/gpu/providers.tf | 17 +
terraform/eks/daemon/gpu/variables.tf | 37 +
test/gpu/gpu_test.go | 79 ++
test/gpu/nvidia_test.go | 118 +++
test/gpu/resources/config.json | 16 +
test/gpu/resources/httpd-ssl.conf | 43 ++
test/gpu/resources/httpd.conf | 101 +++
test/metric/container_insights_util.go | 222 ++++++
test/metric/metric_list_query.go | 30 +-
test/metric/stat.go | 3 +-
.../eks_daemonset_test.go | 231 ++----
.../test_schemas/cluster_gpu.json | 21 +
.../test_schemas/container_gpu.json | 45 ++
.../eks_resources/test_schemas/node_gpu.json | 44 ++
.../eks_resources/test_schemas/pod_gpu.json | 47 ++
.../eks_resources/util.go | 12 +
util/awsservice/cloudwatchmetrics.go | 2 +-
20 files changed, 1577 insertions(+), 219 deletions(-)
create mode 100644 terraform/eks/daemon/gpu/main.tf
create mode 100644 terraform/eks/daemon/gpu/providers.tf
create mode 100644 terraform/eks/daemon/gpu/variables.tf
create mode 100644 test/gpu/gpu_test.go
create mode 100644 test/gpu/nvidia_test.go
create mode 100644 test/gpu/resources/config.json
create mode 100644 test/gpu/resources/httpd-ssl.conf
create mode 100644 test/gpu/resources/httpd.conf
create mode 100644 test/metric/container_insights_util.go
create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/cluster_gpu.json
create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/container_gpu.json
create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/node_gpu.json
create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/pod_gpu.json
diff --git a/environment/metadata.go b/environment/metadata.go
index c9c0162ab..7a0e1e965 100644
--- a/environment/metadata.go
+++ b/environment/metadata.go
@@ -43,6 +43,7 @@ type MetaData struct {
InstanceId string
InstancePlatform string
AgentStartCommand string
+ EksGpuType string
}
type MetaDataStrings struct {
@@ -65,6 +66,7 @@ type MetaDataStrings struct {
InstanceId string
InstancePlatform string
AgentStartCommand string
+ EksGpuType string
}
func registerComputeType(dataString *MetaDataStrings) {
@@ -94,6 +96,7 @@ func registerECSData(dataString *MetaDataStrings) {
func registerEKSData(d *MetaDataStrings) {
flag.StringVar(&(d.EKSClusterName), "eksClusterName", "", "EKS cluster name")
flag.StringVar(&(d.EksDeploymentStrategy), "eksDeploymentStrategy", "", "Daemon/Replica/Sidecar")
+ flag.StringVar(&(d.EksGpuType), "eksGpuType", "", "nvidia/inferentia")
}
func registerPluginTestsToExecute(dataString *MetaDataStrings) {
@@ -210,6 +213,7 @@ func fillEKSData(e *MetaData, data *MetaDataStrings) {
}
e.EKSClusterName = data.EKSClusterName
+ e.EksGpuType = data.EksGpuType
}
func RegisterEnvironmentMetaDataFlags() *MetaDataStrings {
registerComputeType(registeredMetaDataStrings)
@@ -250,6 +254,7 @@ func GetEnvironmentMetaData() *MetaData {
metaDataStorage.InstanceId = registeredMetaDataStrings.InstanceId
metaDataStorage.InstancePlatform = registeredMetaDataStrings.InstancePlatform
metaDataStorage.AgentStartCommand = registeredMetaDataStrings.AgentStartCommand
+ metaDataStorage.EksGpuType = registeredMetaDataStrings.EksGpuType
return metaDataStorage
}
diff --git a/generator/test_case_generator.go b/generator/test_case_generator.go
index e17b63e0d..056670d43 100644
--- a/generator/test_case_generator.go
+++ b/generator/test_case_generator.go
@@ -214,6 +214,10 @@ var testTypeToTestConfig = map[string][]testConfig{
targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
},
{testDir: "./test/fluent", terraformDir: "terraform/eks/daemon/fluent/windows/2022"},
+ {
+ testDir: "./test/gpu", terraformDir: "terraform/eks/daemon/gpu",
+ targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
+ },
},
"eks_deployment": {
{testDir: "./test/metric_value_benchmark"},
diff --git a/terraform/eks/daemon/gpu/main.tf b/terraform/eks/daemon/gpu/main.tf
new file mode 100644
index 000000000..7952ef45a
--- /dev/null
+++ b/terraform/eks/daemon/gpu/main.tf
@@ -0,0 +1,719 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+module "common" {
+ source = "../../../common"
+ cwagent_image_repo = var.cwagent_image_repo
+ cwagent_image_tag = var.cwagent_image_tag
+}
+
+module "basic_components" {
+ source = "../../../basic_components"
+
+ region = var.region
+}
+
+data "aws_eks_cluster_auth" "this" {
+ name = aws_eks_cluster.this.name
+}
+
+resource "aws_eks_cluster" "this" {
+ name = "cwagent-eks-integ-${module.common.testing_id}"
+ role_arn = module.basic_components.role_arn
+ version = var.k8s_version
+ enabled_cluster_log_types = [
+ "api",
+ "audit",
+ "authenticator",
+ "controllerManager",
+ "scheduler"
+ ]
+ vpc_config {
+ subnet_ids = module.basic_components.public_subnet_ids
+ security_group_ids = [module.basic_components.security_group]
+ }
+}
+
+# EKS Node Groups
+resource "aws_eks_node_group" "this" {
+ cluster_name = aws_eks_cluster.this.name
+ node_group_name = "cwagent-eks-integ-node"
+ node_role_arn = aws_iam_role.node_role.arn
+ subnet_ids = module.basic_components.public_subnet_ids
+
+ scaling_config {
+ desired_size = 1
+ max_size = 1
+ min_size = 1
+ }
+
+ ami_type = "AL2_x86_64"
+ capacity_type = "ON_DEMAND"
+ disk_size = 20
+ instance_types = ["t3.medium"]
+
+ depends_on = [
+ aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly,
+ aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy,
+ aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy,
+ aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy
+ ]
+}
+
+# EKS Node IAM Role
+resource "aws_iam_role" "node_role" {
+ name = "cwagent-eks-Worker-Role-${module.common.testing_id}"
+ assume_role_policy = jsonencode({
+ Version = "2012-10-17",
+ Statement = [
+ {
+ Effect = "Allow",
+ Principal = {
+ Service = "ec2.amazonaws.com"
+ },
+ Action = "sts:AssumeRole"
+ }
+ ]
+ })
+
+}
+
+resource "aws_iam_role_policy_attachment" "node_AmazonEKSWorkerNodePolicy" {
+ policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
+ role = aws_iam_role.node_role.name
+}
+
+resource "aws_iam_role_policy_attachment" "node_AmazonEKS_CNI_Policy" {
+ policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"
+ role = aws_iam_role.node_role.name
+}
+
+resource "aws_iam_role_policy_attachment" "node_AmazonEC2ContainerRegistryReadOnly" {
+ policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
+ role = aws_iam_role.node_role.name
+}
+
+resource "aws_iam_role_policy_attachment" "node_CloudWatchAgentServerPolicy" {
+ policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy"
+ role = aws_iam_role.node_role.name
+}
+
+# TODO: these security groups should be created once and then reused
+# EKS Cluster Security Group
+resource "aws_security_group" "eks_cluster_sg" {
+ name = "cwagent-eks-cluster-sg-${module.common.testing_id}"
+ description = "Cluster communication with worker nodes"
+ vpc_id = module.basic_components.vpc_id
+}
+
+resource "aws_security_group_rule" "cluster_inbound" {
+ description = "Allow worker nodes to communicate with the cluster API Server"
+ from_port = 443
+ protocol = "tcp"
+ security_group_id = aws_security_group.eks_cluster_sg.id
+ source_security_group_id = aws_security_group.eks_nodes_sg.id
+ to_port = 443
+ type = "ingress"
+}
+
+resource "aws_security_group_rule" "cluster_outbound" {
+ description = "Allow cluster API Server to communicate with the worker nodes"
+ from_port = 1024
+ protocol = "tcp"
+ security_group_id = aws_security_group.eks_cluster_sg.id
+ source_security_group_id = aws_security_group.eks_nodes_sg.id
+ to_port = 65535
+ type = "egress"
+}
+
+
+# EKS Node Security Group
+resource "aws_security_group" "eks_nodes_sg" {
+ name = "cwagent-eks-node-sg-${module.common.testing_id}"
+ description = "Security group for all nodes in the cluster"
+ vpc_id = module.basic_components.vpc_id
+
+ egress {
+ from_port = 0
+ to_port = 0
+ protocol = "-1"
+ cidr_blocks = ["0.0.0.0/0"]
+ }
+}
+
+resource "aws_security_group_rule" "nodes_internal" {
+ description = "Allow nodes to communicate with each other"
+ from_port = 0
+ protocol = "-1"
+ security_group_id = aws_security_group.eks_nodes_sg.id
+ source_security_group_id = aws_security_group.eks_nodes_sg.id
+ to_port = 65535
+ type = "ingress"
+}
+
+resource "aws_security_group_rule" "nodes_cluster_inbound" {
+ description = "Allow worker Kubelets and pods to receive communication from the cluster control plane"
+ from_port = 1025
+ protocol = "tcp"
+ security_group_id = aws_security_group.eks_nodes_sg.id
+ source_security_group_id = aws_security_group.eks_cluster_sg.id
+ to_port = 65535
+ type = "ingress"
+}
+
+
+# create cert for communication between agent and dcgm
+resource "tls_private_key" "private_key" {
+ algorithm = "RSA"
+}
+
+resource "local_file" "ca_key" {
+ content = tls_private_key.private_key.private_key_pem
+ filename = "${path.module}/certs/ca.key"
+}
+
+resource "tls_self_signed_cert" "ca_cert" {
+ private_key_pem = tls_private_key.private_key.private_key_pem
+ is_ca_certificate = true
+ subject {
+ common_name = "dcgm-exporter-service.amazon-cloudwatch.svc"
+ organization = "Amazon CloudWatch Agent"
+ }
+ validity_period_hours = 24
+ allowed_uses = [
+ "digital_signature",
+ "key_encipherment",
+ "cert_signing",
+ "crl_signing",
+ "server_auth",
+ "client_auth",
+ ]
+}
+
+resource "local_file" "ca_cert_file" {
+ content = tls_self_signed_cert.ca_cert.cert_pem
+ filename = "${path.module}/certs/ca.cert"
+}
+
+resource "tls_private_key" "server_private_key" {
+ algorithm = "RSA"
+}
+
+resource "local_file" "server_key" {
+ content = tls_private_key.server_private_key.private_key_pem
+ filename = "${path.module}/certs/server.key"
+}
+
+resource "tls_cert_request" "local_csr" {
+ private_key_pem = tls_private_key.server_private_key.private_key_pem
+ dns_names = ["localhost", "127.0.0.1", "dcgm-exporter-service.amazon-cloudwatch.svc"]
+ subject {
+ common_name = "dcgm-exporter-service.amazon-cloudwatch.svc"
+ organization = "Amazon CloudWatch Agent"
+ }
+}
+
+resource "tls_locally_signed_cert" "server_cert" {
+ cert_request_pem = tls_cert_request.local_csr.cert_request_pem
+ ca_private_key_pem = tls_private_key.private_key.private_key_pem
+ ca_cert_pem = tls_self_signed_cert.ca_cert.cert_pem
+ validity_period_hours = 12
+ allowed_uses = [
+ "digital_signature",
+ "key_encipherment",
+ "server_auth",
+ "client_auth",
+ ]
+}
+
+resource "local_file" "server_cert_file" {
+ content = tls_locally_signed_cert.server_cert.cert_pem
+ filename = "${path.module}/certs/server.cert"
+}
+
+resource "kubernetes_secret" "agent_cert" { # TLS material shared by the dummy exporter (server cert/key) and the agent (CA cert)
+  metadata {
+    name      = "amazon-cloudwatch-observability-agent-cert"
+    namespace = kubernetes_namespace.namespace.metadata[0].name # reference (not a literal) so the namespace is created before the secret
+  }
+  data = {
+    "ca.crt"  = tls_self_signed_cert.ca_cert.cert_pem              # self-signed CA certificate, PEM straight from the tls resource
+    "tls.crt" = tls_locally_signed_cert.server_cert.cert_pem       # server certificate signed by that CA
+    "tls.key" = tls_private_key.server_private_key.private_key_pem # private key matching the server certificate
+  }
+}
+
+
+resource "kubernetes_namespace" "namespace" {
+ metadata {
+ name = "amazon-cloudwatch"
+ }
+}
+
+# dummy daemonset that simulates dcgm-exporter assuming there is only 1 GPU available
+resource "kubernetes_daemonset" "exporter" {
+ depends_on = [
+ kubernetes_namespace.namespace,
+ kubernetes_service_account.cwagentservice,
+ aws_eks_node_group.this,
+ kubernetes_config_map.httpdconfig,
+ ]
+ metadata {
+ name = "dcgm-exporter"
+ namespace = "amazon-cloudwatch"
+ labels = {
+ k8s-app = "dcgm-exporter"
+ }
+ }
+ spec {
+ selector {
+ match_labels = {
+ "k8s-app" = "dcgm-exporter"
+ }
+ }
+ template {
+ metadata {
+ labels = {
+ "name" : "dcgm-exporter"
+ "k8s-app" : "dcgm-exporter"
+ }
+ }
+ spec {
+ node_selector = {
+ "kubernetes.io/os" : "linux"
+ }
+ container {
+ name = "dcgm-exporter"
+ image = "httpd:2.4-alpine"
+ resources {
+ limits = {
+ "cpu" : "50m",
+ "memory" : "50Mi"
+ }
+ requests = {
+ "cpu" : "50m",
+ "memory" : "50Mi"
+ }
+ }
+ port {
+ name = "metrics"
+ container_port = 9400
+ host_port = 9400
+ protocol = "TCP"
+ }
+ command = [
+ "/bin/sh",
+ "-c",
+ ]
+ args = [
+ "/bin/echo 'DCGM_FI_DEV_GPU_UTIL{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_FREE{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_USED{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_TOTAL{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_USED_PERCENT{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_GPU_TEMP{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_POWER_USAGE{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1' >> /usr/local/apache2/htdocs/metrics && sed -i -e \"s/hostname1/$HOST_NAME/g\" /usr/local/apache2/htdocs/metrics && httpd-foreground -k restart"
+ ]
+ volume_mount {
+ mount_path = "/etc/amazon-cloudwatch-observability-dcgm-cert"
+ name = "dcgmtls"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/usr/local/apache2/conf/httpd.conf"
+ sub_path = "httpd.conf"
+ name = "httpdconfig"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/usr/local/apache2/conf/extra/httpd-ssl.conf"
+ sub_path = "httpd-ssl.conf"
+ name = "httpdconfig"
+ read_only = true
+ }
+ env {
+ name = "HOST_IP"
+ value_from {
+ field_ref {
+ field_path = "status.hostIP"
+ }
+ }
+ }
+ env {
+ name = "HOST_NAME"
+ value_from {
+ field_ref {
+ field_path = "spec.nodeName"
+ }
+ }
+ }
+ env {
+ name = "K8S_NAMESPACE"
+ value_from {
+ field_ref {
+ field_path = "metadata.namespace"
+ }
+ }
+ }
+ }
+ volume {
+ name = "dcgmtls"
+ secret {
+ secret_name = "amazon-cloudwatch-observability-agent-cert"
+ items {
+ key = "tls.crt"
+ path = "server.crt"
+ }
+ items {
+ key = "tls.key"
+ path = "server.key"
+ }
+ }
+ }
+ volume {
+ name = "httpdconfig"
+ config_map {
+ name = "httpdconfig"
+ }
+ }
+ service_account_name = "cloudwatch-agent"
+ termination_grace_period_seconds = 60
+ }
+ }
+ }
+}
+
+resource "kubernetes_service" "exporter" {
+ depends_on = [
+ kubernetes_namespace.namespace,
+ kubernetes_service_account.cwagentservice,
+ aws_eks_node_group.this,
+ kubernetes_daemonset.exporter
+ ]
+ metadata {
+ name = "dcgm-exporter-service"
+ namespace = "amazon-cloudwatch"
+ labels = {
+ "k8s-app" : "dcgm-exporter-service"
+ }
+ annotations = {
+ "prometheus.io/scrape" : "true"
+ }
+ }
+ spec {
+ type = "ClusterIP"
+ selector = {
+ k8s-app = "dcgm-exporter"
+ }
+ port {
+ name = "metrics"
+ port = 9400
+ target_port = 9400
+ protocol = "TCP"
+ }
+ }
+}
+
+resource "kubernetes_daemonset" "service" {
+ depends_on = [
+ kubernetes_namespace.namespace,
+ kubernetes_service_account.cwagentservice,
+ aws_eks_node_group.this,
+ kubernetes_service.exporter
+ ]
+ metadata {
+ name = "cloudwatch-agent"
+ namespace = "amazon-cloudwatch"
+ }
+ spec {
+ selector {
+ match_labels = {
+ "name" : "cloudwatch-agent"
+ }
+ }
+ template {
+ metadata {
+ labels = {
+ "name" : "cloudwatch-agent"
+ }
+ }
+ spec {
+ node_selector = {
+ "kubernetes.io/os" : "linux"
+ }
+ container {
+ name = "cwagent"
+ image = "${var.cwagent_image_repo}:${var.cwagent_image_tag}"
+ image_pull_policy = "Always"
+ resources {
+ limits = {
+ "cpu" : "200m",
+ "memory" : "200Mi"
+ }
+ requests = {
+ "cpu" : "200m",
+ "memory" : "200Mi"
+ }
+ }
+ port {
+ container_port = 25888
+ host_port = 25888
+ protocol = "UDP"
+ }
+ env {
+ name = "HOST_IP"
+ value_from {
+ field_ref {
+ field_path = "status.hostIP"
+ }
+ }
+ }
+ env {
+ name = "HOST_NAME"
+ value_from {
+ field_ref {
+ field_path = "spec.nodeName"
+ }
+ }
+ }
+ env {
+ name = "K8S_NAMESPACE"
+ value_from {
+ field_ref {
+ field_path = "metadata.namespace"
+ }
+ }
+ }
+ volume_mount {
+ mount_path = "/etc/cwagentconfig"
+ name = "cwagentconfig"
+ }
+ volume_mount {
+ mount_path = "/rootfs"
+ name = "rootfs"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/var/run/docker.sock"
+ name = "dockersock"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/var/lib/docker"
+ name = "varlibdocker"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/run/containerd/containerd.sock"
+ name = "containerdsock"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/sys"
+ name = "sys"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/dev/disk"
+ name = "devdisk"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/etc/amazon-cloudwatch-observability-agent-cert"
+ name = "agenttls"
+ read_only = true
+ }
+ }
+ volume {
+ name = "cwagentconfig"
+ config_map {
+ name = "cwagentconfig"
+ }
+ }
+ volume {
+ name = "rootfs"
+ host_path {
+ path = "/"
+ }
+ }
+ volume {
+ name = "dockersock"
+ host_path {
+ path = "/var/run/docker.sock"
+ }
+ }
+ volume {
+ name = "varlibdocker"
+ host_path {
+ path = "/var/lib/docker"
+ }
+ }
+ volume {
+ name = "containerdsock"
+ host_path {
+ path = "/run/containerd/containerd.sock"
+ }
+ }
+ volume {
+ name = "sys"
+ host_path {
+ path = "/sys"
+ }
+ }
+ volume {
+ name = "devdisk"
+ host_path {
+ path = "/dev/disk"
+ }
+ }
+ volume {
+ name = "agenttls"
+ secret {
+ secret_name = "amazon-cloudwatch-observability-agent-cert"
+ items {
+ key = "ca.crt"
+ path = "tls-ca.crt"
+ }
+ }
+ }
+ service_account_name = "cloudwatch-agent"
+ termination_grace_period_seconds = 60
+ }
+ }
+ }
+}
+
+##########################################
+# Template Files
+##########################################
+locals {
+ httpd_config = "../../../../${var.test_dir}/resources/httpd.conf"
+ httpd_ssl_config = "../../../../${var.test_dir}/resources/httpd-ssl.conf"
+ cwagent_config = fileexists("../../../../${var.test_dir}/resources/config.json") ? "../../../../${var.test_dir}/resources/config.json" : "../default_resources/default_amazon_cloudwatch_agent.json"
+}
+
+data "template_file" "cwagent_config" {
+ template = file(local.cwagent_config)
+ vars = {
+ }
+}
+
+resource "kubernetes_config_map" "cwagentconfig" {
+ depends_on = [
+ kubernetes_namespace.namespace,
+ kubernetes_service_account.cwagentservice
+ ]
+ metadata {
+ name = "cwagentconfig"
+ namespace = "amazon-cloudwatch"
+ }
+ data = {
+ "cwagentconfig.json" : data.template_file.cwagent_config.rendered
+ }
+}
+
+data "template_file" "httpd_config" {
+ template = file(local.httpd_config)
+ vars = {}
+}
+data "template_file" "httpd_ssl_config" {
+ template = file(local.httpd_ssl_config)
+ vars = {}
+}
+
+resource "kubernetes_config_map" "httpdconfig" {
+ depends_on = [
+ kubernetes_namespace.namespace,
+ kubernetes_service_account.cwagentservice
+ ]
+ metadata {
+ name = "httpdconfig"
+ namespace = "amazon-cloudwatch"
+ }
+ data = {
+ "httpd.conf" : data.template_file.httpd_config.rendered
+ "httpd-ssl.conf" : data.template_file.httpd_ssl_config.rendered
+ }
+}
+
+resource "kubernetes_service_account" "cwagentservice" {
+ depends_on = [kubernetes_namespace.namespace]
+ metadata {
+ name = "cloudwatch-agent"
+ namespace = "amazon-cloudwatch"
+ }
+}
+
+resource "kubernetes_cluster_role" "clusterrole" {
+ depends_on = [kubernetes_namespace.namespace]
+ metadata {
+ name = "cloudwatch-agent-role"
+ }
+ rule {
+ verbs = ["get", "list", "watch"]
+ resources = ["pods", "pods/logs", "nodes", "nodes/proxy", "namespaces", "endpoints"]
+ api_groups = [""]
+ }
+ rule {
+ verbs = ["list", "watch"]
+ resources = ["replicasets"]
+ api_groups = ["apps"]
+ }
+ rule {
+ verbs = ["list", "watch"]
+ resources = ["jobs"]
+ api_groups = ["batch"]
+ }
+ rule {
+ verbs = ["get"]
+ resources = ["nodes/proxy"]
+ api_groups = [""]
+ }
+ rule {
+ verbs = ["create"]
+ resources = ["nodes/stats", "configmaps", "events"]
+ api_groups = [""]
+ }
+ rule {
+ verbs = ["get", "update"]
+ resource_names = ["cwagent-clusterleader"]
+ resources = ["configmaps"]
+ api_groups = [""]
+ }
+ rule {
+ verbs = ["list", "watch"]
+ resources = ["services"]
+ api_groups = [""]
+ }
+ rule {
+ non_resource_urls = ["/metrics"]
+ verbs = ["get", "list", "watch"]
+ }
+}
+
+resource "kubernetes_cluster_role_binding" "rolebinding" { # binds the agent's service account to the agent cluster role
+  depends_on = [kubernetes_namespace.namespace]
+  metadata {
+    name = "cloudwatch-agent-role-binding"
+  }
+  role_ref {
+    api_group = "rbac.authorization.k8s.io"
+    kind      = "ClusterRole"
+    name      = kubernetes_cluster_role.clusterrole.metadata[0].name # reference, not a literal, so the role exists before the binding
+  }
+  subject {
+    kind      = "ServiceAccount"
+    name      = kubernetes_service_account.cwagentservice.metadata[0].name
+    namespace = kubernetes_service_account.cwagentservice.metadata[0].namespace
+  }
+}
+
+resource "null_resource" "validator" { # runs the Go GPU test suite from the repository root once the agent stack is deployed
+  depends_on = [
+    aws_eks_node_group.this,
+    kubernetes_daemonset.service,
+    kubernetes_cluster_role_binding.rolebinding,
+    kubernetes_service_account.cwagentservice,
+  ]
+  provisioner "local-exec" {
+    command = <<-EOT
+      echo "Validating EKS metrics/logs for GPU"
+      cd ../../../..
+      go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia
+    EOT
+  }
+}
diff --git a/terraform/eks/daemon/gpu/providers.tf b/terraform/eks/daemon/gpu/providers.tf
new file mode 100644
index 000000000..9bd2885f5
--- /dev/null
+++ b/terraform/eks/daemon/gpu/providers.tf
@@ -0,0 +1,17 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+provider "aws" {
+ region = var.region
+}
+
+provider "kubernetes" {
+ exec {
+ api_version = "client.authentication.k8s.io/v1beta1"
+ command = "aws"
+ args = ["eks", "get-token", "--cluster-name", aws_eks_cluster.this.name]
+ }
+ host = aws_eks_cluster.this.endpoint
+ cluster_ca_certificate = base64decode(aws_eks_cluster.this.certificate_authority.0.data)
+ token = data.aws_eks_cluster_auth.this.token
+}
\ No newline at end of file
diff --git a/terraform/eks/daemon/gpu/variables.tf b/terraform/eks/daemon/gpu/variables.tf
new file mode 100644
index 000000000..26a0e6cd0
--- /dev/null
+++ b/terraform/eks/daemon/gpu/variables.tf
@@ -0,0 +1,37 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+variable "region" {
+ type = string
+ default = "us-west-2"
+}
+
+variable "test_dir" {
+ type = string
+ default = "./test/gpu"
+}
+
+variable "cwagent_image_repo" {
+ type = string
+ default = "public.ecr.aws/cloudwatch-agent/cloudwatch-agent"
+}
+
+variable "cwagent_image_tag" {
+ type = string
+ default = "latest"
+}
+
+variable "k8s_version" {
+ type = string
+ default = "1.28"
+}
+
+variable "ami_type" { # NOTE(review): not referenced by this module's main.tf; the node group hard-codes ami_type = "AL2_x86_64"
+ type = string
+ default = "AL2_x86_64"
+}
+
+variable "instance_type" { # NOTE(review): not referenced by this module's main.tf; the node group hard-codes instance_types = ["t3.medium"]
+ type = string
+ default = "g4dn.xlarge"
+}
\ No newline at end of file
diff --git a/test/gpu/gpu_test.go b/test/gpu/gpu_test.go
new file mode 100644
index 000000000..352f9cad1
--- /dev/null
+++ b/test/gpu/gpu_test.go
@@ -0,0 +1,79 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+//go:build !windows
+
+package emf
+
+import (
+ "fmt"
+ "log"
+ "testing"
+
+ "github.com/stretchr/testify/suite"
+
+ "github.com/aws/amazon-cloudwatch-agent-test/environment"
+ "github.com/aws/amazon-cloudwatch-agent-test/environment/computetype"
+ "github.com/aws/amazon-cloudwatch-agent-test/test/metric/dimension"
+ "github.com/aws/amazon-cloudwatch-agent-test/test/status"
+ "github.com/aws/amazon-cloudwatch-agent-test/test/test_runner"
+)
+
+type GPUTestSuite struct { // GPUTestSuite is the testify suite for the GPU Container Insights integration test.
+ suite.Suite // embedded testify suite
+ test_runner.TestSuite // embedded shared runner suite; presumably the source of the Result field used by the methods — TODO confirm
+}
+
+// SetupSuite prints a banner marking the start of the GPU Container Insights
+// suite; no other setup is performed here.
+func (suite *GPUTestSuite) SetupSuite() { fmt.Println(">>>> Starting GPU Container Insights TestSuite") }
+
+func (suite *GPUTestSuite) TearDownSuite() { // TearDownSuite prints the accumulated suite result, then a closing banner.
+ suite.Result.Print()
+ fmt.Println(">>>> Finished GPU Container Insights TestSuite")
+}
+
+// init registers the environment metadata command-line flags at package load,
+// i.e. before the test binary parses its arguments.
+func init() { environment.RegisterEnvironmentMetaDataFlags() }
+
+var (
+ eksTestRunners []*test_runner.EKSTestRunner
+)
+
+// getEksTestRunners returns the EKS runners, building the single NVIDIA runner from env on first use and reusing it afterwards.
+func getEksTestRunners(env *environment.MetaData) []*test_runner.EKSTestRunner {
+	if eksTestRunners != nil {
+		return eksTestRunners
+	}
+	nvidia := &NvidiaTestRunner{
+		BaseTestRunner: test_runner.BaseTestRunner{DimensionFactory: dimension.GetDimensionFactory(*env)},
+		testName:       "EKS_GPU_NVIDIA",
+		env:            env,
+	}
+	eksTestRunners = []*test_runner.EKSTestRunner{{Runner: nvidia, Env: *env}}
+	return eksTestRunners
+}
+
+// TestAllInSuite runs every EKS GPU runner and asserts the suite result is successful; other compute types return without asserting.
+func (suite *GPUTestSuite) TestAllInSuite() {
+	metadata := environment.GetEnvironmentMetaData()
+	if metadata.ComputeType != computetype.EKS {
+		return // nothing to run or assert outside EKS
+	}
+
+	log.Println("Environment compute type is EKS")
+	for _, runner := range getEksTestRunners(metadata) {
+		runner.Run(suite, metadata)
+	}
+
+	suite.Assert().Equal(status.SUCCESSFUL, suite.Result.GetStatus(), "GPU Container Test Suite Failed")
+}
+
+func (suite *GPUTestSuite) AddToSuiteResult(r status.TestGroupResult) { // AddToSuiteResult appends r to the suite's collected group results.
+ suite.Result.TestGroupResults = append(suite.Result.TestGroupResults, r)
+}
+
+// TestGPUSuite is the `go test` entry point; it hands a fresh GPUTestSuite to
+// testify's suite runner.
+func TestGPUSuite(t *testing.T) { suite.Run(t, new(GPUTestSuite)) }
diff --git a/test/gpu/nvidia_test.go b/test/gpu/nvidia_test.go
new file mode 100644
index 000000000..ced990b36
--- /dev/null
+++ b/test/gpu/nvidia_test.go
@@ -0,0 +1,118 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+//go:build !windows
+
+package emf
+
+import (
+ "time"
+
+ "github.com/aws/amazon-cloudwatch-agent-test/environment"
+ "github.com/aws/amazon-cloudwatch-agent-test/test/metric"
+ "github.com/aws/amazon-cloudwatch-agent-test/test/status"
+ "github.com/aws/amazon-cloudwatch-agent-test/test/test_runner"
+)
+
+const ( // metric-name constants used by the NVIDIA GPU validation
+ gpuMetricIndicator = "_gpu_" // passed to metric.ValidateMetrics by Validate; presumably selects GPU metrics by name — TODO confirm
+
+ containerMemTotal = "container_gpu_memory_total" // container-level GPU metric names
+ containerMemUsed = "container_gpu_memory_used"
+ containerPower = "container_gpu_power_draw"
+ containerTemp = "container_gpu_temperature"
+ containerUtil = "container_gpu_utilization"
+ containerMemUtil = "container_gpu_memory_utilization"
+ podMemTotal = "pod_gpu_memory_total" // pod-level GPU metric names
+ podMemUsed = "pod_gpu_memory_used"
+ podPower = "pod_gpu_power_draw"
+ podTemp = "pod_gpu_temperature"
+ podUtil = "pod_gpu_utilization"
+ podMemUtil = "pod_gpu_memory_utilization"
+ nodeMemTotal = "node_gpu_memory_total" // node-level GPU metric names
+ nodeMemUsed = "node_gpu_memory_used"
+ nodePower = "node_gpu_power_draw"
+ nodeTemp = "node_gpu_temperature"
+ nodeUtil = "node_gpu_utilization"
+ nodeMemUtil = "node_gpu_memory_utilization"
+ nodeCountTotal = "node_gpu_total" // NOTE(review): this and the four count names below are referenced only from commented-out entries in expectedDimsToMetrics
+ nodeCountRequest = "node_gpu_request"
+ nodeCountLimit = "node_gpu_limit"
+ clusterCountTotal = "cluster_gpu_total"
+ clusterCountRequest = "cluster_gpu_request"
+)
+
+var expectedDimsToMetrics = map[string][]string{ // dimension-set key -> metric names expected under it; keys look like dimension names joined with "-"
+ "ClusterName": {
+ containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
+ podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
+ nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil,
+ //nodeCountTotal, nodeCountRequest, nodeCountLimit,
+ //clusterCountTotal, clusterCountRequest,
+ },
+ "ClusterName-Namespace": {
+ podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
+ },
+ //"ClusterName-Namespace-Service": {
+ // podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
+ //},
+ "ClusterName-Namespace-PodName": {
+ podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
+ },
+ "ClusterName-ContainerName-Namespace-PodName": {
+ containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
+ },
+ "ClusterName-ContainerName-FullPodName-Namespace-PodName": {
+ containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
+ },
+ "ClusterName-ContainerName-FullPodName-GpuDevice-Namespace-PodName": {
+ containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
+ },
+ "ClusterName-FullPodName-Namespace-PodName": {
+ podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
+ },
+ "ClusterName-FullPodName-GpuDevice-Namespace-PodName": {
+ podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
+ },
+ "ClusterName-InstanceId-NodeName": {
+ nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil,
+ //nodeCountTotal, nodeCountRequest, nodeCountLimit,
+ },
+ "ClusterName-GpuDevice-InstanceId-InstanceType-NodeName": {
+ nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil,
+ },
+}
+
+type NvidiaTestRunner struct { // NvidiaTestRunner validates the GPU metrics and logs for the NVIDIA test case.
+ test_runner.BaseTestRunner // embedded shared runner state (constructed with a DimensionFactory in gpu_test.go)
+ testName string // value returned by GetTestName
+ env *environment.MetaData // environment passed to the metric/log validators in Validate
+}
+
+var _ test_runner.ITestRunner = (*NvidiaTestRunner)(nil) // compile-time check that *NvidiaTestRunner satisfies test_runner.ITestRunner
+
+// Validate runs the GPU metric checks followed by the log check and returns
+// every outcome grouped under this runner's test name.
+func (t *NvidiaTestRunner) Validate() status.TestGroupResult {
+	var results []status.TestResult
+	results = append(results, metric.ValidateMetrics(t.env, gpuMetricIndicator, expectedDimsToMetrics)...)
+	results = append(results, metric.ValidateLogs(t.env))
+
+	return status.TestGroupResult{Name: t.GetTestName(), TestResults: results}
+}
+
+// GetTestName returns the name this runner was constructed with; Validate
+// uses it to label the result group.
+func (t *NvidiaTestRunner) GetTestName() string { return t.testName }
+
+// GetAgentConfigFileName always returns the empty string: this runner names
+// no agent config file of its own.
+func (t *NvidiaTestRunner) GetAgentConfigFileName() string { return "" }
+
+// GetAgentRunDuration reports a fixed three-minute duration for this runner.
+// NOTE(review): how the framework consumes the value is not visible here.
+func (t *NvidiaTestRunner) GetAgentRunDuration() time.Duration { return 3 * time.Minute }
+
+// GetMeasuredMetrics returns nil; the metric names this runner checks are
+// listed in expectedDimsToMetrics instead.
+func (t *NvidiaTestRunner) GetMeasuredMetrics() []string { return nil }
diff --git a/test/gpu/resources/config.json b/test/gpu/resources/config.json
new file mode 100644
index 000000000..6f37e43ed
--- /dev/null
+++ b/test/gpu/resources/config.json
@@ -0,0 +1,16 @@
+{
+ "agent": {
+ "metrics_collection_interval": 15,
+ "run_as_user": "root",
+ "debug": true,
+ "logfile": ""
+ },
+ "logs": {
+ "metrics_collected": {
+ "kubernetes": {
+ "enhanced_container_insights": true
+ }
+ },
+ "force_flush_interval": 5
+ }
+}
\ No newline at end of file
diff --git a/test/gpu/resources/httpd-ssl.conf b/test/gpu/resources/httpd-ssl.conf
new file mode 100644
index 000000000..8e441a2cd
--- /dev/null
+++ b/test/gpu/resources/httpd-ssl.conf
@@ -0,0 +1,43 @@
+Listen 9400
+
+SSLCipherSuite HIGH:MEDIUM:!MD5:!RC4:!3DES
+SSLProxyCipherSuite HIGH:MEDIUM:!MD5:!RC4:!3DES
+
+SSLHonorCipherOrder on
+
+SSLProtocol all -SSLv3
+SSLProxyProtocol all -SSLv3
+
+SSLPassPhraseDialog builtin
+
+SSLSessionCache "shmcb:/usr/local/apache2/logs/ssl_scache(512000)"
+SSLSessionCacheTimeout 300
+
+
+
+
+DocumentRoot "/usr/local/apache2/htdocs"
+ServerName dcgm-exporter-service.amazon-cloudwatch.svc:9400
+ServerAdmin you@example.com
+ErrorLog /proc/self/fd/2
+TransferLog /proc/self/fd/1
+
+SSLEngine on
+SSLCertificateFile "/etc/amazon-cloudwatch-observability-dcgm-cert/server.crt"
+SSLCertificateKeyFile "/etc/amazon-cloudwatch-observability-dcgm-cert/server.key"
+
+
+ SSLOptions +StdEnvVars
+
+
+ SSLOptions +StdEnvVars
+
+
+BrowserMatch "MSIE [2-5]" \
+ nokeepalive ssl-unclean-shutdown \
+ downgrade-1.0 force-response-1.0
+
+CustomLog /proc/self/fd/1 \
+ "%t %h %%{SSL_PROTOCOL}x %%{SSL_CIPHER}x \"%r\" %b"
+
+
\ No newline at end of file
diff --git a/test/gpu/resources/httpd.conf b/test/gpu/resources/httpd.conf
new file mode 100644
index 000000000..058db5063
--- /dev/null
+++ b/test/gpu/resources/httpd.conf
@@ -0,0 +1,101 @@
+
+ServerRoot "/usr/local/apache2"
+
+#Listen 9400
+
+LoadModule mpm_event_module modules/mod_mpm_event.so
+LoadModule authn_file_module modules/mod_authn_file.so
+LoadModule authn_core_module modules/mod_authn_core.so
+LoadModule authz_host_module modules/mod_authz_host.so
+LoadModule authz_groupfile_module modules/mod_authz_groupfile.so
+LoadModule authz_user_module modules/mod_authz_user.so
+LoadModule authz_core_module modules/mod_authz_core.so
+LoadModule access_compat_module modules/mod_access_compat.so
+LoadModule auth_basic_module modules/mod_auth_basic.so
+LoadModule socache_shmcb_module modules/mod_socache_shmcb.so
+LoadModule reqtimeout_module modules/mod_reqtimeout.so
+LoadModule filter_module modules/mod_filter.so
+LoadModule mime_module modules/mod_mime.so
+LoadModule log_config_module modules/mod_log_config.so
+LoadModule env_module modules/mod_env.so
+LoadModule headers_module modules/mod_headers.so
+LoadModule setenvif_module modules/mod_setenvif.so
+LoadModule version_module modules/mod_version.so
+LoadModule ssl_module modules/mod_ssl.so
+LoadModule unixd_module modules/mod_unixd.so
+LoadModule status_module modules/mod_status.so
+LoadModule autoindex_module modules/mod_autoindex.so
+LoadModule dir_module modules/mod_dir.so
+LoadModule alias_module modules/mod_alias.so
+
+
+User www-data
+Group www-data
+
+
+
+ AllowOverride none
+ Require all denied
+
+
+DocumentRoot "/usr/local/apache2/htdocs"
+
+ Options Indexes FollowSymLinks
+ AllowOverride None
+ Require all granted
+
+
+
+ DirectoryIndex index.html
+
+
+
+ Require all denied
+
+
+ErrorLog /proc/self/fd/2
+
+LogLevel warn
+
+
+ LogFormat "%h %l %u %t \"%r\" %>s %b \"%%{Referer}i\" \"%%{User-Agent}i\"" combined
+ LogFormat "%h %l %u %t \"%r\" %>s %b" common
+
+
+ # You need to enable mod_logio.c to use %I and %O
+ LogFormat "%h %l %u %t \"%r\" %>s %b \"%%{Referer}i\" \"%%{User-Agent}i\" %I %O" combinedio
+
+
+ CustomLog /proc/self/fd/1 common
+
+
+
+ ScriptAlias /cgi-bin/ "/usr/local/apache2/cgi-bin/"
+
+
+
+ AllowOverride None
+ Options None
+ Require all granted
+
+
+
+ RequestHeader unset Proxy early
+
+
+
+ TypesConfig conf/mime.types
+ AddType application/x-compress .Z
+ AddType application/x-gzip .gz .tgz
+
+
+
+Include conf/extra/proxy-html.conf
+
+
+# Secure (SSL/TLS) connections
+Include conf/extra/httpd-ssl.conf
+
+SSLRandomSeed startup builtin
+SSLRandomSeed connect builtin
+
\ No newline at end of file
diff --git a/test/metric/container_insights_util.go b/test/metric/container_insights_util.go
new file mode 100644
index 000000000..140c839c1
--- /dev/null
+++ b/test/metric/container_insights_util.go
@@ -0,0 +1,222 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+//go:build !windows
+
+package metric
+
+import (
+ "encoding/json"
+ "errors"
+ "fmt"
+ "log"
+ "math/rand"
+ "sort"
+ "strings"
+ "time"
+
+ "github.com/aws/aws-sdk-go-v2/aws"
+ "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types"
+
+ "github.com/aws/amazon-cloudwatch-agent-test/environment"
+ "github.com/aws/amazon-cloudwatch-agent-test/test/metric_value_benchmark/eks_resources"
+ "github.com/aws/amazon-cloudwatch-agent-test/test/status"
+ "github.com/aws/amazon-cloudwatch-agent-test/util/awsservice"
+)
+
+const (
+ // dimDelimiter joins sorted dimension names into a single key, e.g. ClusterName-Namespace.
+ dimDelimiter = "-"
+ // ContainerInsightsNamespace is the metric namespace passed to the list and value fetchers in this file.
+ ContainerInsightsNamespace = "ContainerInsights"
+)
+
+// dimToMetrics groups the listed metrics that share one set of dimension names.
+type dimToMetrics struct {
+ // dimStr is the sorted dimension names joined with dimDelimiter(-), e.g. ClusterName-Namespace.
+ dimStr string
+ // metrics maps a metric name to every dimension set (names with values) listed for it.
+ // The dimension sets are used later to query metric data for validation.
+ metrics map[string][][]types.Dimension
+}
+
+// ValidateMetrics checks the metrics listed for the cluster against
+// expectedDimsToMetrics and returns one TestResult per check performed.
+//
+// metricFilter, when non-empty, restricts the listing to metric names that
+// contain it. expectedDimsToMetrics maps a dimension key (sorted dimension
+// names joined with dimDelimiter) to the metric names expected under that key.
+//
+// For every expected dimension key the function appends:
+//   - a FAILED result named after the key when no metrics were listed for it;
+//   - otherwise the availability result for the key, followed by one result
+//     per expected metric: a value check on a randomly chosen dimension set,
+//     or FAILED when the metric was not listed at all.
+func ValidateMetrics(env *environment.MetaData, metricFilter string, expectedDimsToMetrics map[string][]string) []status.TestResult {
+	var results []status.TestResult
+	dimsToMetrics := getMetricsInClusterDimension(env, metricFilter)
+	for dims, metrics := range expectedDimsToMetrics {
+		var actual map[string][][]types.Dimension
+		for _, dtm := range dimsToMetrics {
+			if dtm.dimStr == dims {
+				actual = dtm.metrics
+				break
+			}
+		}
+		if len(actual) < 1 {
+			results = append(results, status.TestResult{
+				Name:   dims,
+				Status: status.FAILED,
+			})
+			log.Printf("ValidateMetrics failed with missing dimension set: %s", dims)
+			// Keep going so that every missing dimension set is reported in one run.
+			continue
+		}
+		results = append(results, validateMetricsAvailability(dims, metrics, actual))
+		for _, m := range metrics {
+			dimSets := actual[m]
+			if len(dimSets) == 0 {
+				// An expected metric that was not listed has no dimension sets.
+				// rand.Intn(0) panics, so record the failure instead of sampling.
+				log.Printf("ValidateMetrics failed with missing metric %s for dimension set: %s", m, dims)
+				results = append(results, status.TestResult{
+					Name:   m,
+					Status: status.FAILED,
+				})
+				continue
+			}
+			// Validate values for one randomly chosen dimension set; checking all of them might be overkill.
+			results = append(results, validateMetricValue(m, dimSets[rand.Intn(len(dimSets))]))
+		}
+	}
+	return results
+}
+
+// getMetricsInClusterDimension lists the metrics in ContainerInsightsNamespace
+// that carry a ClusterName dimension equal to env.EKSClusterName and groups
+// them by their set of dimension names.
+//
+// When metricFilter is non-empty, only metrics whose name contains it are
+// kept. Each returned element holds one dimension key (sorted dimension names
+// joined with dimDelimiter) and, per metric name, every dimension set listed
+// for it. Groups appear in the order their key was first seen. It returns nil
+// when the listing fails or is empty.
+func getMetricsInClusterDimension(env *environment.MetaData, metricFilter string) []dimToMetrics {
+	fetcher := Fetcher{}
+	log.Printf("Fetching by cluster dimension")
+	clusterDim := types.Dimension{
+		Name:  aws.String("ClusterName"),
+		Value: aws.String(env.EKSClusterName),
+	}
+	listed, err := fetcher.Fetch(ContainerInsightsNamespace, "", []types.Dimension{clusterDim})
+	if err != nil {
+		log.Println("failed to fetch metric list", err)
+		return nil
+	}
+	if len(listed) < 1 {
+		log.Println("cloudwatch metric list is empty")
+		return nil
+	}
+
+	var grouped []dimToMetrics
+	for _, m := range listed {
+		// Skip metrics whose name does not contain the requested filter.
+		if metricFilter != "" && !strings.Contains(*m.MetricName, metricFilter) {
+			continue
+		}
+
+		// Sorting makes the key independent of the order dimensions are listed in.
+		names := make([]string, 0, len(m.Dimensions))
+		for i := range m.Dimensions {
+			names = append(names, *m.Dimensions[i].Name)
+		}
+		sort.Strings(names)
+		key := strings.Join(names, dimDelimiter)
+		log.Printf("processing dims: %s", key)
+
+		// Reuse the group already created for this key, if any. The map held by
+		// a dimToMetrics is shared between copies, so writing through the local
+		// copy also updates the element stored in grouped.
+		var group dimToMetrics
+		for i := range grouped {
+			if grouped[i].dimStr == key {
+				group = grouped[i]
+				break
+			}
+		}
+		if group.dimStr == "" {
+			group = dimToMetrics{
+				dimStr:  key,
+				metrics: map[string][][]types.Dimension{},
+			}
+			grouped = append(grouped, group)
+		}
+		group.metrics[*m.MetricName] = append(group.metrics[*m.MetricName], m.Dimensions)
+	}
+	return grouped
+}
+
+// validateMetricsAvailability reports whether the metrics listed under the
+// dimension key dims match the expected metric names. The result is named
+// after dims and is SUCCESSFUL only when compareMetrics accepts the two sets.
+func validateMetricsAvailability(dims string, expected []string, actual map[string][][]types.Dimension) status.TestResult {
+	log.Printf("expected metrics: %d, actual metrics: %d", len(expected), len(actual))
+	if !compareMetrics(expected, actual) {
+		log.Printf("validateMetricsAvailability failed for %s", dims)
+		return status.TestResult{Name: dims, Status: status.FAILED}
+	}
+	return status.TestResult{Name: dims, Status: status.SUCCESSFUL}
+}
+
+// compareMetrics reports whether actual holds exactly as many metric names as
+// expected and contains every name in expected.
+func compareMetrics(expected []string, actual map[string][][]types.Dimension) bool {
+	// Start from the size check; the loop only runs while everything matched so far.
+	matched := len(expected) == len(actual)
+	for i := 0; matched && i < len(expected); i++ {
+		_, matched = actual[expected[i]]
+	}
+	return matched
+}
+
+// validateMetricValue fetches the SAMPLE_COUNT statistic of the named metric
+// in ContainerInsightsNamespace for dims at MinuteStatPeriod and returns a
+// result named after the metric. The result is SUCCESSFUL only when the fetch
+// succeeds and IsAllValuesGreaterThanOrEqualToExpectedValue accepts the
+// fetched values against an expected value of 0.
+func validateMetricValue(name string, dims []types.Dimension) status.TestResult {
+	log.Printf("validateMetricValue with metric: %s", name)
+
+	fetcher := MetricValueFetcher{}
+	values, err := fetcher.Fetch(ContainerInsightsNamespace, name, dims, SAMPLE_COUNT, MinuteStatPeriod)
+	if err != nil {
+		log.Println("failed to fetch metrics", err)
+		return status.TestResult{Name: name, Status: status.FAILED}
+	}
+
+	if IsAllValuesGreaterThanOrEqualToExpectedValue(name, values, 0) {
+		return status.TestResult{Name: name, Status: status.SUCCESSFUL}
+	}
+	return status.TestResult{Name: name, Status: status.FAILED}
+}
+
+// ValidateLogs validates the log group
+// /aws/containerinsights/<EKSClusterName>/performance over the three minutes
+// before the call, using one log stream per EKS instance name.
+//
+// For each stream the logs must be non-empty, and every log message must match
+// the JSON schema registered in eks_resources.EksClusterValidationMap for its
+// Type field and contain the "ClusterName":"<EKSClusterName>" substring. The
+// returned result is named "emf-logs"; it is FAILED as soon as the instance
+// lookup or any stream validation fails, and SUCCESSFUL otherwise.
+func ValidateLogs(env *environment.MetaData) status.TestResult {
+	result := status.TestResult{
+		Name:   "emf-logs",
+		Status: status.FAILED,
+	}
+
+	end := time.Now()
+	start := end.Add(-3 * time.Minute)
+	group := fmt.Sprintf("/aws/containerinsights/%s/performance", env.EKSClusterName)
+
+	// schemaFor picks the JSON schema that matches the Type field of a log message.
+	schemaFor := func(message string) (string, error) {
+		var eksClusterType awsservice.EKSClusterType
+		if innerErr := json.Unmarshal([]byte(message), &eksClusterType); innerErr != nil {
+			return "", fmt.Errorf("failed to unmarshal log file: %w", innerErr)
+		}
+		if jsonSchema, ok := eks_resources.EksClusterValidationMap[eksClusterType.Type]; ok {
+			return jsonSchema, nil
+		}
+		return "", errors.New("invalid cluster type provided")
+	}
+	clusterNameField := fmt.Sprintf("\"ClusterName\":\"%s\"", env.EKSClusterName)
+
+	// The log streams are named after the instances used for the EKS cluster.
+	instances, err := awsservice.GetEKSInstances(env.EKSClusterName)
+	if err != nil {
+		log.Println("failed to get EKS instances", err)
+		return result
+	}
+
+	for _, instance := range instances {
+		stream := *instance.InstanceName
+		// NOTE: awsservice.AssertNoDuplicateLogs() is not part of these assertions.
+		validationErr := awsservice.ValidateLogs(
+			group,
+			stream,
+			&start,
+			&end,
+			awsservice.AssertLogsNotEmpty(),
+			awsservice.AssertPerLog(
+				awsservice.AssertLogSchema(schemaFor),
+				awsservice.AssertLogContainsSubstring(clusterNameField),
+			),
+		)
+		if validationErr != nil {
+			log.Printf("log validation (%s/%s) failed: %v", group, stream, validationErr)
+			return result
+		}
+	}
+
+	result.Status = status.SUCCESSFUL
+	return result
+}
diff --git a/test/metric/metric_list_query.go b/test/metric/metric_list_query.go
index 8a3804efe..afe07f6c1 100644
--- a/test/metric/metric_list_query.go
+++ b/test/metric/metric_list_query.go
@@ -17,10 +17,10 @@ import (
"github.com/aws/amazon-cloudwatch-agent-test/util/awsservice"
)
-type MetricListFetcher struct {
+type Fetcher struct {
}
-func (n *MetricListFetcher) Fetch(namespace, metricName string, dimensions []types.Dimension) ([]types.Metric, error) {
+func (n *Fetcher) Fetch(namespace, metricName string, dimensions []types.Dimension) ([]types.Metric, error) {
var dims []types.DimensionFilter
for _, dim := range dimensions {
dims = append(dims, types.DimensionFilter{
@@ -56,29 +56,3 @@ func (n *MetricListFetcher) Fetch(namespace, metricName string, dimensions []typ
log.Printf("total number of metrics fetched: %v", len(metrics))
return metrics, nil
}
-
-func (n *MetricListFetcher) FetchByDimension(namespace string, dimensions []types.Dimension) ([]types.Metric, error) {
- var dims []types.DimensionFilter
- for _, dim := range dimensions {
- dims = append(dims, types.DimensionFilter{
- Name: dim.Name,
- Value: dim.Value,
- })
- }
-
- listMetricInput := cloudwatch.ListMetricsInput{
- Namespace: aws.String(namespace),
- Dimensions: dims,
- }
-
- log.Printf("Metric data input: namespace %v, dimensions %v", namespace, fmt.Sprint(&dims))
-
- output, err := awsservice.CwmClient.ListMetrics(context.Background(), &listMetricInput)
- if err != nil {
- return nil, fmt.Errorf("Error getting metric data %v", err)
- }
-
- log.Printf("Metrics fetched : %v", output.Metrics)
-
- return output.Metrics, nil
-}
diff --git a/test/metric/stat.go b/test/metric/stat.go
index d633985d3..a6dc5d25e 100644
--- a/test/metric/stat.go
+++ b/test/metric/stat.go
@@ -13,6 +13,5 @@ const (
MAXUMUM Statistics = "Maxmimum"
SUM Statistics = "Sum"
HighResolutionStatPeriod = 10
-
- MinuteStatPeriod = 60
+ MinuteStatPeriod = 60
)
diff --git a/test/metric_value_benchmark/eks_daemonset_test.go b/test/metric_value_benchmark/eks_daemonset_test.go
index 9572bd8bd..d7eb996f2 100644
--- a/test/metric_value_benchmark/eks_daemonset_test.go
+++ b/test/metric_value_benchmark/eks_daemonset_test.go
@@ -6,28 +6,22 @@
package metric_value_benchmark
import (
- "encoding/json"
- "errors"
- "fmt"
"log"
- "math/rand"
- "sort"
- "strings"
"time"
- "github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/service/cloudwatch/types"
+ "golang.org/x/exp/slices"
"github.com/aws/amazon-cloudwatch-agent-test/environment"
"github.com/aws/amazon-cloudwatch-agent-test/test/metric"
+ "github.com/aws/amazon-cloudwatch-agent-test/test/metric/dimension"
"github.com/aws/amazon-cloudwatch-agent-test/test/metric_value_benchmark/eks_resources"
"github.com/aws/amazon-cloudwatch-agent-test/test/status"
"github.com/aws/amazon-cloudwatch-agent-test/test/test_runner"
- "github.com/aws/amazon-cloudwatch-agent-test/util/awsservice"
)
-const containerInsightsNamespace = "ContainerInsights"
-const gpuMetricIndicator = "_gpu_"
+// list of metrics with more dimensions e.g. PodName and Namespace
+var metricsWithMoreDimensions = []string{"pod_number_of_container_restarts"}
type EKSDaemonTestRunner struct {
test_runner.BaseTestRunner
@@ -37,151 +31,66 @@ type EKSDaemonTestRunner struct {
func (e *EKSDaemonTestRunner) Validate() status.TestGroupResult {
var testResults []status.TestResult
- testResults = append(testResults, validateMetrics(e.env, gpuMetricIndicator, eks_resources.GetExpectedDimsToMetrics(e.env))...)
- testResults = append(testResults, e.validateLogs(e.env))
+ testResults = append(testResults, metric.ValidateMetrics(e.env, "", eks_resources.GetExpectedDimsToMetrics(e.env))...)
+ metrics := e.GetMeasuredMetrics()
+ for _, name := range metrics {
+ testResults = append(testResults, e.validateInstanceMetrics(name))
+ }
+ testResults = append(testResults, metric.ValidateLogs(e.env))
return status.TestGroupResult{
Name: e.GetTestName(),
TestResults: testResults,
}
}
-const (
- dimDelimiter = "-"
- ContainerInsightsNamespace = "ContainerInsights"
-)
-
-type dimToMetrics struct {
- // dim keys as string with dimDelimiter(-) eg. ClusterName-Namespace
- dimStr string
- // metric names to their dimensions with values. Dimension sets will be used for metric data validations
- metrics map[string][][]types.Dimension
-}
-
-func validateMetrics(env *environment.MetaData, metricFilter string, expectedDimsToMetrics map[string][]string) []status.TestResult {
- var results []status.TestResult
- dimsToMetrics := getMetricsInClusterDimension(env, metricFilter)
- //loops through each dimension set and checks if they exit in the cluster(fails if it doesn't)
- for dims, metrics := range expectedDimsToMetrics {
- var actual map[string][][]types.Dimension
- //looping through dtms until we find the dimension string equal to the one in the hard coded map
- for _, dtm := range dimsToMetrics {
- log.Printf("dtm: %s vs dims %s", dtm.dimStr, dims) //testing purposes
- if dtm.dimStr == dims {
- actual = dtm.metrics
- break
- }
- }
- //if there are no metrics for the dimension set, we fail the test
- if len(actual) < 1 {
- results = append(results, status.TestResult{
- Name: dims,
- Status: status.FAILED,
- })
- log.Printf("ValidateMetrics failed with missing dimension set: %s", dims)
- // keep testing other dims or fail early?
- continue
- }
- //verifies length of metrics for dimension set
- results = append(results, validateMetricsAvailability(dims, metrics, actual))
- for _, m := range metrics {
- // picking a random dimension set to test metric data so we don't have to test every dimension set
- randIdx := rand.Intn(len(actual[m]))
- //verifys values of metrics
- results = append(results, validateMetricValue(m, actual[m][randIdx]))
- }
+func (e *EKSDaemonTestRunner) validateInstanceMetrics(name string) status.TestResult {
+ testResult := status.TestResult{
+ Name: name,
+ Status: status.FAILED,
}
- return results
-}
-// Fetches all metrics in cluster
-func getMetricsInClusterDimension(env *environment.MetaData, metricFilter string) []dimToMetrics { //map[string]map[string]interface{} {
- listFetcher := metric.MetricListFetcher{}
- log.Printf("Fetching by cluster dimension")
- dims := []types.Dimension{
+ dims, failed := e.DimensionFactory.GetDimensions([]dimension.Instruction{
{
- Name: aws.String("ClusterName"),
- Value: aws.String(env.EKSClusterName),
+ Key: "ClusterName",
+ Value: dimension.UnknownDimensionValue(),
},
- }
- metrics, err := listFetcher.Fetch(ContainerInsightsNamespace, "", dims)
- if err != nil {
- log.Println("failed to fetch metric list", err)
- return nil
- }
- if len(metrics) < 1 {
- log.Println("cloudwatch metric list is empty")
- return nil
+ })
+ if len(failed) > 0 {
+ log.Println("failed to get dimensions")
+ return testResult
}
- var results []dimToMetrics
- for _, m := range metrics {
- // filter by metric name filter(skip gpu validation)
- if metricFilter != "" && strings.Contains(*m.MetricName, metricFilter) {
- continue
- }
- var dims []string
- for _, d := range m.Dimensions {
- dims = append(dims, *d.Name)
+ // get list of metrics that has more dimensions for container insights
+ // this is to avoid adding more dimension provider for non-trivial dimensions e.g. PodName
+ listFetcher := metric.Fetcher{}
+ if slices.Contains(metricsWithMoreDimensions, name) {
+ metrics, err := listFetcher.Fetch(metric.ContainerInsightsNamespace, name, dims)
+ if err != nil {
+ log.Println("failed to fetch metric list", err)
+ return testResult
}
- sort.Sort(sort.StringSlice(dims)) //what's the point of sorting?
- dimsKey := strings.Join(dims, dimDelimiter)
- log.Printf("processing dims: %s", dimsKey)
- var dtm dimToMetrics
- for _, ele := range results {
- if ele.dimStr == dimsKey {
- dtm = ele
- break
- }
- }
- if dtm.dimStr == "" {
- dtm = dimToMetrics{
- dimStr: dimsKey,
- metrics: make(map[string][][]types.Dimension),
- }
- results = append(results, dtm)
+ if len(metrics) < 1 {
+ log.Println("metric list is empty")
+ return testResult
}
- dtm.metrics[*m.MetricName] = append(dtm.metrics[*m.MetricName], m.Dimensions)
- }
- return results
-}
-
-// Check if all metrics from cluster matches hard coded map
-func validateMetricsAvailability(dims string, expected []string, actual map[string][][]types.Dimension) status.TestResult {
- testResult := status.TestResult{
- Name: dims,
- Status: status.FAILED,
- }
- log.Printf("expected metrics: %d, actual metrics: %d", len(expected), len(actual))
- if compareMetrics(expected, actual) {
- testResult.Status = status.SUCCESSFUL
- } else {
- log.Printf("validateMetricsAvailability failed for %s", dims)
- }
- return testResult
-}
-func compareMetrics(expected []string, actual map[string][][]types.Dimension) bool {
- if len(expected) != len(actual) {
- return false
- }
+ // just verify 1 of returned metrics for values
+ for _, dim := range metrics[0].Dimensions {
+ // skip since it's provided by dimension provider
+ if *dim.Name == "ClusterName" {
+ continue
+ }
- for _, key := range expected {
- if _, ok := actual[key]; !ok {
- return false
+ dims = append(dims, types.Dimension{
+ Name: dim.Name,
+ Value: dim.Value,
+ })
}
}
- return true
-}
-func validateMetricValue(name string, dims []types.Dimension) status.TestResult {
- log.Printf("validateMetricValue with metric: %s", name)
- testResult := status.TestResult{
- Name: name,
- Status: status.FAILED,
- }
valueFetcher := metric.MetricValueFetcher{}
- values, err := valueFetcher.Fetch(containerInsightsNamespace, name, dims, metric.SAMPLE_COUNT, metric.MinuteStatPeriod)
+ values, err := valueFetcher.Fetch(metric.ContainerInsightsNamespace, name, dims, metric.AVERAGE, metric.HighResolutionStatPeriod)
if err != nil {
log.Println("failed to fetch metrics", err)
return testResult
@@ -195,60 +104,6 @@ func validateMetricValue(name string, dims []types.Dimension) status.TestResult
return testResult
}
-func (e *EKSDaemonTestRunner) validateLogs(env *environment.MetaData) status.TestResult {
- testResult := status.TestResult{
- Name: "emf-logs",
- Status: status.FAILED,
- }
-
- now := time.Now()
- group := fmt.Sprintf("/aws/containerinsights/%s/performance", env.EKSClusterName)
-
- // need to get the instances used for the EKS cluster
- eKSInstances, err := awsservice.GetEKSInstances(env.EKSClusterName)
- if err != nil {
- log.Println("failed to get EKS instances", err)
- return testResult
- }
-
- for _, instance := range eKSInstances {
- stream := *instance.InstanceName
- err = awsservice.ValidateLogs(
- group,
- stream,
- nil,
- &now,
- awsservice.AssertLogsNotEmpty(),
- awsservice.AssertNoDuplicateLogs(),
- awsservice.AssertPerLog(
- awsservice.AssertLogSchema(func(message string) (string, error) {
- var eksClusterType awsservice.EKSClusterType
- innerErr := json.Unmarshal([]byte(message), &eksClusterType)
- if innerErr != nil {
- return "", fmt.Errorf("failed to unmarshal log file: %w", innerErr)
- }
-
- log.Printf("eksClusterType is: %s", eksClusterType.Type)
- jsonSchema, ok := eks_resources.EksClusterValidationMap[eksClusterType.Type]
- if !ok {
- return "", errors.New("invalid cluster type provided")
- }
- return jsonSchema, nil
- }),
- awsservice.AssertLogContainsSubstring(fmt.Sprintf("\"ClusterName\":\"%s\"", env.EKSClusterName)),
- ),
- )
-
- if err != nil {
- log.Printf("log validation (%s/%s) failed: %v", group, stream, err)
- return testResult
- }
- }
-
- testResult.Status = status.SUCCESSFUL
- return testResult
-}
-
func (e *EKSDaemonTestRunner) GetTestName() string {
return "EKSContainerInstance"
}
diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/cluster_gpu.json b/test/metric_value_benchmark/eks_resources/test_schemas/cluster_gpu.json
new file mode 100644
index 000000000..5b14e3fb1
--- /dev/null
+++ b/test/metric_value_benchmark/eks_resources/test_schemas/cluster_gpu.json
@@ -0,0 +1,21 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "title": "structured log schema",
+ "description": "json schema for the cloudwatch agent k8s structured log",
+ "type": "object",
+ "properties": {
+ "CloudWatchMetrics": {},
+ "ClusterName": {},
+ "Timestamp": {},
+ "Type": {},
+ "Version": {},
+ "cluster_gpu_total": {},
+ "cluster_gpu_request": {},
+ "cluster_gpu_request": {}
+ "required": [
+ "ClusterName",
+ "Type",
+ "Version",
+ "CloudWatchMetrics"
+ ]
+}
\ No newline at end of file
diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/container_gpu.json b/test/metric_value_benchmark/eks_resources/test_schemas/container_gpu.json
new file mode 100644
index 000000000..99c56e87f
--- /dev/null
+++ b/test/metric_value_benchmark/eks_resources/test_schemas/container_gpu.json
@@ -0,0 +1,45 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "title": "structured log schema",
+ "description": "json schema for the cloudwatch agent k8s structured log",
+ "type": "object",
+ "properties": {
+ "CloudWatchMetrics": {},
+ "ClusterName": {},
+ "ContainerName": {},
+ "FullPodName": {},
+ "GpuDevice": {},
+ "Hostname": {},
+ "InstanceId": {},
+ "K8sPodName": {},
+ "Namespace": {},
+ "NodeName": {},
+ "OTelLib": {},
+ "PodName": {},
+ "Timestamp": {},
+ "Type": {},
+ "UUID": {},
+ "Version": {},
+ "container_gpu_memory_total": {},
+ "container_gpu_memory_used": {},
+ "container_gpu_power_draw": {},
+ "container_gpu_temperature": {},
+ "container_gpu_utilization": {},
+ "container_gpu_memory_utilization": {},
+ "Service":{}
+ },
+ "required": [
+ "ClusterName",
+ "ContainerName",
+ "FullPodName",
+ "GpuDevice",
+ "InstanceId",
+ "Namespace",
+ "NodeName",
+ "PodName",
+ "Timestamp",
+ "Type",
+ "Version",
+ "CloudWatchMetrics"
+ ]
+}
\ No newline at end of file
diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/node_gpu.json b/test/metric_value_benchmark/eks_resources/test_schemas/node_gpu.json
new file mode 100644
index 000000000..85df0952b
--- /dev/null
+++ b/test/metric_value_benchmark/eks_resources/test_schemas/node_gpu.json
@@ -0,0 +1,44 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "title": "structured log schema",
+ "description": "json schema for the cloudwatch agent k8s structured log",
+ "type": "object",
+ "properties": {
+ "CloudWatchMetrics": {},
+ "ClusterName": {},
+ "ContainerName": {},
+ "FullPodName": {},
+ "GpuDevice": {},
+ "Hostname": {},
+ "InstanceId": {},
+ "K8sPodName": {},
+ "Namespace": {},
+ "NodeName": {},
+ "OTelLib": {},
+ "PodName": {},
+ "Timestamp": {},
+ "Type": {},
+ "UUID": {},
+ "Version": {},
+ "node_gpu_memory_total": {},
+ "node_gpu_memory_used": {},
+ "node_gpu_power_draw": {},
+ "node_gpu_temperature": {},
+ "node_gpu_utilization": {},
+ "node_gpu_memory_utilization": {},
+ "node_gpu_total": {},
+ "node_gpu_request": {},
+ "node_gpu_list": {},
+ "Service":{}
+ },
+ "required": [
+ "ClusterName",
+ "GpuDevice",
+ "InstanceId",
+ "NodeName",
+ "Timestamp",
+ "Type",
+ "Version",
+ "CloudWatchMetrics"
+ ]
+}
\ No newline at end of file
diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/pod_gpu.json b/test/metric_value_benchmark/eks_resources/test_schemas/pod_gpu.json
new file mode 100644
index 000000000..4b532094f
--- /dev/null
+++ b/test/metric_value_benchmark/eks_resources/test_schemas/pod_gpu.json
@@ -0,0 +1,47 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "title": "structured log schema",
+ "description": "json schema for the cloudwatch agent k8s structured log",
+ "type": "object",
+ "properties": {
+ "CloudWatchMetrics": {},
+ "ClusterName": {},
+ "ContainerName": {},
+ "FullPodName": {},
+ "GpuDevice": {},
+ "Hostname": {},
+ "InstanceId": {},
+ "K8sPodName": {},
+ "Namespace": {},
+ "NodeName": {},
+ "OTelLib": {},
+ "PodName": {},
+ "Timestamp": {},
+ "Type": {},
+ "UUID": {},
+ "Version": {},
+ "pod_gpu_memory_total": {},
+ "pod_gpu_memory_used": {},
+ "pod_gpu_power_draw": {},
+ "pod_gpu_temperature": {},
+ "pod_gpu_utilization": {},
+ "pod_gpu_memory_utilization": {},
+ "pod_gpu_total": {},
+ "pod_gpu_request": {},
+ "pod_gpu_list": {},
+ "Service":{}
+ },
+ "required": [
+ "ClusterName",
+ "FullPodName",
+ "GpuDevice",
+ "InstanceId",
+ "Namespace",
+ "NodeName",
+ "PodName",
+ "Timestamp",
+ "Type",
+ "Version",
+ "CloudWatchMetrics"
+ ]
+}
\ No newline at end of file
diff --git a/test/metric_value_benchmark/eks_resources/util.go b/test/metric_value_benchmark/eks_resources/util.go
index 96e832aac..d398b81e1 100644
--- a/test/metric_value_benchmark/eks_resources/util.go
+++ b/test/metric_value_benchmark/eks_resources/util.go
@@ -38,6 +38,14 @@ var (
eksPodSchema string
//go:embed test_schemas/pod_net.json
eksPodNetSchema string
+ //go:embed test_schemas/container_gpu.json
+ eksContainerGpuSchema string
+ //go:embed test_schemas/pod_gpu.json
+ eksPodGpuSchema string
+ //go:embed test_schemas/node_gpu.json
+ eksNodeGpuSchema string
+ //go:embed test_schemas/cluster_gpu.json
+ eksClusterGpuSchema string
EksClusterValidationMap = map[string]string{
"Cluster": eksClusterSchema,
@@ -54,6 +62,10 @@ var (
"NodeNet": eksNodeNetSchema,
"Pod": eksPodSchema,
"PodNet": eksPodNetSchema,
+ "ContainerGPU": eksContainerGpuSchema,
+ "PodGPU": eksPodGpuSchema,
+ "NodeGPU": eksNodeGpuSchema,
+ "ClusterGPU": eksClusterGpuSchema,
}
)
diff --git a/util/awsservice/cloudwatchmetrics.go b/util/awsservice/cloudwatchmetrics.go
index c2fab7cac..59ef886b2 100644
--- a/util/awsservice/cloudwatchmetrics.go
+++ b/util/awsservice/cloudwatchmetrics.go
@@ -56,7 +56,7 @@ func ValidateMetric(metricName, namespace string, dimensionsFilter []types.Dimen
return nil
}
-// ValidateMetrics takes the metric name, metric dimension and corresponding namespace that contains the metric
+// ValidateMetricWithTest takes the metric name, metric dimension and corresponding namespace that contains the metric
func ValidateMetricWithTest(t *testing.T, metricName, namespace string, dimensionsFilter []types.DimensionFilter, retries int, retryTime time.Duration) {
var err error
for i := 0; i < retries; i++ {