From c638bfa49b51b75d909ceb1fe301b39703e9b99e Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim <884273+movence@users.noreply.github.com> Date: Mon, 8 Apr 2024 13:56:15 -0400 Subject: [PATCH] add nvidia gpu integ test (#399) --- environment/metadata.go | 5 + generator/test_case_generator.go | 4 + terraform/eks/daemon/gpu/main.tf | 719 ++++++++++++++++++ terraform/eks/daemon/gpu/providers.tf | 17 + terraform/eks/daemon/gpu/variables.tf | 37 + test/gpu/gpu_test.go | 79 ++ test/gpu/nvidia_test.go | 118 +++ test/gpu/resources/config.json | 16 + test/gpu/resources/httpd-ssl.conf | 43 ++ test/gpu/resources/httpd.conf | 101 +++ test/metric/container_insights_util.go | 222 ++++++ test/metric/metric_list_query.go | 30 +- test/metric/stat.go | 3 +- .../eks_daemonset_test.go | 231 ++---- .../test_schemas/cluster_gpu.json | 21 + .../test_schemas/container_gpu.json | 45 ++ .../eks_resources/test_schemas/node_gpu.json | 44 ++ .../eks_resources/test_schemas/pod_gpu.json | 47 ++ .../eks_resources/util.go | 12 + util/awsservice/cloudwatchmetrics.go | 2 +- 20 files changed, 1577 insertions(+), 219 deletions(-) create mode 100644 terraform/eks/daemon/gpu/main.tf create mode 100644 terraform/eks/daemon/gpu/providers.tf create mode 100644 terraform/eks/daemon/gpu/variables.tf create mode 100644 test/gpu/gpu_test.go create mode 100644 test/gpu/nvidia_test.go create mode 100644 test/gpu/resources/config.json create mode 100644 test/gpu/resources/httpd-ssl.conf create mode 100644 test/gpu/resources/httpd.conf create mode 100644 test/metric/container_insights_util.go create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/cluster_gpu.json create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/container_gpu.json create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/node_gpu.json create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/pod_gpu.json diff --git a/environment/metadata.go b/environment/metadata.go index c9c0162ab..7a0e1e965 100644 --- a/environment/metadata.go +++ b/environment/metadata.go @@ -43,6 +43,7 @@ type MetaData struct { InstanceId string InstancePlatform string AgentStartCommand string + EksGpuType string } type MetaDataStrings struct { @@ -65,6 +66,7 @@ type MetaDataStrings struct { InstanceId string InstancePlatform string AgentStartCommand string + EksGpuType string } func registerComputeType(dataString *MetaDataStrings) { @@ -94,6 +96,7 @@ func registerECSData(dataString *MetaDataStrings) { func registerEKSData(d *MetaDataStrings) { flag.StringVar(&(d.EKSClusterName), "eksClusterName", "", "EKS cluster name") flag.StringVar(&(d.EksDeploymentStrategy), "eksDeploymentStrategy", "", "Daemon/Replica/Sidecar") + flag.StringVar(&(d.EksGpuType), "eksGpuType", "", "nvidia/inferentia") } func registerPluginTestsToExecute(dataString *MetaDataStrings) { @@ -210,6 +213,7 @@ func fillEKSData(e *MetaData, data *MetaDataStrings) { } e.EKSClusterName = data.EKSClusterName + e.EksGpuType = data.EksGpuType } func RegisterEnvironmentMetaDataFlags() *MetaDataStrings { registerComputeType(registeredMetaDataStrings) @@ -250,6 +254,7 @@ func GetEnvironmentMetaData() *MetaData { metaDataStorage.InstanceId = registeredMetaDataStrings.InstanceId metaDataStorage.InstancePlatform = registeredMetaDataStrings.InstancePlatform metaDataStorage.AgentStartCommand = registeredMetaDataStrings.AgentStartCommand + metaDataStorage.EksGpuType = registeredMetaDataStrings.EksGpuType return metaDataStorage } diff --git a/generator/test_case_generator.go 
b/generator/test_case_generator.go index e17b63e0d..056670d43 100644 --- a/generator/test_case_generator.go +++ b/generator/test_case_generator.go @@ -214,6 +214,10 @@ var testTypeToTestConfig = map[string][]testConfig{ targets: map[string]map[string]struct{}{"arc": {"amd64": {}}}, }, {testDir: "./test/fluent", terraformDir: "terraform/eks/daemon/fluent/windows/2022"}, + { + testDir: "./test/gpu", terraformDir: "terraform/eks/daemon/gpu", + targets: map[string]map[string]struct{}{"arc": {"amd64": {}}}, + }, }, "eks_deployment": { {testDir: "./test/metric_value_benchmark"}, diff --git a/terraform/eks/daemon/gpu/main.tf b/terraform/eks/daemon/gpu/main.tf new file mode 100644 index 000000000..7952ef45a --- /dev/null +++ b/terraform/eks/daemon/gpu/main.tf @@ -0,0 +1,719 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +module "common" { + source = "../../../common" + cwagent_image_repo = var.cwagent_image_repo + cwagent_image_tag = var.cwagent_image_tag +} + +module "basic_components" { + source = "../../../basic_components" + + region = var.region +} + +data "aws_eks_cluster_auth" "this" { + name = aws_eks_cluster.this.name +} + +resource "aws_eks_cluster" "this" { + name = "cwagent-eks-integ-${module.common.testing_id}" + role_arn = module.basic_components.role_arn + version = var.k8s_version + enabled_cluster_log_types = [ + "api", + "audit", + "authenticator", + "controllerManager", + "scheduler" + ] + vpc_config { + subnet_ids = module.basic_components.public_subnet_ids + security_group_ids = [module.basic_components.security_group] + } +} + +# EKS Node Groups +resource "aws_eks_node_group" "this" { + cluster_name = aws_eks_cluster.this.name + node_group_name = "cwagent-eks-integ-node" + node_role_arn = aws_iam_role.node_role.arn + subnet_ids = module.basic_components.public_subnet_ids + + scaling_config { + desired_size = 1 + max_size = 1 + min_size = 1 + } + + ami_type = "AL2_x86_64" + capacity_type = "ON_DEMAND" + disk_size = 20 + instance_types = ["t3.medium"] + + depends_on = [ + aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly, + aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy, + aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy, + aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy + ] +} + +# EKS Node IAM Role +resource "aws_iam_role" "node_role" { + name = "cwagent-eks-Worker-Role-${module.common.testing_id}" + assume_role_policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Effect = "Allow", + Principal = { + Service = "ec2.amazonaws.com" + }, + Action = "sts:AssumeRole" + } + ] + }) + +} + +resource "aws_iam_role_policy_attachment" "node_AmazonEKSWorkerNodePolicy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" + role = aws_iam_role.node_role.name +} + +resource "aws_iam_role_policy_attachment" "node_AmazonEKS_CNI_Policy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy" + role = aws_iam_role.node_role.name +} + +resource "aws_iam_role_policy_attachment" "node_AmazonEC2ContainerRegistryReadOnly" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" + role = aws_iam_role.node_role.name +} + +resource "aws_iam_role_policy_attachment" "node_CloudWatchAgentServerPolicy" { + policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy" + role = aws_iam_role.node_role.name +} + +# TODO: these security groups be created once and then reused +# EKS Cluster Security Group +resource 
"aws_security_group" "eks_cluster_sg" { + name = "cwagent-eks-cluster-sg-${module.common.testing_id}" + description = "Cluster communication with worker nodes" + vpc_id = module.basic_components.vpc_id +} + +resource "aws_security_group_rule" "cluster_inbound" { + description = "Allow worker nodes to communicate with the cluster API Server" + from_port = 443 + protocol = "tcp" + security_group_id = aws_security_group.eks_cluster_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 443 + type = "ingress" +} + +resource "aws_security_group_rule" "cluster_outbound" { + description = "Allow cluster API Server to communicate with the worker nodes" + from_port = 1024 + protocol = "tcp" + security_group_id = aws_security_group.eks_cluster_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 65535 + type = "egress" +} + + +# EKS Node Security Group +resource "aws_security_group" "eks_nodes_sg" { + name = "cwagent-eks-node-sg-${module.common.testing_id}" + description = "Security group for all nodes in the cluster" + vpc_id = module.basic_components.vpc_id + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_security_group_rule" "nodes_internal" { + description = "Allow nodes to communicate with each other" + from_port = 0 + protocol = "-1" + security_group_id = aws_security_group.eks_nodes_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 65535 + type = "ingress" +} + +resource "aws_security_group_rule" "nodes_cluster_inbound" { + description = "Allow worker Kubelets and pods to receive communication from the cluster control plane" + from_port = 1025 + protocol = "tcp" + security_group_id = aws_security_group.eks_nodes_sg.id + source_security_group_id = aws_security_group.eks_cluster_sg.id + to_port = 65535 + type = "ingress" +} + + +# create cert for communication between agent and dcgm +resource "tls_private_key" "private_key" { + algorithm = "RSA" +} + +resource "local_file" "ca_key" { + content = tls_private_key.private_key.private_key_pem + filename = "${path.module}/certs/ca.key" +} + +resource "tls_self_signed_cert" "ca_cert" { + private_key_pem = tls_private_key.private_key.private_key_pem + is_ca_certificate = true + subject { + common_name = "dcgm-exporter-service.amazon-cloudwatch.svc" + organization = "Amazon CloudWatch Agent" + } + validity_period_hours = 24 + allowed_uses = [ + "digital_signature", + "key_encipherment", + "cert_signing", + "crl_signing", + "server_auth", + "client_auth", + ] +} + +resource "local_file" "ca_cert_file" { + content = tls_self_signed_cert.ca_cert.cert_pem + filename = "${path.module}/certs/ca.cert" +} + +resource "tls_private_key" "server_private_key" { + algorithm = "RSA" +} + +resource "local_file" "server_key" { + content = tls_private_key.server_private_key.private_key_pem + filename = "${path.module}/certs/server.key" +} + +resource "tls_cert_request" "local_csr" { + private_key_pem = tls_private_key.server_private_key.private_key_pem + dns_names = ["localhost", "127.0.0.1", "dcgm-exporter-service.amazon-cloudwatch.svc"] + subject { + common_name = "dcgm-exporter-service.amazon-cloudwatch.svc" + organization = "Amazon CloudWatch Agent" + } +} + +resource "tls_locally_signed_cert" "server_cert" { + cert_request_pem = tls_cert_request.local_csr.cert_request_pem + ca_private_key_pem = tls_private_key.private_key.private_key_pem + ca_cert_pem = tls_self_signed_cert.ca_cert.cert_pem + 
validity_period_hours = 12 + allowed_uses = [ + "digital_signature", + "key_encipherment", + "server_auth", + "client_auth", + ] +} + +resource "local_file" "server_cert_file" { + content = tls_locally_signed_cert.server_cert.cert_pem + filename = "${path.module}/certs/server.cert" +} + +resource "kubernetes_secret" "agent_cert" { + metadata { + name = "amazon-cloudwatch-observability-agent-cert" + namespace = "amazon-cloudwatch" + } + data = { + "ca.crt" = tls_self_signed_cert.ca_cert.cert_pem #filebase64(local_file.ca_cert_file.filename) + "tls.crt" = tls_locally_signed_cert.server_cert.cert_pem #filebase64(local_file.server_cert_file.filename) + "tls.key" = tls_private_key.server_private_key.private_key_pem #filebase64(local_file.server_key.filename) + } +} + + +resource "kubernetes_namespace" "namespace" { + metadata { + name = "amazon-cloudwatch" + } +} + +# dummy daemonset that simulates dcgm-exporter assuming there is only 1 GPU available +resource "kubernetes_daemonset" "exporter" { + depends_on = [ + kubernetes_namespace.namespace, + kubernetes_service_account.cwagentservice, + aws_eks_node_group.this, + kubernetes_config_map.httpdconfig, + ] + metadata { + name = "dcgm-exporter" + namespace = "amazon-cloudwatch" + labels = { + k8s-app = "dcgm-exporter" + } + } + spec { + selector { + match_labels = { + "k8s-app" = "dcgm-exporter" + } + } + template { + metadata { + labels = { + "name" : "dcgm-exporter" + "k8s-app" : "dcgm-exporter" + } + } + spec { + node_selector = { + "kubernetes.io/os" : "linux" + } + container { + name = "dcgm-exporter" + image = "httpd:2.4-alpine" + resources { + limits = { + "cpu" : "50m", + "memory" : "50Mi" + } + requests = { + "cpu" : "50m", + "memory" : "50Mi" + } + } + port { + name = "metrics" + container_port = 9400 + host_port = 9400 + protocol = "TCP" + } + command = [ + "/bin/sh", + "-c", + ] + args = [ + "/bin/echo 'DCGM_FI_DEV_GPU_UTIL{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_FREE{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_USED{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_TOTAL{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_USED_PERCENT{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_GPU_TEMP{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_POWER_USAGE{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1' >> /usr/local/apache2/htdocs/metrics && sed -i -e \"s/hostname1/$HOST_NAME/g\" /usr/local/apache2/htdocs/metrics && httpd-foreground -k restart" + ] + volume_mount { + mount_path = "/etc/amazon-cloudwatch-observability-dcgm-cert" + name = 
"dcgmtls" + read_only = true + } + volume_mount { + mount_path = "/usr/local/apache2/conf/httpd.conf" + sub_path = "httpd.conf" + name = "httpdconfig" + read_only = true + } + volume_mount { + mount_path = "/usr/local/apache2/conf/extra/httpd-ssl.conf" + sub_path = "httpd-ssl.conf" + name = "httpdconfig" + read_only = true + } + env { + name = "HOST_IP" + value_from { + field_ref { + field_path = "status.hostIP" + } + } + } + env { + name = "HOST_NAME" + value_from { + field_ref { + field_path = "spec.nodeName" + } + } + } + env { + name = "K8S_NAMESPACE" + value_from { + field_ref { + field_path = "metadata.namespace" + } + } + } + } + volume { + name = "dcgmtls" + secret { + secret_name = "amazon-cloudwatch-observability-agent-cert" + items { + key = "tls.crt" + path = "server.crt" + } + items { + key = "tls.key" + path = "server.key" + } + } + } + volume { + name = "httpdconfig" + config_map { + name = "httpdconfig" + } + } + service_account_name = "cloudwatch-agent" + termination_grace_period_seconds = 60 + } + } + } +} + +resource "kubernetes_service" "exporter" { + depends_on = [ + kubernetes_namespace.namespace, + kubernetes_service_account.cwagentservice, + aws_eks_node_group.this, + kubernetes_daemonset.exporter + ] + metadata { + name = "dcgm-exporter-service" + namespace = "amazon-cloudwatch" + labels = { + "k8s-app" : "dcgm-exporter-service" + } + annotations = { + "prometheus.io/scrape" : "true" + } + } + spec { + type = "ClusterIP" + selector = { + k8s-app = "dcgm-exporter" + } + port { + name = "metrics" + port = 9400 + target_port = 9400 + protocol = "TCP" + } + } +} + +resource "kubernetes_daemonset" "service" { + depends_on = [ + kubernetes_namespace.namespace, + kubernetes_service_account.cwagentservice, + aws_eks_node_group.this, + kubernetes_service.exporter + ] + metadata { + name = "cloudwatch-agent" + namespace = "amazon-cloudwatch" + } + spec { + selector { + match_labels = { + "name" : "cloudwatch-agent" + } + } + template { + metadata { + labels = { + "name" : "cloudwatch-agent" + } + } + spec { + node_selector = { + "kubernetes.io/os" : "linux" + } + container { + name = "cwagent" + image = "${var.cwagent_image_repo}:${var.cwagent_image_tag}" + image_pull_policy = "Always" + resources { + limits = { + "cpu" : "200m", + "memory" : "200Mi" + } + requests = { + "cpu" : "200m", + "memory" : "200Mi" + } + } + port { + container_port = 25888 + host_port = 25888 + protocol = "UDP" + } + env { + name = "HOST_IP" + value_from { + field_ref { + field_path = "status.hostIP" + } + } + } + env { + name = "HOST_NAME" + value_from { + field_ref { + field_path = "spec.nodeName" + } + } + } + env { + name = "K8S_NAMESPACE" + value_from { + field_ref { + field_path = "metadata.namespace" + } + } + } + volume_mount { + mount_path = "/etc/cwagentconfig" + name = "cwagentconfig" + } + volume_mount { + mount_path = "/rootfs" + name = "rootfs" + read_only = true + } + volume_mount { + mount_path = "/var/run/docker.sock" + name = "dockersock" + read_only = true + } + volume_mount { + mount_path = "/var/lib/docker" + name = "varlibdocker" + read_only = true + } + volume_mount { + mount_path = "/run/containerd/containerd.sock" + name = "containerdsock" + read_only = true + } + volume_mount { + mount_path = "/sys" + name = "sys" + read_only = true + } + volume_mount { + mount_path = "/dev/disk" + name = "devdisk" + read_only = true + } + volume_mount { + mount_path = "/etc/amazon-cloudwatch-observability-agent-cert" + name = "agenttls" + read_only = true + } + } + volume { + name = 
"cwagentconfig" + config_map { + name = "cwagentconfig" + } + } + volume { + name = "rootfs" + host_path { + path = "/" + } + } + volume { + name = "dockersock" + host_path { + path = "/var/run/docker.sock" + } + } + volume { + name = "varlibdocker" + host_path { + path = "/var/lib/docker" + } + } + volume { + name = "containerdsock" + host_path { + path = "/run/containerd/containerd.sock" + } + } + volume { + name = "sys" + host_path { + path = "/sys" + } + } + volume { + name = "devdisk" + host_path { + path = "/dev/disk" + } + } + volume { + name = "agenttls" + secret { + secret_name = "amazon-cloudwatch-observability-agent-cert" + items { + key = "ca.crt" + path = "tls-ca.crt" + } + } + } + service_account_name = "cloudwatch-agent" + termination_grace_period_seconds = 60 + } + } + } +} + +########################################## +# Template Files +########################################## +locals { + httpd_config = "../../../../${var.test_dir}/resources/httpd.conf" + httpd_ssl_config = "../../../../${var.test_dir}/resources/httpd-ssl.conf" + cwagent_config = fileexists("../../../../${var.test_dir}/resources/config.json") ? "../../../../${var.test_dir}/resources/config.json" : "../default_resources/default_amazon_cloudwatch_agent.json" +} + +data "template_file" "cwagent_config" { + template = file(local.cwagent_config) + vars = { + } +} + +resource "kubernetes_config_map" "cwagentconfig" { + depends_on = [ + kubernetes_namespace.namespace, + kubernetes_service_account.cwagentservice + ] + metadata { + name = "cwagentconfig" + namespace = "amazon-cloudwatch" + } + data = { + "cwagentconfig.json" : data.template_file.cwagent_config.rendered + } +} + +data "template_file" "httpd_config" { + template = file(local.httpd_config) + vars = {} +} +data "template_file" "httpd_ssl_config" { + template = file(local.httpd_ssl_config) + vars = {} +} + +resource "kubernetes_config_map" "httpdconfig" { + depends_on = [ + kubernetes_namespace.namespace, + kubernetes_service_account.cwagentservice + ] + metadata { + name = "httpdconfig" + namespace = "amazon-cloudwatch" + } + data = { + "httpd.conf" : data.template_file.httpd_config.rendered + "httpd-ssl.conf" : data.template_file.httpd_ssl_config.rendered + } +} + +resource "kubernetes_service_account" "cwagentservice" { + depends_on = [kubernetes_namespace.namespace] + metadata { + name = "cloudwatch-agent" + namespace = "amazon-cloudwatch" + } +} + +resource "kubernetes_cluster_role" "clusterrole" { + depends_on = [kubernetes_namespace.namespace] + metadata { + name = "cloudwatch-agent-role" + } + rule { + verbs = ["get", "list", "watch"] + resources = ["pods", "pods/logs", "nodes", "nodes/proxy", "namespaces", "endpoints"] + api_groups = [""] + } + rule { + verbs = ["list", "watch"] + resources = ["replicasets"] + api_groups = ["apps"] + } + rule { + verbs = ["list", "watch"] + resources = ["jobs"] + api_groups = ["batch"] + } + rule { + verbs = ["get"] + resources = ["nodes/proxy"] + api_groups = [""] + } + rule { + verbs = ["create"] + resources = ["nodes/stats", "configmaps", "events"] + api_groups = [""] + } + rule { + verbs = ["get", "update"] + resource_names = ["cwagent-clusterleader"] + resources = ["configmaps"] + api_groups = [""] + } + rule { + verbs = ["list", "watch"] + resources = ["services"] + api_groups = [""] + } + rule { + non_resource_urls = ["/metrics"] + verbs = ["get", "list", "watch"] + } +} + +resource "kubernetes_cluster_role_binding" "rolebinding" { + depends_on = [kubernetes_namespace.namespace] + metadata { + name = 
"cloudwatch-agent-role-binding" + } + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = "cloudwatch-agent-role" + } + subject { + kind = "ServiceAccount" + name = "cloudwatch-agent" + namespace = "amazon-cloudwatch" + } +} + +resource "null_resource" "validator" { + depends_on = [ + aws_eks_node_group.this, + kubernetes_daemonset.service, + kubernetes_cluster_role_binding.rolebinding, + kubernetes_service_account.cwagentservice, + ] + provisioner "local-exec" { + command = <<-EOT + echo "Validating EKS metrics/logs for EMF" + cd ../../../.. + go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia + EOT + } +} diff --git a/terraform/eks/daemon/gpu/providers.tf b/terraform/eks/daemon/gpu/providers.tf new file mode 100644 index 000000000..9bd2885f5 --- /dev/null +++ b/terraform/eks/daemon/gpu/providers.tf @@ -0,0 +1,17 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +provider "aws" { + region = var.region +} + +provider "kubernetes" { + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + args = ["eks", "get-token", "--cluster-name", aws_eks_cluster.this.name] + } + host = aws_eks_cluster.this.endpoint + cluster_ca_certificate = base64decode(aws_eks_cluster.this.certificate_authority.0.data) + token = data.aws_eks_cluster_auth.this.token +} \ No newline at end of file diff --git a/terraform/eks/daemon/gpu/variables.tf b/terraform/eks/daemon/gpu/variables.tf new file mode 100644 index 000000000..26a0e6cd0 --- /dev/null +++ b/terraform/eks/daemon/gpu/variables.tf @@ -0,0 +1,37 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +variable "region" { + type = string + default = "us-west-2" +} + +variable "test_dir" { + type = string + default = "./test/gpu" +} + +variable "cwagent_image_repo" { + type = string + default = "public.ecr.aws/cloudwatch-agent/cloudwatch-agent" +} + +variable "cwagent_image_tag" { + type = string + default = "latest" +} + +variable "k8s_version" { + type = string + default = "1.28" +} + +variable "ami_type" { + type = string + default = "AL2_x86_64" +} + +variable "instance_type" { + type = string + default = "g4dn.xlarge" +} \ No newline at end of file diff --git a/test/gpu/gpu_test.go b/test/gpu/gpu_test.go new file mode 100644 index 000000000..352f9cad1 --- /dev/null +++ b/test/gpu/gpu_test.go @@ -0,0 +1,79 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT + +//go:build !windows + +package emf + +import ( + "fmt" + "log" + "testing" + + "github.com/stretchr/testify/suite" + + "github.com/aws/amazon-cloudwatch-agent-test/environment" + "github.com/aws/amazon-cloudwatch-agent-test/environment/computetype" + "github.com/aws/amazon-cloudwatch-agent-test/test/metric/dimension" + "github.com/aws/amazon-cloudwatch-agent-test/test/status" + "github.com/aws/amazon-cloudwatch-agent-test/test/test_runner" +) + +type GPUTestSuite struct { + suite.Suite + test_runner.TestSuite +} + +func (suite *GPUTestSuite) SetupSuite() { + fmt.Println(">>>> Starting GPU Container Insights TestSuite") +} + +func (suite *GPUTestSuite) TearDownSuite() { + suite.Result.Print() + fmt.Println(">>>> Finished GPU Container Insights TestSuite") +} + +func init() { + environment.RegisterEnvironmentMetaDataFlags() +} + +var ( + eksTestRunners []*test_runner.EKSTestRunner +) + +func getEksTestRunners(env *environment.MetaData) []*test_runner.EKSTestRunner { + if eksTestRunners == nil { + factory := dimension.GetDimensionFactory(*env) + + eksTestRunners = []*test_runner.EKSTestRunner{ + { + Runner: &NvidiaTestRunner{test_runner.BaseTestRunner{DimensionFactory: factory}, "EKS_GPU_NVIDIA", env}, + Env: *env, + }, + } + } + return eksTestRunners +} + +func (suite *GPUTestSuite) TestAllInSuite() { + env := environment.GetEnvironmentMetaData() + switch env.ComputeType { + case computetype.EKS: + log.Println("Environment compute type is EKS") + for _, testRunner := range getEksTestRunners(env) { + testRunner.Run(suite, env) + } + default: + return + } + + suite.Assert().Equal(status.SUCCESSFUL, suite.Result.GetStatus(), "GPU Container Test Suite Failed") +} + +func (suite *GPUTestSuite) AddToSuiteResult(r status.TestGroupResult) { + suite.Result.TestGroupResults = append(suite.Result.TestGroupResults, r) +} + +func TestGPUSuite(t *testing.T) { + suite.Run(t, new(GPUTestSuite)) +} diff --git a/test/gpu/nvidia_test.go b/test/gpu/nvidia_test.go new file mode 100644 index 000000000..ced990b36 --- /dev/null +++ b/test/gpu/nvidia_test.go @@ -0,0 +1,118 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT + +//go:build !windows + +package emf + +import ( + "time" + + "github.com/aws/amazon-cloudwatch-agent-test/environment" + "github.com/aws/amazon-cloudwatch-agent-test/test/metric" + "github.com/aws/amazon-cloudwatch-agent-test/test/status" + "github.com/aws/amazon-cloudwatch-agent-test/test/test_runner" +) + +const ( + gpuMetricIndicator = "_gpu_" + + containerMemTotal = "container_gpu_memory_total" + containerMemUsed = "container_gpu_memory_used" + containerPower = "container_gpu_power_draw" + containerTemp = "container_gpu_temperature" + containerUtil = "container_gpu_utilization" + containerMemUtil = "container_gpu_memory_utilization" + podMemTotal = "pod_gpu_memory_total" + podMemUsed = "pod_gpu_memory_used" + podPower = "pod_gpu_power_draw" + podTemp = "pod_gpu_temperature" + podUtil = "pod_gpu_utilization" + podMemUtil = "pod_gpu_memory_utilization" + nodeMemTotal = "node_gpu_memory_total" + nodeMemUsed = "node_gpu_memory_used" + nodePower = "node_gpu_power_draw" + nodeTemp = "node_gpu_temperature" + nodeUtil = "node_gpu_utilization" + nodeMemUtil = "node_gpu_memory_utilization" + nodeCountTotal = "node_gpu_total" + nodeCountRequest = "node_gpu_request" + nodeCountLimit = "node_gpu_limit" + clusterCountTotal = "cluster_gpu_total" + clusterCountRequest = "cluster_gpu_request" +) + +var expectedDimsToMetrics = map[string][]string{ + "ClusterName": { + containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil, + podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil, + //nodeCountTotal, nodeCountRequest, nodeCountLimit, + //clusterCountTotal, clusterCountRequest, + }, + "ClusterName-Namespace": { + podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + }, + //"ClusterName-Namespace-Service": { + // podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + //}, + "ClusterName-Namespace-PodName": { + podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + }, + "ClusterName-ContainerName-Namespace-PodName": { + containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil, + }, + "ClusterName-ContainerName-FullPodName-Namespace-PodName": { + containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil, + }, + "ClusterName-ContainerName-FullPodName-GpuDevice-Namespace-PodName": { + containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil, + }, + "ClusterName-FullPodName-Namespace-PodName": { + podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + }, + "ClusterName-FullPodName-GpuDevice-Namespace-PodName": { + podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil, + }, + "ClusterName-InstanceId-NodeName": { + nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil, + //nodeCountTotal, nodeCountRequest, nodeCountLimit, + }, + "ClusterName-GpuDevice-InstanceId-InstanceType-NodeName": { + nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil, + }, +} + +type NvidiaTestRunner struct { + test_runner.BaseTestRunner + testName string + env *environment.MetaData +} + +var _ test_runner.ITestRunner = (*NvidiaTestRunner)(nil) + +func (t *NvidiaTestRunner) Validate() status.TestGroupResult { + var testResults []status.TestResult + testResults = append(testResults, metric.ValidateMetrics(t.env, gpuMetricIndicator, expectedDimsToMetrics)...) 
+ testResults = append(testResults, metric.ValidateLogs(t.env)) + return status.TestGroupResult{ + Name: t.GetTestName(), + TestResults: testResults, + } +} + +func (t *NvidiaTestRunner) GetTestName() string { + return t.testName +} + +func (t *NvidiaTestRunner) GetAgentConfigFileName() string { + return "" +} + +func (t *NvidiaTestRunner) GetAgentRunDuration() time.Duration { + return 3 * time.Minute +} + +func (t *NvidiaTestRunner) GetMeasuredMetrics() []string { + return nil +} diff --git a/test/gpu/resources/config.json b/test/gpu/resources/config.json new file mode 100644 index 000000000..6f37e43ed --- /dev/null +++ b/test/gpu/resources/config.json @@ -0,0 +1,16 @@ +{ + "agent": { + "metrics_collection_interval": 15, + "run_as_user": "root", + "debug": true, + "logfile": "" + }, + "logs": { + "metrics_collected": { + "kubernetes": { + "enhanced_container_insights": true + } + }, + "force_flush_interval": 5 + } +} \ No newline at end of file diff --git a/test/gpu/resources/httpd-ssl.conf b/test/gpu/resources/httpd-ssl.conf new file mode 100644 index 000000000..8e441a2cd --- /dev/null +++ b/test/gpu/resources/httpd-ssl.conf @@ -0,0 +1,43 @@ +Listen 9400 + +SSLCipherSuite HIGH:MEDIUM:!MD5:!RC4:!3DES +SSLProxyCipherSuite HIGH:MEDIUM:!MD5:!RC4:!3DES + +SSLHonorCipherOrder on + +SSLProtocol all -SSLv3 +SSLProxyProtocol all -SSLv3 + +SSLPassPhraseDialog builtin + +SSLSessionCache "shmcb:/usr/local/apache2/logs/ssl_scache(512000)" +SSLSessionCacheTimeout 300 + + + + +DocumentRoot "/usr/local/apache2/htdocs" +ServerName dcgm-exporter-service.amazon-cloudwatch.svc:9400 +ServerAdmin you@example.com +ErrorLog /proc/self/fd/2 +TransferLog /proc/self/fd/1 + +SSLEngine on +SSLCertificateFile "/etc/amazon-cloudwatch-observability-dcgm-cert/server.crt" +SSLCertificateKeyFile "/etc/amazon-cloudwatch-observability-dcgm-cert/server.key" + + + SSLOptions +StdEnvVars + + + SSLOptions +StdEnvVars + + +BrowserMatch "MSIE [2-5]" \ + nokeepalive ssl-unclean-shutdown \ + downgrade-1.0 force-response-1.0 + +CustomLog /proc/self/fd/1 \ + "%t %h %%{SSL_PROTOCOL}x $%{SSL_CIPHER}x \"%r\" %b" + + \ No newline at end of file diff --git a/test/gpu/resources/httpd.conf b/test/gpu/resources/httpd.conf new file mode 100644 index 000000000..058db5063 --- /dev/null +++ b/test/gpu/resources/httpd.conf @@ -0,0 +1,101 @@ + +ServerRoot "/usr/local/apache2" + +#Listen 9400 + +LoadModule mpm_event_module modules/mod_mpm_event.so +LoadModule authn_file_module modules/mod_authn_file.so +LoadModule authn_core_module modules/mod_authn_core.so +LoadModule authz_host_module modules/mod_authz_host.so +LoadModule authz_groupfile_module modules/mod_authz_groupfile.so +LoadModule authz_user_module modules/mod_authz_user.so +LoadModule authz_core_module modules/mod_authz_core.so +LoadModule access_compat_module modules/mod_access_compat.so +LoadModule auth_basic_module modules/mod_auth_basic.so +LoadModule socache_shmcb_module modules/mod_socache_shmcb.so +LoadModule reqtimeout_module modules/mod_reqtimeout.so +LoadModule filter_module modules/mod_filter.so +LoadModule mime_module modules/mod_mime.so +LoadModule log_config_module modules/mod_log_config.so +LoadModule env_module modules/mod_env.so +LoadModule headers_module modules/mod_headers.so +LoadModule setenvif_module modules/mod_setenvif.so +LoadModule version_module modules/mod_version.so +LoadModule ssl_module modules/mod_ssl.so +LoadModule unixd_module modules/mod_unixd.so +LoadModule status_module modules/mod_status.so +LoadModule autoindex_module modules/mod_autoindex.so 
+LoadModule dir_module modules/mod_dir.so +LoadModule alias_module modules/mod_alias.so + + +User www-data +Group www-data + + + + AllowOverride none + Require all denied + + +DocumentRoot "/usr/local/apache2/htdocs" + + Options Indexes FollowSymLinks + AllowOverride None + Require all granted + + + + DirectoryIndex index.html + + + + Require all denied + + +ErrorLog /proc/self/fd/2 + +LogLevel warn + + + LogFormat "%h %l %u %t \"%r\" %>s %b \"%%{Referer}i\" \"%%{User-Agent}i\"" combined + LogFormat "%h %l %u %t \"%r\" %>s %b" common + + + # You need to enable mod_logio.c to use %I and %O + LogFormat "%h %l %u %t \"%r\" %>s %b \"%%{Referer}i\" \"%%{User-Agent}i\" %I %O" combinedio + + + CustomLog /proc/self/fd/1 common + + + + ScriptAlias /cgi-bin/ "/usr/local/apache2/cgi-bin/" + + + + AllowOverride None + Options None + Require all granted + + + + RequestHeader unset Proxy early + + + + TypesConfig conf/mime.types + AddType application/x-compress .Z + AddType application/x-gzip .gz .tgz + + + +Include conf/extra/proxy-html.conf + + +# Secure (SSL/TLS) connections +Include conf/extra/httpd-ssl.conf + +SSLRandomSeed startup builtin +SSLRandomSeed connect builtin + \ No newline at end of file diff --git a/test/metric/container_insights_util.go b/test/metric/container_insights_util.go new file mode 100644 index 000000000..140c839c1 --- /dev/null +++ b/test/metric/container_insights_util.go @@ -0,0 +1,222 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +//go:build !windows + +package metric + +import ( + "encoding/json" + "errors" + "fmt" + "log" + "math/rand" + "sort" + "strings" + "time" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types" + + "github.com/aws/amazon-cloudwatch-agent-test/environment" + "github.com/aws/amazon-cloudwatch-agent-test/test/metric_value_benchmark/eks_resources" + "github.com/aws/amazon-cloudwatch-agent-test/test/status" + "github.com/aws/amazon-cloudwatch-agent-test/util/awsservice" +) + +const ( + dimDelimiter = "-" + ContainerInsightsNamespace = "ContainerInsights" +) + +type dimToMetrics struct { + // dim keys as string with dimDelimiter(-) eg. ClusterName-Namespace + dimStr string + // metric names to their dimensions with values. Dimension sets will be used for metric data validations + metrics map[string][][]types.Dimension +} + +func ValidateMetrics(env *environment.MetaData, metricFilter string, expectedDimsToMetrics map[string][]string) []status.TestResult { + var results []status.TestResult + dimsToMetrics := getMetricsInClusterDimension(env, metricFilter) + for dims, metrics := range expectedDimsToMetrics { + var actual map[string][][]types.Dimension + for _, dtm := range dimsToMetrics { + if dtm.dimStr == dims { + actual = dtm.metrics + break + } + } + if len(actual) < 1 { + results = append(results, status.TestResult{ + Name: dims, + Status: status.FAILED, + }) + log.Printf("ValidateMetrics failed with missing dimension set: %s", dims) + // keep testing other dims or fail early? 
+ continue + } + results = append(results, validateMetricsAvailability(dims, metrics, actual)) + for _, m := range metrics { + // pick a random dimension set to test metric data OR test all dimension sets which might be overkill + randIdx := rand.Intn(len(actual[m])) + results = append(results, validateMetricValue(m, actual[m][randIdx])) + } + } + return results +} + +func getMetricsInClusterDimension(env *environment.MetaData, metricFilter string) []dimToMetrics { //map[string]map[string]interface{} { + listFetcher := Fetcher{} + log.Printf("Fetching by cluster dimension") + dims := []types.Dimension{ + { + Name: aws.String("ClusterName"), + Value: aws.String(env.EKSClusterName), + }, + } + metrics, err := listFetcher.Fetch(ContainerInsightsNamespace, "", dims) + if err != nil { + log.Println("failed to fetch metric list", err) + return nil + } + if len(metrics) < 1 { + log.Println("cloudwatch metric list is empty") + return nil + } + + var results []dimToMetrics + for _, m := range metrics { + // filter by metric name filter + if metricFilter != "" && !strings.Contains(*m.MetricName, metricFilter) { + continue + } + var dims []string + for _, d := range m.Dimensions { + dims = append(dims, *d.Name) + } + sort.Sort(sort.StringSlice(dims)) + dimsKey := strings.Join(dims, dimDelimiter) + log.Printf("processing dims: %s", dimsKey) + + var dtm dimToMetrics + for _, ele := range results { + if ele.dimStr == dimsKey { + dtm = ele + break + } + } + if dtm.dimStr == "" { + dtm = dimToMetrics{ + dimStr: dimsKey, + metrics: make(map[string][][]types.Dimension), + } + results = append(results, dtm) + } + dtm.metrics[*m.MetricName] = append(dtm.metrics[*m.MetricName], m.Dimensions) + } + return results +} + +func validateMetricsAvailability(dims string, expected []string, actual map[string][][]types.Dimension) status.TestResult { + testResult := status.TestResult{ + Name: dims, + Status: status.FAILED, + } + log.Printf("expected metrics: %d, actual metrics: %d", len(expected), len(actual)) + if compareMetrics(expected, actual) { + testResult.Status = status.SUCCESSFUL + } else { + log.Printf("validateMetricsAvailability failed for %s", dims) + } + return testResult +} + +func compareMetrics(expected []string, actual map[string][][]types.Dimension) bool { + if len(expected) != len(actual) { + return false + } + + for _, key := range expected { + if _, ok := actual[key]; !ok { + return false + } + } + return true +} + +func validateMetricValue(name string, dims []types.Dimension) status.TestResult { + log.Printf("validateMetricValue with metric: %s", name) + testResult := status.TestResult{ + Name: name, + Status: status.FAILED, + } + valueFetcher := MetricValueFetcher{} + values, err := valueFetcher.Fetch(ContainerInsightsNamespace, name, dims, SAMPLE_COUNT, MinuteStatPeriod) + if err != nil { + log.Println("failed to fetch metrics", err) + return testResult + } + + if !IsAllValuesGreaterThanOrEqualToExpectedValue(name, values, 0) { + return testResult + } + + testResult.Status = status.SUCCESSFUL + return testResult +} + +func ValidateLogs(env *environment.MetaData) status.TestResult { + testResult := status.TestResult{ + Name: "emf-logs", + Status: status.FAILED, + } + + end := time.Now() + start := end.Add(time.Duration(-3) * time.Minute) + group := fmt.Sprintf("/aws/containerinsights/%s/performance", env.EKSClusterName) + + // need to get the instances used for the EKS cluster + eKSInstances, err := awsservice.GetEKSInstances(env.EKSClusterName) + if err != nil { + log.Println("failed to get EKS 
instances", err) + return testResult + } + + for _, instance := range eKSInstances { + stream := *instance.InstanceName + err = awsservice.ValidateLogs( + group, + stream, + &start, + &end, + awsservice.AssertLogsNotEmpty(), + //awsservice.AssertNoDuplicateLogs(), + awsservice.AssertPerLog( + awsservice.AssertLogSchema(func(message string) (string, error) { + var eksClusterType awsservice.EKSClusterType + innerErr := json.Unmarshal([]byte(message), &eksClusterType) + if innerErr != nil { + return "", fmt.Errorf("failed to unmarshal log file: %w", innerErr) + } + + //log.Printf("eksClusterType is: %s", eksClusterType.Type) + jsonSchema, ok := eks_resources.EksClusterValidationMap[eksClusterType.Type] + if !ok { + return "", errors.New("invalid cluster type provided") + } + return jsonSchema, nil + }), + awsservice.AssertLogContainsSubstring(fmt.Sprintf("\"ClusterName\":\"%s\"", env.EKSClusterName)), + ), + ) + + if err != nil { + log.Printf("log validation (%s/%s) failed: %v", group, stream, err) + return testResult + } + } + + testResult.Status = status.SUCCESSFUL + return testResult +} diff --git a/test/metric/metric_list_query.go b/test/metric/metric_list_query.go index 8a3804efe..afe07f6c1 100644 --- a/test/metric/metric_list_query.go +++ b/test/metric/metric_list_query.go @@ -17,10 +17,10 @@ import ( "github.com/aws/amazon-cloudwatch-agent-test/util/awsservice" ) -type MetricListFetcher struct { +type Fetcher struct { } -func (n *MetricListFetcher) Fetch(namespace, metricName string, dimensions []types.Dimension) ([]types.Metric, error) { +func (n *Fetcher) Fetch(namespace, metricName string, dimensions []types.Dimension) ([]types.Metric, error) { var dims []types.DimensionFilter for _, dim := range dimensions { dims = append(dims, types.DimensionFilter{ @@ -56,29 +56,3 @@ func (n *MetricListFetcher) Fetch(namespace, metricName string, dimensions []typ log.Printf("total number of metrics fetched: %v", len(metrics)) return metrics, nil } - -func (n *MetricListFetcher) FetchByDimension(namespace string, dimensions []types.Dimension) ([]types.Metric, error) { - var dims []types.DimensionFilter - for _, dim := range dimensions { - dims = append(dims, types.DimensionFilter{ - Name: dim.Name, - Value: dim.Value, - }) - } - - listMetricInput := cloudwatch.ListMetricsInput{ - Namespace: aws.String(namespace), - Dimensions: dims, - } - - log.Printf("Metric data input: namespace %v, dimensions %v", namespace, fmt.Sprint(&dims)) - - output, err := awsservice.CwmClient.ListMetrics(context.Background(), &listMetricInput) - if err != nil { - return nil, fmt.Errorf("Error getting metric data %v", err) - } - - log.Printf("Metrics fetched : %v", output.Metrics) - - return output.Metrics, nil -} diff --git a/test/metric/stat.go b/test/metric/stat.go index d633985d3..a6dc5d25e 100644 --- a/test/metric/stat.go +++ b/test/metric/stat.go @@ -13,6 +13,5 @@ const ( MAXUMUM Statistics = "Maxmimum" SUM Statistics = "Sum" HighResolutionStatPeriod = 10 - - MinuteStatPeriod = 60 + MinuteStatPeriod = 60 ) diff --git a/test/metric_value_benchmark/eks_daemonset_test.go b/test/metric_value_benchmark/eks_daemonset_test.go index 9572bd8bd..d7eb996f2 100644 --- a/test/metric_value_benchmark/eks_daemonset_test.go +++ b/test/metric_value_benchmark/eks_daemonset_test.go @@ -6,28 +6,22 @@ package metric_value_benchmark import ( - "encoding/json" - "errors" - "fmt" "log" - "math/rand" - "sort" - "strings" "time" - "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types" + 
"golang.org/x/exp/slices" "github.com/aws/amazon-cloudwatch-agent-test/environment" "github.com/aws/amazon-cloudwatch-agent-test/test/metric" + "github.com/aws/amazon-cloudwatch-agent-test/test/metric/dimension" "github.com/aws/amazon-cloudwatch-agent-test/test/metric_value_benchmark/eks_resources" "github.com/aws/amazon-cloudwatch-agent-test/test/status" "github.com/aws/amazon-cloudwatch-agent-test/test/test_runner" - "github.com/aws/amazon-cloudwatch-agent-test/util/awsservice" ) -const containerInsightsNamespace = "ContainerInsights" -const gpuMetricIndicator = "_gpu_" +// list of metrics with more dimensions e.g. PodName and Namespace +var metricsWithMoreDimensions = []string{"pod_number_of_container_restarts"} type EKSDaemonTestRunner struct { test_runner.BaseTestRunner @@ -37,151 +31,66 @@ type EKSDaemonTestRunner struct { func (e *EKSDaemonTestRunner) Validate() status.TestGroupResult { var testResults []status.TestResult - testResults = append(testResults, validateMetrics(e.env, gpuMetricIndicator, eks_resources.GetExpectedDimsToMetrics(e.env))...) - testResults = append(testResults, e.validateLogs(e.env)) + testResults = append(testResults, metric.ValidateMetrics(e.env, "", eks_resources.GetExpectedDimsToMetrics(e.env))...) + metrics := e.GetMeasuredMetrics() + for _, name := range metrics { + testResults = append(testResults, e.validateInstanceMetrics(name)) + } + testResults = append(testResults, metric.ValidateLogs(e.env)) return status.TestGroupResult{ Name: e.GetTestName(), TestResults: testResults, } } -const ( - dimDelimiter = "-" - ContainerInsightsNamespace = "ContainerInsights" -) - -type dimToMetrics struct { - // dim keys as string with dimDelimiter(-) eg. ClusterName-Namespace - dimStr string - // metric names to their dimensions with values. Dimension sets will be used for metric data validations - metrics map[string][][]types.Dimension -} - -func validateMetrics(env *environment.MetaData, metricFilter string, expectedDimsToMetrics map[string][]string) []status.TestResult { - var results []status.TestResult - dimsToMetrics := getMetricsInClusterDimension(env, metricFilter) - //loops through each dimension set and checks if they exit in the cluster(fails if it doesn't) - for dims, metrics := range expectedDimsToMetrics { - var actual map[string][][]types.Dimension - //looping through dtms until we find the dimension string equal to the one in the hard coded map - for _, dtm := range dimsToMetrics { - log.Printf("dtm: %s vs dims %s", dtm.dimStr, dims) //testing purposes - if dtm.dimStr == dims { - actual = dtm.metrics - break - } - } - //if there are no metrics for the dimension set, we fail the test - if len(actual) < 1 { - results = append(results, status.TestResult{ - Name: dims, - Status: status.FAILED, - }) - log.Printf("ValidateMetrics failed with missing dimension set: %s", dims) - // keep testing other dims or fail early? 
- continue - } - //verifies length of metrics for dimension set - results = append(results, validateMetricsAvailability(dims, metrics, actual)) - for _, m := range metrics { - // picking a random dimension set to test metric data so we don't have to test every dimension set - randIdx := rand.Intn(len(actual[m])) - //verifys values of metrics - results = append(results, validateMetricValue(m, actual[m][randIdx])) - } +func (e *EKSDaemonTestRunner) validateInstanceMetrics(name string) status.TestResult { + testResult := status.TestResult{ + Name: name, + Status: status.FAILED, } - return results -} -// Fetches all metrics in cluster -func getMetricsInClusterDimension(env *environment.MetaData, metricFilter string) []dimToMetrics { //map[string]map[string]interface{} { - listFetcher := metric.MetricListFetcher{} - log.Printf("Fetching by cluster dimension") - dims := []types.Dimension{ + dims, failed := e.DimensionFactory.GetDimensions([]dimension.Instruction{ { - Name: aws.String("ClusterName"), - Value: aws.String(env.EKSClusterName), + Key: "ClusterName", + Value: dimension.UnknownDimensionValue(), }, - } - metrics, err := listFetcher.Fetch(ContainerInsightsNamespace, "", dims) - if err != nil { - log.Println("failed to fetch metric list", err) - return nil - } - if len(metrics) < 1 { - log.Println("cloudwatch metric list is empty") - return nil + }) + if len(failed) > 0 { + log.Println("failed to get dimensions") + return testResult } - var results []dimToMetrics - for _, m := range metrics { - // filter by metric name filter(skip gpu validation) - if metricFilter != "" && strings.Contains(*m.MetricName, metricFilter) { - continue - } - var dims []string - for _, d := range m.Dimensions { - dims = append(dims, *d.Name) + // get list of metrics that has more dimensions for container insights + // this is to avoid adding more dimension provider for non-trivial dimensions e.g. PodName + listFetcher := metric.Fetcher{} + if slices.Contains(metricsWithMoreDimensions, name) { + metrics, err := listFetcher.Fetch(metric.ContainerInsightsNamespace, name, dims) + if err != nil { + log.Println("failed to fetch metric list", err) + return testResult } - sort.Sort(sort.StringSlice(dims)) //what's the point of sorting? 
- dimsKey := strings.Join(dims, dimDelimiter) - log.Printf("processing dims: %s", dimsKey) - var dtm dimToMetrics - for _, ele := range results { - if ele.dimStr == dimsKey { - dtm = ele - break - } - } - if dtm.dimStr == "" { - dtm = dimToMetrics{ - dimStr: dimsKey, - metrics: make(map[string][][]types.Dimension), - } - results = append(results, dtm) + if len(metrics) < 1 { + log.Println("metric list is empty") + return testResult } - dtm.metrics[*m.MetricName] = append(dtm.metrics[*m.MetricName], m.Dimensions) - } - return results -} - -// Check if all metrics from cluster matches hard coded map -func validateMetricsAvailability(dims string, expected []string, actual map[string][][]types.Dimension) status.TestResult { - testResult := status.TestResult{ - Name: dims, - Status: status.FAILED, - } - log.Printf("expected metrics: %d, actual metrics: %d", len(expected), len(actual)) - if compareMetrics(expected, actual) { - testResult.Status = status.SUCCESSFUL - } else { - log.Printf("validateMetricsAvailability failed for %s", dims) - } - return testResult -} -func compareMetrics(expected []string, actual map[string][][]types.Dimension) bool { - if len(expected) != len(actual) { - return false - } + // just verify 1 of returned metrics for values + for _, dim := range metrics[0].Dimensions { + // skip since it's provided by dimension provider + if *dim.Name == "ClusterName" { + continue + } - for _, key := range expected { - if _, ok := actual[key]; !ok { - return false + dims = append(dims, types.Dimension{ + Name: dim.Name, + Value: dim.Value, + }) } } - return true -} -func validateMetricValue(name string, dims []types.Dimension) status.TestResult { - log.Printf("validateMetricValue with metric: %s", name) - testResult := status.TestResult{ - Name: name, - Status: status.FAILED, - } valueFetcher := metric.MetricValueFetcher{} - values, err := valueFetcher.Fetch(containerInsightsNamespace, name, dims, metric.SAMPLE_COUNT, metric.MinuteStatPeriod) + values, err := valueFetcher.Fetch(metric.ContainerInsightsNamespace, name, dims, metric.AVERAGE, metric.HighResolutionStatPeriod) if err != nil { log.Println("failed to fetch metrics", err) return testResult @@ -195,60 +104,6 @@ func validateMetricValue(name string, dims []types.Dimension) status.TestResult return testResult } -func (e *EKSDaemonTestRunner) validateLogs(env *environment.MetaData) status.TestResult { - testResult := status.TestResult{ - Name: "emf-logs", - Status: status.FAILED, - } - - now := time.Now() - group := fmt.Sprintf("/aws/containerinsights/%s/performance", env.EKSClusterName) - - // need to get the instances used for the EKS cluster - eKSInstances, err := awsservice.GetEKSInstances(env.EKSClusterName) - if err != nil { - log.Println("failed to get EKS instances", err) - return testResult - } - - for _, instance := range eKSInstances { - stream := *instance.InstanceName - err = awsservice.ValidateLogs( - group, - stream, - nil, - &now, - awsservice.AssertLogsNotEmpty(), - awsservice.AssertNoDuplicateLogs(), - awsservice.AssertPerLog( - awsservice.AssertLogSchema(func(message string) (string, error) { - var eksClusterType awsservice.EKSClusterType - innerErr := json.Unmarshal([]byte(message), &eksClusterType) - if innerErr != nil { - return "", fmt.Errorf("failed to unmarshal log file: %w", innerErr) - } - - log.Printf("eksClusterType is: %s", eksClusterType.Type) - jsonSchema, ok := eks_resources.EksClusterValidationMap[eksClusterType.Type] - if !ok { - return "", errors.New("invalid cluster type provided") - } - 
return jsonSchema, nil - }), - awsservice.AssertLogContainsSubstring(fmt.Sprintf("\"ClusterName\":\"%s\"", env.EKSClusterName)), - ), - ) - - if err != nil { - log.Printf("log validation (%s/%s) failed: %v", group, stream, err) - return testResult - } - } - - testResult.Status = status.SUCCESSFUL - return testResult -} - func (e *EKSDaemonTestRunner) GetTestName() string { return "EKSContainerInstance" } diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/cluster_gpu.json b/test/metric_value_benchmark/eks_resources/test_schemas/cluster_gpu.json new file mode 100644 index 000000000..5b14e3fb1 --- /dev/null +++ b/test/metric_value_benchmark/eks_resources/test_schemas/cluster_gpu.json @@ -0,0 +1,21 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "structured log schema", + "description": "json schema for the cloudwatch agent k8s structured log", + "type": "object", + "properties": { + "CloudWatchMetrics": {}, + "ClusterName": {}, + "Timestamp": {}, + "Type": {}, + "Version": {}, + "cluster_gpu_total": {}, + "cluster_gpu_request": {}, + }, + "required": [ + "ClusterName", + "Type", + "Version", + "CloudWatchMetrics" + ] +} \ No newline at end of file diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/container_gpu.json b/test/metric_value_benchmark/eks_resources/test_schemas/container_gpu.json new file mode 100644 index 000000000..99c56e87f --- /dev/null +++ b/test/metric_value_benchmark/eks_resources/test_schemas/container_gpu.json @@ -0,0 +1,45 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "structured log schema", + "description": "json schema for the cloudwatch agent k8s structured log", + "type": "object", + "properties": { + "CloudWatchMetrics": {}, + "ClusterName": {}, + "ContainerName": {}, + "FullPodName": {}, + "GpuDevice": {}, + "Hostname": {}, + "InstanceId": {}, + "K8sPodName": {}, + "Namespace": {}, + "NodeName": {}, + "OTelLib": {}, + "PodName": {}, + "Timestamp": {}, + "Type": {}, + "UUID": {}, + "Version": {}, + "container_gpu_memory_total": {}, + "container_gpu_memory_used": {}, + "container_gpu_power_draw": {}, + "container_gpu_temperature": {}, + "container_gpu_utilization": {}, + "container_gpu_memory_utilization": {}, + "Service":{} + }, + "required": [ + "ClusterName", + "ContainerName", + "FullPodName", + "GpuDevice", + "InstanceId", + "Namespace", + "NodeName", + "PodName", + "Timestamp", + "Type", + "Version", + "CloudWatchMetrics" + ] +} \ No newline at end of file diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/node_gpu.json b/test/metric_value_benchmark/eks_resources/test_schemas/node_gpu.json new file mode 100644 index 000000000..85df0952b --- /dev/null +++ b/test/metric_value_benchmark/eks_resources/test_schemas/node_gpu.json @@ -0,0 +1,44 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "structured log schema", + "description": "json schema for the cloudwatch agent k8s structured log", + "type": "object", + "properties": { + "CloudWatchMetrics": {}, + "ClusterName": {}, + "ContainerName": {}, + "FullPodName": {}, + "GpuDevice": {}, + "Hostname": {}, + "InstanceId": {}, + "K8sPodName": {}, + "Namespace": {}, + "NodeName": {}, + "OTelLib": {}, + "PodName": {}, + "Timestamp": {}, + "Type": {}, + "UUID": {}, + "Version": {}, + "node_gpu_memory_total": {}, + "node_gpu_memory_used": {}, + "node_gpu_power_draw": {}, + "node_gpu_temperature": {}, + "node_gpu_utilization": {}, + "node_gpu_memory_utilization": {}, + "node_gpu_total": {}, + 
"node_gpu_request": {}, + "node_gpu_list": {}, + "Service":{} + }, + "required": [ + "ClusterName", + "GpuDevice", + "InstanceId", + "NodeName", + "Timestamp", + "Type", + "Version", + "CloudWatchMetrics" + ] +} \ No newline at end of file diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/pod_gpu.json b/test/metric_value_benchmark/eks_resources/test_schemas/pod_gpu.json new file mode 100644 index 000000000..4b532094f --- /dev/null +++ b/test/metric_value_benchmark/eks_resources/test_schemas/pod_gpu.json @@ -0,0 +1,47 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "structured log schema", + "description": "json schema for the cloudwatch agent k8s structured log", + "type": "object", + "properties": { + "CloudWatchMetrics": {}, + "ClusterName": {}, + "ContainerName": {}, + "FullPodName": {}, + "GpuDevice": {}, + "Hostname": {}, + "InstanceId": {}, + "K8sPodName": {}, + "Namespace": {}, + "NodeName": {}, + "OTelLib": {}, + "PodName": {}, + "Timestamp": {}, + "Type": {}, + "UUID": {}, + "Version": {}, + "pod_gpu_memory_total": {}, + "pod_gpu_memory_used": {}, + "pod_gpu_power_draw": {}, + "pod_gpu_temperature": {}, + "pod_gpu_utilization": {}, + "pod_gpu_memory_utilization": {}, + "pod_gpu_total": {}, + "pod_gpu_request": {}, + "pod_gpu_list": {}, + "Service":{} + }, + "required": [ + "ClusterName", + "FullPodName", + "GpuDevice", + "InstanceId", + "Namespace", + "NodeName", + "PodName", + "Timestamp", + "Type", + "Version", + "CloudWatchMetrics" + ] +} \ No newline at end of file diff --git a/test/metric_value_benchmark/eks_resources/util.go b/test/metric_value_benchmark/eks_resources/util.go index 96e832aac..d398b81e1 100644 --- a/test/metric_value_benchmark/eks_resources/util.go +++ b/test/metric_value_benchmark/eks_resources/util.go @@ -38,6 +38,14 @@ var ( eksPodSchema string //go:embed test_schemas/pod_net.json eksPodNetSchema string + //go:embed test_schemas/container_gpu.json + eksContainerGpuSchema string + //go:embed test_schemas/pod_gpu.json + eksPodGpuSchema string + //go:embed test_schemas/node_gpu.json + eksNodeGpuSchema string + //go:embed test_schemas/cluster_gpu.json + eksClusterGpuSchema string EksClusterValidationMap = map[string]string{ "Cluster": eksClusterSchema, @@ -54,6 +62,10 @@ var ( "NodeNet": eksNodeNetSchema, "Pod": eksPodSchema, "PodNet": eksPodNetSchema, + "ContainerGPU": eksContainerGpuSchema, + "PodGPU": eksPodGpuSchema, + "NodeGPU": eksNodeGpuSchema, + "ClusterGPU": eksClusterGpuSchema, } ) diff --git a/util/awsservice/cloudwatchmetrics.go b/util/awsservice/cloudwatchmetrics.go index c2fab7cac..59ef886b2 100644 --- a/util/awsservice/cloudwatchmetrics.go +++ b/util/awsservice/cloudwatchmetrics.go @@ -56,7 +56,7 @@ func ValidateMetric(metricName, namespace string, dimensionsFilter []types.Dimen return nil } -// ValidateMetrics takes the metric name, metric dimension and corresponding namespace that contains the metric +// ValidateMetricWithTest takes the metric name, metric dimension and corresponding namespace that contains the metric func ValidateMetricWithTest(t *testing.T, metricName, namespace string, dimensionsFilter []types.DimensionFilter, retries int, retryTime time.Duration) { var err error for i := 0; i < retries; i++ {