From f7c5fd3cdf59ac6e413bea016467cf82ae3d407c Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim <884273+movence@users.noreply.github.com> Date: Tue, 19 Nov 2024 14:22:21 -0500 Subject: [PATCH] Add integration test for EFA (#431) --- generator/test_case_generator.go | 4 + terraform/eks/daemon/efa/main.tf | 530 ++++++++++++++++++ terraform/eks/daemon/efa/providers.tf | 30 + terraform/eks/daemon/efa/variables.tf | 38 ++ test/efa/efa_test.go | 170 ++++++ test/efa/resources/config.json | 16 + .../test_schemas/container_efa.json | 35 ++ .../eks_resources/test_schemas/node_efa.json | 35 ++ .../eks_resources/test_schemas/pod_efa.json | 35 ++ .../eks_resources/util.go | 9 + 10 files changed, 902 insertions(+) create mode 100644 terraform/eks/daemon/efa/main.tf create mode 100644 terraform/eks/daemon/efa/providers.tf create mode 100644 terraform/eks/daemon/efa/variables.tf create mode 100644 test/efa/efa_test.go create mode 100644 test/efa/resources/config.json create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/container_efa.json create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/node_efa.json create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/pod_efa.json diff --git a/generator/test_case_generator.go b/generator/test_case_generator.go index 8e2a67f9f..a811e7c5c 100644 --- a/generator/test_case_generator.go +++ b/generator/test_case_generator.go @@ -235,6 +235,10 @@ var testTypeToTestConfig = map[string][]testConfig{ testDir: "./test/entity", terraformDir: "terraform/eks/daemon/entity", targets: map[string]map[string]struct{}{"arc": {"amd64": {}}}, }, + { + testDir: "./test/efa", terraformDir: "terraform/eks/daemon/efa", + targets: map[string]map[string]struct{}{"arc": {"amd64": {}}}, + }, }, "eks_deployment": { {testDir: "./test/metric_value_benchmark"}, diff --git a/terraform/eks/daemon/efa/main.tf b/terraform/eks/daemon/efa/main.tf new file mode 100644 index 000000000..38bdfd306 --- /dev/null +++ 
b/terraform/eks/daemon/efa/main.tf @@ -0,0 +1,530 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +module "common" { + source = "../../../common" + cwagent_image_repo = var.cwagent_image_repo + cwagent_image_tag = var.cwagent_image_tag +} + +module "basic_components" { + source = "../../../basic_components" + + region = var.region +} + +data "aws_eks_cluster_auth" "this" { + name = aws_eks_cluster.this.name +} + +resource "aws_eks_cluster" "this" { + name = "cwagent-eks-integ-${module.common.testing_id}" + role_arn = module.basic_components.role_arn + version = var.k8s_version + enabled_cluster_log_types = [ + "api", + "audit", + "authenticator", + "controllerManager", + "scheduler" + ] + vpc_config { + subnet_ids = module.basic_components.public_subnet_ids + security_group_ids = [module.basic_components.security_group] + } +} + +# EKS Node Groups +resource "aws_eks_node_group" "this" { + cluster_name = aws_eks_cluster.this.name + node_group_name = "cwagent-eks-integ-node" + node_role_arn = aws_iam_role.node_role.arn + subnet_ids = module.basic_components.public_subnet_ids + + scaling_config { + desired_size = 1 + max_size = 1 + min_size = 1 + } + + ami_type = var.ami_type + capacity_type = "ON_DEMAND" + disk_size = 20 + instance_types = [var.instance_type] + tags = { + Owner = "default" + "kubernetes.io/cluster/${aws_eks_cluster.this.name}" = "owned" + } + + labels = { + "vpc.amazonaws.com/efa.present" = "true" + "nvidia.com/gpu.present" = "true" + } + + depends_on = [ + aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly, + aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy, + aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy, + aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy + ] +} + +# EKS Node IAM Role +resource "aws_iam_role" "node_role" { + name = "cwagent-eks-Worker-Role-${module.common.testing_id}" + assume_role_policy = jsonencode({ + Version 
= "2012-10-17", + Statement = [ + { + Effect = "Allow", + Principal = { + Service = "ec2.amazonaws.com" + }, + Action = "sts:AssumeRole" + } + ] + }) + +} + +resource "aws_iam_role_policy_attachment" "node_AmazonEKSWorkerNodePolicy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" + role = aws_iam_role.node_role.name +} + +resource "aws_iam_role_policy_attachment" "node_AmazonEKS_CNI_Policy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy" + role = aws_iam_role.node_role.name +} + +resource "aws_iam_role_policy_attachment" "node_AmazonEC2ContainerRegistryReadOnly" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" + role = aws_iam_role.node_role.name +} + +resource "aws_iam_role_policy_attachment" "node_CloudWatchAgentServerPolicy" { + policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy" + role = aws_iam_role.node_role.name +} + +# TODO: these security groups should be created once and then reused +# EKS Cluster Security Group +resource "aws_security_group" "eks_cluster_sg" { + name = "cwagent-eks-cluster-sg-${module.common.testing_id}" + description = "Cluster communication with worker nodes" + vpc_id = module.basic_components.vpc_id +} + +resource "aws_security_group_rule" "cluster_inbound" { + description = "Allow worker nodes to communicate with the cluster API Server" + from_port = 443 + protocol = "tcp" + security_group_id = aws_security_group.eks_cluster_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 443 + type = "ingress" +} + +resource "aws_security_group_rule" "cluster_outbound" { + description = "Allow cluster API Server to communicate with the worker nodes" + from_port = 1024 + protocol = "tcp" + security_group_id = aws_security_group.eks_cluster_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 65535 + type = "egress" +} + + +# EKS Node Security Group +resource "aws_security_group" "eks_nodes_sg" { + name = 
"cwagent-eks-node-sg-${module.common.testing_id}" + description = "Security group for all nodes in the cluster" + vpc_id = module.basic_components.vpc_id + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_security_group_rule" "nodes_internal" { + description = "Allow nodes to communicate with each other" + from_port = 0 + protocol = "-1" + security_group_id = aws_security_group.eks_nodes_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 65535 + type = "ingress" +} + +resource "aws_security_group_rule" "nodes_cluster_inbound" { + description = "Allow worker Kubelets and pods to receive communication from the cluster control plane" + from_port = 1025 + protocol = "tcp" + security_group_id = aws_security_group.eks_nodes_sg.id + source_security_group_id = aws_security_group.eks_cluster_sg.id + to_port = 65535 + type = "ingress" +} + +resource "kubernetes_namespace" "namespace" { + metadata { + name = "amazon-cloudwatch" + } +} + +resource "helm_release" "efa_plugin" { + depends_on = [ + kubernetes_namespace.namespace, + aws_eks_node_group.this, + ] + name = "aws-efa-k8s-device-plugin" + chart = "aws-efa-k8s-device-plugin" + repository = "https://aws.github.io/eks-charts" + namespace = "amazon-cloudwatch" + create_namespace = true + wait = true + values = [ + <<-EOT + tolerations: + - operator: Exists + EOT + ] +} +resource "helm_release" "nvidia_device_plugin" { + depends_on = [ + kubernetes_namespace.namespace, + aws_eks_node_group.this, + ] + name = "nvidia-device-plugin" + repository = "https://nvidia.github.io/k8s-device-plugin" + chart = "nvidia-device-plugin" + version = "0.17.0" + namespace = "amazon-cloudwatch" + create_namespace = true + wait = true + values = [ + <<-EOT + tolerations: + - operator: Exists + EOT + ] +} + +resource "kubernetes_daemonset" "service" { + depends_on = [ + kubernetes_namespace.namespace, + kubernetes_service_account.cwagentservice, + 
helm_release.efa_plugin, + helm_release.nvidia_device_plugin + ] + metadata { + name = "cloudwatch-agent" + namespace = "amazon-cloudwatch" + } + spec { + selector { + match_labels = { + "name" : "cloudwatch-agent" + } + } + template { + metadata { + labels = { + "name" : "cloudwatch-agent" + } + } + spec { + node_selector = { + "kubernetes.io/os" : "linux" + } + toleration { + operator = "Exists" + } + init_container { + name = "efainit" + image = "ubuntu:latest" + command = ["/bin/bash", "-c"] + args = [ + # creates fake EFA counter files at the location the EFA receiver expects: infiniband/{deviceName}/ports/{port}/hw_counters + # also chmods the infiniband folder to 755 because the receiver expects the location to be writable only by root (as with sysfs) + "cd /work-dir && mkdir -p infiniband/device1/ports/1/hw_counters && echo 10 > infiniband/device1/ports/1/hw_counters/rdma_read_bytes && echo 20 > infiniband/device1/ports/1/hw_counters/rdma_write_bytes && echo 30 > infiniband/device1/ports/1/hw_counters/rdma_write_recv_bytes && echo 100 > infiniband/device1/ports/1/hw_counters/rx_bytes && echo 200 > infiniband/device1/ports/1/hw_counters/rx_drops && echo 300 > infiniband/device1/ports/1/hw_counters/tx_bytes && chmod 755 infiniband" + ] + volume_mount { + name = "sysefa" + mount_path = "/work-dir" + } + } + container { + name = "cwagent" + image = "${var.cwagent_image_repo}:${var.cwagent_image_tag}" + image_pull_policy = "Always" + security_context { + privileged = true + } + resources { + limits = { + "cpu" : "200m", + "memory" : "200Mi" + } + requests = { + "cpu" : "200m", + "memory" : "200Mi" + } + } + port { + container_port = 25888 + host_port = 25888 + protocol = "UDP" + } + env { + name = "HOST_IP" + value_from { + field_ref { + field_path = "status.hostIP" + } + } + } + env { + name = "HOST_NAME" + value_from { + field_ref { + field_path = "spec.nodeName" + } + } + } + env { + name = "K8S_NAMESPACE" + value_from { + field_ref { + field_path = "metadata.namespace" + } + } + 
} + volume_mount { + mount_path = "/etc/cwagentconfig" + name = "cwagentconfig" + } + volume_mount { + mount_path = "/rootfs" + name = "rootfs" + read_only = true + } + volume_mount { + mount_path = "/var/run/docker.sock" + name = "dockersock" + read_only = true + } + volume_mount { + mount_path = "/var/lib/docker" + name = "varlibdocker" + read_only = true + } + volume_mount { + mount_path = "/run/containerd/containerd.sock" + name = "containerdsock" + read_only = true + } + volume_mount { + mount_path = "/sys" + name = "sys" + read_only = true + } + volume_mount { + mount_path = "/dev/disk" + name = "devdisk" + read_only = true + } + volume_mount { + mount_path = "/var/lib/kubelet/pod-resources" + name = "kubelet-podresources" + read_only = true + } + volume_mount { + mount_path = "/sys/class" + name = "sysefa" + } + } + volume { + name = "cwagentconfig" + config_map { + name = "cwagentconfig" + } + } + volume { + name = "rootfs" + host_path { + path = "/" + } + } + volume { + name = "dockersock" + host_path { + path = "/var/run/docker.sock" + } + } + volume { + name = "varlibdocker" + host_path { + path = "/var/lib/docker" + } + } + volume { + name = "containerdsock" + host_path { + path = "/run/containerd/containerd.sock" + } + } + volume { + name = "sys" + host_path { + path = "/sys" + } + } + volume { + name = "devdisk" + host_path { + path = "/dev/disk" + } + } + volume { + name = "kubelet-podresources" + host_path { + path = "/var/lib/kubelet/pod-resources" + } + } + volume { + name = "sysefa" + empty_dir {} + } + service_account_name = "cloudwatch-agent" + termination_grace_period_seconds = 60 + host_network = true + dns_policy = "ClusterFirstWithHostNet" + } + } + } +} + +########################################## +# Template Files +########################################## +locals { + cwagent_config = fileexists("../../../../${var.test_dir}/resources/config.json") ? 
"../../../../${var.test_dir}/resources/config.json" : "../default_resources/default_amazon_cloudwatch_agent.json" +} + +data "template_file" "cwagent_config" { + template = file(local.cwagent_config) + vars = { + } +} + +resource "kubernetes_config_map" "cwagentconfig" { + depends_on = [ + kubernetes_namespace.namespace, + kubernetes_service_account.cwagentservice + ] + metadata { + name = "cwagentconfig" + namespace = "amazon-cloudwatch" + } + data = { + "cwagentconfig.json" : data.template_file.cwagent_config.rendered + } +} + +resource "kubernetes_service_account" "cwagentservice" { + depends_on = [kubernetes_namespace.namespace] + metadata { + name = "cloudwatch-agent" + namespace = "amazon-cloudwatch" + } +} + +resource "kubernetes_cluster_role" "clusterrole" { + depends_on = [kubernetes_namespace.namespace] + metadata { + name = "cloudwatch-agent-role" + } + rule { + verbs = ["get", "list", "watch"] + resources = ["pods", "pods/logs", "nodes", "nodes/proxy", "namespaces", "endpoints"] + api_groups = [""] + } + rule { + verbs = ["list", "watch"] + resources = ["replicasets"] + api_groups = ["apps"] + } + rule { + verbs = ["list", "watch"] + resources = ["jobs"] + api_groups = ["batch"] + } + rule { + verbs = ["get"] + resources = ["nodes/proxy"] + api_groups = [""] + } + rule { + verbs = ["create"] + resources = ["nodes/stats", "configmaps", "events"] + api_groups = [""] + } + rule { + verbs = ["get", "update"] + resource_names = ["cwagent-clusterleader"] + resources = ["configmaps"] + api_groups = [""] + } + rule { + verbs = ["list", "watch"] + resources = ["services"] + api_groups = [""] + } + rule { + non_resource_urls = ["/metrics"] + verbs = ["get", "list", "watch"] + } +} + +resource "kubernetes_cluster_role_binding" "rolebinding" { + depends_on = [kubernetes_namespace.namespace] + metadata { + name = "cloudwatch-agent-role-binding" + } + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = "cloudwatch-agent-role" + } + 
subject { + kind = "ServiceAccount" + name = "cloudwatch-agent" + namespace = "amazon-cloudwatch" + } +} +resource "null_resource" "validator" { + depends_on = [ + aws_eks_node_group.this, + kubernetes_daemonset.service, + kubernetes_cluster_role_binding.rolebinding, + kubernetes_service_account.cwagentservice, + ] + + provisioner "local-exec" { + command = <<-EOT + cd ../../../.. + i=0 + while [ $i -lt 10 ]; do + i=$((i+1)) + go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON && exit 0 + sleep 60 + done + exit 1 + EOT + } +} \ No newline at end of file diff --git a/terraform/eks/daemon/efa/providers.tf b/terraform/eks/daemon/efa/providers.tf new file mode 100644 index 000000000..bfcefae96 --- /dev/null +++ b/terraform/eks/daemon/efa/providers.tf @@ -0,0 +1,30 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +provider "aws" { + region = var.region +} + +provider "kubernetes" { + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + args = ["eks", "get-token", "--cluster-name", aws_eks_cluster.this.name] + } + host = aws_eks_cluster.this.endpoint + cluster_ca_certificate = base64decode(aws_eks_cluster.this.certificate_authority.0.data) + token = data.aws_eks_cluster_auth.this.token +} + +provider "helm" { + kubernetes { + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + args = ["eks", "get-token", "--cluster-name", aws_eks_cluster.this.name] + } + host = aws_eks_cluster.this.endpoint + cluster_ca_certificate = base64decode(aws_eks_cluster.this.certificate_authority.0.data) + token = data.aws_eks_cluster_auth.this.token + } +} \ No newline at end of file diff --git a/terraform/eks/daemon/efa/variables.tf b/terraform/eks/daemon/efa/variables.tf new file mode 100644 index 000000000..39678c3fc --- /dev/null +++ b/terraform/eks/daemon/efa/variables.tf @@ -0,0 +1,38 @@ +// Copyright 
Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +variable "region" { + type = string + default = "us-west-2" +} + +variable "test_dir" { + type = string + default = "./test/efa" +} + +variable "cwagent_image_repo" { + type = string + default = "public.ecr.aws/cloudwatch-agent/cloudwatch-agent" +} + +variable "cwagent_image_tag" { + type = string + default = "latest" +} + +variable "k8s_version" { + type = string + default = "1.30" +} + +variable "ami_type" { + type = string + default = "AL2023_x86_64_NVIDIA" +} + +# NCCL only works on certain P instance types https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start-nccl.html +variable "instance_type" { + type = string + default = "g4dn.8xlarge" +} \ No newline at end of file diff --git a/test/efa/efa_test.go b/test/efa/efa_test.go new file mode 100644 index 000000000..91e8eaacf --- /dev/null +++ b/test/efa/efa_test.go @@ -0,0 +1,170 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT + +//go:build !windows + +package efa + +import ( + "fmt" + "log" + "testing" + "time" + + "github.com/stretchr/testify/suite" + + "github.com/aws/amazon-cloudwatch-agent-test/environment" + "github.com/aws/amazon-cloudwatch-agent-test/environment/computetype" + "github.com/aws/amazon-cloudwatch-agent-test/test/metric" + "github.com/aws/amazon-cloudwatch-agent-test/test/metric/dimension" + "github.com/aws/amazon-cloudwatch-agent-test/test/status" + "github.com/aws/amazon-cloudwatch-agent-test/test/test_runner" +) + +const ( + efaMetricIndicator = "_efa_" + + containerEfaRxBytes = "container_efa_rx_bytes" + containerEfaTxBytes = "container_efa_tx_bytes" + containerEfaRxDropped = "container_efa_rx_dropped" + containerEfaRdmaReadBytes = "container_efa_rdma_read_bytes" + containerEfaRdmaWriteBytes = "container_efa_rdma_write_bytes" + containerEfaRdmaWriteRecvBytes = "container_efa_rdma_write_recv_bytes" + + podEfaRxBytes = "pod_efa_rx_bytes" + podEfaTxBytes = "pod_efa_tx_bytes" + podEfaRxDropped = "pod_efa_rx_dropped" + podEfaRdmaReadBytes = "pod_efa_rdma_read_bytes" + podEfaRdmaWriteBytes = "pod_efa_rdma_write_bytes" + podEfaRdmaWriteRecvBytes = "pod_efa_rdma_write_recv_bytes" + + nodeEfaRxBytes = "node_efa_rx_bytes" + nodeEfaTxBytes = "node_efa_tx_bytes" + nodeEfaRxDropped = "node_efa_rx_dropped" + nodeEfaRdmaReadBytes = "node_efa_rdma_read_bytes" + nodeEfaRdmaWriteBytes = "node_efa_rdma_write_bytes" + nodeEfaRdmaWriteRecvBytes = "node_efa_rdma_write_recv_bytes" +) + +var expectedDimsToMetricsIntegTest = map[string][]string{ + "ClusterName": { + //containerEfaRxBytes, containerEfaTxBytes, containerEfaRxDropped, containerEfaRdmaReadBytes, containerEfaRdmaWriteBytes, containerEfaRdmaWriteRecvBytes, + //podEfaRxBytes, podEfaTxBytes, podEfaRxDropped, podEfaRdmaReadBytes, podEfaRdmaWriteBytes, podEfaRdmaWriteRecvBytes, + nodeEfaRxBytes, nodeEfaTxBytes, nodeEfaRxDropped, nodeEfaRdmaReadBytes, nodeEfaRdmaWriteBytes, 
nodeEfaRdmaWriteRecvBytes, + }, + //"ClusterName-Namespace-PodName-ContainerName": { + // containerEfaRxBytes, containerEfaTxBytes, containerEfaRxDropped, containerEfaRdmaReadBytes, containerEfaRdmaWriteBytes, containerEfaRdmaWriteRecvBytes, + //}, + //"ClusterName-Namespace-PodName-FullPodName-ContainerName": { + // containerEfaRxBytes, containerEfaTxBytes, containerEfaRxDropped, containerEfaRdmaReadBytes, containerEfaRdmaWriteBytes, containerEfaRdmaWriteRecvBytes, + //}, + //"ClusterName-Namespace": { + // podEfaRxBytes, podEfaTxBytes, podEfaRxDropped, podEfaRdmaReadBytes, podEfaRdmaWriteBytes, podEfaRdmaWriteRecvBytes, + //}, + //"ClusterName-Namespace-Service": { + // podEfaRxBytes, podEfaTxBytes, podEfaRxDropped, podEfaRdmaReadBytes, podEfaRdmaWriteBytes, podEfaRdmaWriteRecvBytes, + //}, + //"ClusterName-Namespace-PodName": { + // podEfaRxBytes, podEfaTxBytes, podEfaRxDropped, podEfaRdmaReadBytes, podEfaRdmaWriteBytes, podEfaRdmaWriteRecvBytes, + //}, + //"ClusterName-Namespace-PodName-FullPodName": { + // podEfaRxBytes, podEfaTxBytes, podEfaRxDropped, podEfaRdmaReadBytes, podEfaRdmaWriteBytes, podEfaRdmaWriteRecvBytes, + //}, + "ClusterName-InstanceId-NodeName": { + nodeEfaRxBytes, nodeEfaTxBytes, nodeEfaRxDropped, nodeEfaRdmaReadBytes, nodeEfaRdmaWriteBytes, nodeEfaRdmaWriteRecvBytes, + }, +} + +type EfaTestSuite struct { + suite.Suite + test_runner.TestSuite +} + +func (suite *EfaTestSuite) SetupSuite() { + fmt.Println(">>>> Starting EFA Container Insights TestSuite") +} + +func (suite *EfaTestSuite) TearDownSuite() { + suite.Result.Print() + fmt.Println(">>>> Finished EFA Container Insights TestSuite") +} + +func init() { + environment.RegisterEnvironmentMetaDataFlags() +} + +var ( + eksTestRunners []*test_runner.EKSTestRunner +) + +func getEksTestRunners(env *environment.MetaData) []*test_runner.EKSTestRunner { + if eksTestRunners == nil { + factory := dimension.GetDimensionFactory(*env) + + eksTestRunners = []*test_runner.EKSTestRunner{ + { + Runner: 
&EfaTestRunner{test_runner.BaseTestRunner{DimensionFactory: factory}, "EKS_EFA", env}, + Env: *env, + }, + } + } + return eksTestRunners +} + +func (suite *EfaTestSuite) TestAllInSuite() { + env := environment.GetEnvironmentMetaData() + switch env.ComputeType { + case computetype.EKS: + log.Println("Environment compute type is EKS") + for _, testRunner := range getEksTestRunners(env) { + testRunner.Run(suite, env) + } + default: + return + } + + suite.Assert().Equal(status.SUCCESSFUL, suite.Result.GetStatus(), "EFA Container Test Suite Failed") +} + +func (suite *EfaTestSuite) AddToSuiteResult(r status.TestGroupResult) { + suite.Result.TestGroupResults = append(suite.Result.TestGroupResults, r) +} + +func TestEfaSuite(t *testing.T) { + suite.Run(t, new(EfaTestSuite)) +} + +type EfaTestRunner struct { + test_runner.BaseTestRunner + testName string + env *environment.MetaData +} + +var _ test_runner.ITestRunner = (*EfaTestRunner)(nil) + +func (t *EfaTestRunner) Validate() status.TestGroupResult { + var testResults []status.TestResult + expectedDimsToMetrics := expectedDimsToMetricsIntegTest + testResults = append(testResults, metric.ValidateMetrics(t.env, efaMetricIndicator, expectedDimsToMetrics)...) 
+ testResults = append(testResults, metric.ValidateLogs(t.env)) + return status.TestGroupResult{ + Name: t.GetTestName(), + TestResults: testResults, + } +} + +func (t *EfaTestRunner) GetTestName() string { + return t.testName +} + +func (t *EfaTestRunner) GetAgentConfigFileName() string { + return "" +} + +func (t *EfaTestRunner) GetAgentRunDuration() time.Duration { + return 3 * time.Minute +} + +func (t *EfaTestRunner) GetMeasuredMetrics() []string { + return nil +} diff --git a/test/efa/resources/config.json b/test/efa/resources/config.json new file mode 100644 index 000000000..6f37e43ed --- /dev/null +++ b/test/efa/resources/config.json @@ -0,0 +1,16 @@ +{ + "agent": { + "metrics_collection_interval": 15, + "run_as_user": "root", + "debug": true, + "logfile": "" + }, + "logs": { + "metrics_collected": { + "kubernetes": { + "enhanced_container_insights": true + } + }, + "force_flush_interval": 5 + } +} \ No newline at end of file diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/container_efa.json b/test/metric_value_benchmark/eks_resources/test_schemas/container_efa.json new file mode 100644 index 000000000..6560af558 --- /dev/null +++ b/test/metric_value_benchmark/eks_resources/test_schemas/container_efa.json @@ -0,0 +1,35 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "structured log schema", + "description": "json schema for the cloudwatch agent k8s structured log", + "type": "object", + "properties": { + "CloudWatchMetrics": {}, + "ClusterName": {}, + "EfaDevice": {}, + "InstanceId": {}, + "InstanceType": {}, + "NodeName": {}, + "Timestamp": {}, + "Type": {}, + "Version": {}, + "kubernetes": {}, + "node_efa_rdma_read_bytes": {}, + "node_efa_rdma_write_bytes": {}, + "node_efa_rdma_write_recv_bytes": {}, + "node_efa_rx_bytes": {}, + "node_efa_rx_dropped": {}, + "node_efa_tx_bytes": {} + }, + "required": [ + "ClusterName", + "EfaDevice", + "InstanceId", + "InstanceType", + "NodeName", + "Timestamp", + "Type", + 
"Version", + "CloudWatchMetrics" + ] +} \ No newline at end of file diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/node_efa.json b/test/metric_value_benchmark/eks_resources/test_schemas/node_efa.json new file mode 100644 index 000000000..6560af558 --- /dev/null +++ b/test/metric_value_benchmark/eks_resources/test_schemas/node_efa.json @@ -0,0 +1,35 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "structured log schema", + "description": "json schema for the cloudwatch agent k8s structured log", + "type": "object", + "properties": { + "CloudWatchMetrics": {}, + "ClusterName": {}, + "EfaDevice": {}, + "InstanceId": {}, + "InstanceType": {}, + "NodeName": {}, + "Timestamp": {}, + "Type": {}, + "Version": {}, + "kubernetes": {}, + "node_efa_rdma_read_bytes": {}, + "node_efa_rdma_write_bytes": {}, + "node_efa_rdma_write_recv_bytes": {}, + "node_efa_rx_bytes": {}, + "node_efa_rx_dropped": {}, + "node_efa_tx_bytes": {} + }, + "required": [ + "ClusterName", + "EfaDevice", + "InstanceId", + "InstanceType", + "NodeName", + "Timestamp", + "Type", + "Version", + "CloudWatchMetrics" + ] +} \ No newline at end of file diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/pod_efa.json b/test/metric_value_benchmark/eks_resources/test_schemas/pod_efa.json new file mode 100644 index 000000000..6560af558 --- /dev/null +++ b/test/metric_value_benchmark/eks_resources/test_schemas/pod_efa.json @@ -0,0 +1,35 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "structured log schema", + "description": "json schema for the cloudwatch agent k8s structured log", + "type": "object", + "properties": { + "CloudWatchMetrics": {}, + "ClusterName": {}, + "EfaDevice": {}, + "InstanceId": {}, + "InstanceType": {}, + "NodeName": {}, + "Timestamp": {}, + "Type": {}, + "Version": {}, + "kubernetes": {}, + "node_efa_rdma_read_bytes": {}, + "node_efa_rdma_write_bytes": {}, + "node_efa_rdma_write_recv_bytes": {}, + 
"node_efa_rx_bytes": {}, + "node_efa_rx_dropped": {}, + "node_efa_tx_bytes": {} + }, + "required": [ + "ClusterName", + "EfaDevice", + "InstanceId", + "InstanceType", + "NodeName", + "Timestamp", + "Type", + "Version", + "CloudWatchMetrics" + ] +} \ No newline at end of file diff --git a/test/metric_value_benchmark/eks_resources/util.go b/test/metric_value_benchmark/eks_resources/util.go index bd737f85b..3e996a60e 100644 --- a/test/metric_value_benchmark/eks_resources/util.go +++ b/test/metric_value_benchmark/eks_resources/util.go @@ -56,6 +56,12 @@ var ( eksNodeNeuronDeviceSchema string //go:embed test_schemas/node_neuron.json eksNodeNeuronSchema string + //go:embed test_schemas/container_efa.json + eksContainerEfaSchema string + //go:embed test_schemas/pod_efa.json + eksPodEfaSchema string + //go:embed test_schemas/node_efa.json + eksNodeEfaSchema string EksClusterValidationMap = map[string]string{ "Cluster": eksClusterSchema, @@ -81,6 +87,9 @@ var ( "NodeAWSNeuronCore": eksNodeNeuronCoreSchema, "NodeAWSNeuronDevice": eksNodeNeuronDeviceSchema, "NodeAWSNeuron": eksNodeNeuronSchema, + "ContainerEFA": eksContainerEfaSchema, + "PodEFA": eksPodEfaSchema, + "NodeEFA": eksNodeEfaSchema, } EksClusterFrequencyValidationMap = map[string]int{