diff --git a/generator/test_case_generator.go b/generator/test_case_generator.go
index a0021e438..e93f8404b 100644
--- a/generator/test_case_generator.go
+++ b/generator/test_case_generator.go
@@ -227,6 +227,10 @@ var testTypeToTestConfig = map[string][]testConfig{
testDir: "./test/awsneuron", terraformDir: "terraform/eks/daemon/awsneuron",
targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
},
+ {
+ testDir: "./test/kueue", terraformDir: "terraform/eks/daemon/kueue",
+ targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
+ },
},
"eks_deployment": {
{testDir: "./test/metric_value_benchmark"},
diff --git a/terraform/eks/daemon/kueue/main.tf b/terraform/eks/daemon/kueue/main.tf
new file mode 100644
index 000000000..ba701ed6f
--- /dev/null
+++ b/terraform/eks/daemon/kueue/main.tf
@@ -0,0 +1,1450 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+# Shared test settings module; this configuration reads testing_id from it to
+# build unique resource names.
+module "common" {
+  source = "../../../common"
+
+  cwagent_image_tag  = var.cwagent_image_tag
+  cwagent_image_repo = var.cwagent_image_repo
+}
+
+# Shared AWS building blocks; this configuration consumes its role_arn,
+# public_subnet_ids, security_group and vpc_id outputs.
+module "basic_components" {
+  source = "../../../basic_components"
+  region = var.region
+}
+
+# Authentication data for the EKS cluster created in this configuration.
+data "aws_eks_cluster_auth" "this" {
+  name = aws_eks_cluster.this.name
+}
+
+# EKS control plane; the name is made unique per run through the testing id.
+resource "aws_eks_cluster" "this" {
+  name     = "cwagent-eks-integ-${module.common.testing_id}"
+  version  = var.k8s_version
+  role_arn = module.basic_components.role_arn
+
+  # Networking comes from the shared basic_components module.
+  vpc_config {
+    security_group_ids = [module.basic_components.security_group]
+    subnet_ids         = module.basic_components.public_subnet_ids
+  }
+
+  # Control-plane log types enabled on the cluster.
+  enabled_cluster_log_types = [
+    "api",
+    "audit",
+    "authenticator",
+    "controllerManager",
+    "scheduler"
+  ]
+
+  tags = {
+    kueue = "true"
+  }
+}
+
+# Managed node group: a single fixed-size on-demand node for the test cluster.
+resource "aws_eks_node_group" "this" {
+  node_group_name = "cwagent-eks-integ-node"
+  cluster_name    = aws_eks_cluster.this.name
+  node_role_arn   = aws_iam_role.node_role.arn
+  subnet_ids      = module.basic_components.public_subnet_ids
+
+  instance_types = [var.instance_type]
+  ami_type       = var.ami_type
+  capacity_type  = "ON_DEMAND"
+  disk_size      = 20
+
+  # min = desired = max, so the group never scales.
+  scaling_config {
+    min_size     = 1
+    desired_size = 1
+    max_size     = 1
+  }
+
+  # The node role's policies must be attached before the nodes are launched.
+  depends_on = [
+    aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly,
+    aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy,
+    aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy,
+    aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy
+  ]
+}
+
+# IAM role for the worker nodes; the trust policy lets the EC2 service assume it.
+resource "aws_iam_role" "node_role" {
+  name = "cwagent-eks-Worker-Role-${module.common.testing_id}"
+
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17",
+    Statement = [
+      {
+        Effect = "Allow",
+        Principal = {
+          Service = "ec2.amazonaws.com"
+        },
+        Action = "sts:AssumeRole"
+      }
+    ]
+  })
+}
+
+# Attaches the AWS-managed AmazonEKSWorkerNodePolicy to the node role.
+resource "aws_iam_role_policy_attachment" "node_AmazonEKSWorkerNodePolicy" {
+  role       = aws_iam_role.node_role.name
+  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
+}
+
+# Attaches the AWS-managed AmazonEKS_CNI_Policy to the node role.
+resource "aws_iam_role_policy_attachment" "node_AmazonEKS_CNI_Policy" {
+  role       = aws_iam_role.node_role.name
+  policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"
+}
+
+# Attaches the AWS-managed AmazonEC2ContainerRegistryReadOnly policy to the node role.
+resource "aws_iam_role_policy_attachment" "node_AmazonEC2ContainerRegistryReadOnly" {
+  role       = aws_iam_role.node_role.name
+  policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
+}
+
+# Attaches the AWS-managed CloudWatchAgentServerPolicy to the node role.
+resource "aws_iam_role_policy_attachment" "node_CloudWatchAgentServerPolicy" {
+  role       = aws_iam_role.node_role.name
+  policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy"
+}
+
+# TODO: these security groups should be created once and then reused.
+# Security group for the EKS control plane side of cluster <-> node traffic.
+resource "aws_security_group" "eks_cluster_sg" {
+  vpc_id      = module.basic_components.vpc_id
+  name        = "cwagent-eks-cluster-sg-${module.common.testing_id}"
+  description = "Cluster communication with worker nodes"
+}
+
+# Ingress on the cluster security group: TCP 443 from the node security group.
+resource "aws_security_group_rule" "cluster_inbound" {
+  type                     = "ingress"
+  description              = "Allow worker nodes to communicate with the cluster API Server"
+  security_group_id        = aws_security_group.eks_cluster_sg.id
+  source_security_group_id = aws_security_group.eks_nodes_sg.id
+  protocol                 = "tcp"
+  from_port                = 443
+  to_port                  = 443
+}
+
+# Egress on the cluster security group: TCP 1024-65535 towards the node security group.
+resource "aws_security_group_rule" "cluster_outbound" {
+  type                     = "egress"
+  description              = "Allow cluster API Server to communicate with the worker nodes"
+  security_group_id        = aws_security_group.eks_cluster_sg.id
+  source_security_group_id = aws_security_group.eks_nodes_sg.id
+  protocol                 = "tcp"
+  from_port                = 1024
+  to_port                  = 65535
+}
+
+
+# Security group for the worker nodes; outbound traffic is unrestricted.
+resource "aws_security_group" "eks_nodes_sg" {
+  vpc_id      = module.basic_components.vpc_id
+  name        = "cwagent-eks-node-sg-${module.common.testing_id}"
+  description = "Security group for all nodes in the cluster"
+
+  # All protocols and ports to any IPv4 destination.
+  egress {
+    protocol    = "-1"
+    from_port   = 0
+    to_port     = 0
+    cidr_blocks = ["0.0.0.0/0"]
+  }
+}
+
+# Ingress on the node security group from itself, all protocols.
+resource "aws_security_group_rule" "nodes_internal" {
+  type                     = "ingress"
+  description              = "Allow nodes to communicate with each other"
+  security_group_id        = aws_security_group.eks_nodes_sg.id
+  source_security_group_id = aws_security_group.eks_nodes_sg.id
+  protocol                 = "-1"
+  from_port                = 0
+  to_port                  = 65535
+}
+
+# Ingress on the node security group: TCP 1025-65535 from the cluster security group.
+resource "aws_security_group_rule" "nodes_cluster_inbound" {
+  type                     = "ingress"
+  description              = "Allow worker Kubelets and pods to receive communication from the cluster control plane"
+  security_group_id        = aws_security_group.eks_nodes_sg.id
+  source_security_group_id = aws_security_group.eks_cluster_sg.id
+  protocol                 = "tcp"
+  from_port                = 1025
+  to_port                  = 65535
+}
+
+
+# Certificates for communication between the agent and the kueue exporter.
+# This RSA key backs the self-signed CA certificate and signs the server certificate.
+resource "tls_private_key" "private_key" {
+  algorithm = "RSA"
+}
+
+# Writes the CA private key to certs/ca.key inside this module directory.
+resource "local_file" "ca_key" {
+  content  = tls_private_key.private_key.private_key_pem
+  filename = "${path.module}/certs/ca.key"
+
+  # Private key material: restrict the file to its owner instead of relying on
+  # the provider's default mode (0777).
+  file_permission = "0600"
+}
+
+# Self-signed CA certificate (valid for 24 hours) built from the CA key.
+resource "tls_self_signed_cert" "ca_cert" {
+  is_ca_certificate     = true
+  private_key_pem       = tls_private_key.private_key.private_key_pem
+  validity_period_hours = 24
+
+  subject {
+    organization = "Amazon CloudWatch Agent"
+    common_name  = "kueue-exporter-service.amazon-cloudwatch.svc"
+  }
+
+  allowed_uses = [
+    "digital_signature",
+    "key_encipherment",
+    "cert_signing",
+    "crl_signing",
+    "server_auth",
+    "client_auth",
+  ]
+}
+
+# Writes the CA certificate to certs/ca.cert inside this module directory.
+resource "local_file" "ca_cert_file" {
+  filename = "${path.module}/certs/ca.cert"
+  content  = tls_self_signed_cert.ca_cert.cert_pem
+}
+
+# RSA key for the server certificate (stored as tls.key in the Kubernetes secret).
+resource "tls_private_key" "server_private_key" {
+  algorithm = "RSA"
+}
+
+# Writes the server private key to certs/server.key inside this module directory.
+resource "local_file" "server_key" {
+  content  = tls_private_key.server_private_key.private_key_pem
+  filename = "${path.module}/certs/server.key"
+
+  # Private key material: restrict the file to its owner instead of relying on
+  # the provider's default mode (0777).
+  file_permission = "0600"
+}
+
+# Certificate signing request for the exporter's server certificate.
+resource "tls_cert_request" "local_csr" {
+  private_key_pem = tls_private_key.server_private_key.private_key_pem
+
+  # Host names go in dns_names; the loopback address is an IP and therefore
+  # belongs in ip_addresses (it was previously listed as a DNS name, which TLS
+  # clients do not match against IP connections).
+  dns_names    = ["localhost", "kueue-exporter-service.amazon-cloudwatch.svc"]
+  ip_addresses = ["127.0.0.1"]
+
+  subject {
+    common_name  = "kueue-exporter-service.amazon-cloudwatch.svc"
+    organization = "Amazon CloudWatch Agent"
+  }
+}
+
+# Server certificate (valid for 12 hours) signed by the CA key and CA certificate.
+resource "tls_locally_signed_cert" "server_cert" {
+  cert_request_pem = tls_cert_request.local_csr.cert_request_pem
+
+  ca_cert_pem        = tls_self_signed_cert.ca_cert.cert_pem
+  ca_private_key_pem = tls_private_key.private_key.private_key_pem
+
+  validity_period_hours = 12
+  allowed_uses = [
+    "digital_signature",
+    "key_encipherment",
+    "server_auth",
+    "client_auth",
+  ]
+}
+
+# Writes the signed server certificate to certs/server.cert inside this module directory.
+resource "local_file" "server_cert_file" {
+  filename = "${path.module}/certs/server.cert"
+  content  = tls_locally_signed_cert.server_cert.cert_pem
+}
+
+# Secret holding the CA certificate plus the server certificate and key; it is
+# mounted by both the kueue exporter and the CloudWatch agent daemonsets.
+resource "kubernetes_secret" "agent_cert" {
+  # The secret is namespaced, so the namespace must exist before it is created.
+  # The other namespaced resources in this file declare the same dependency.
+  depends_on = [kubernetes_namespace.namespace]
+
+  metadata {
+    name      = "amazon-cloudwatch-observability-agent-cert"
+    namespace = "amazon-cloudwatch"
+  }
+
+  # PEM values are taken straight from the tls_* resources rather than from the
+  # files written under certs/.
+  data = {
+    "ca.crt"  = tls_self_signed_cert.ca_cert.cert_pem
+    "tls.crt" = tls_locally_signed_cert.server_cert.cert_pem
+    "tls.key" = tls_private_key.server_private_key.private_key_pem
+  }
+}
+
+
+# Namespace that holds every Kubernetes object created by this configuration.
+resource "kubernetes_namespace" "namespace" {
+  metadata {
+    name = "amazon-cloudwatch"
+  }
+}
+
+# Dummy daemonset that simulates the kueue exporter: an httpd container that
+# writes a fixed set of kueue_* sample metrics to htdocs/metrics and serves them.
+resource "kubernetes_daemonset" "exporter" {
+  depends_on = [
+    kubernetes_namespace.namespace,
+    kubernetes_service_account.cwagentservice,
+    aws_eks_node_group.this,
+    kubernetes_config_map.httpdconfig,
+  ]
+  metadata {
+    name      = "kueue-exporter"
+    namespace = "amazon-cloudwatch"
+    labels = {
+      k8s-app = "kueue-exporter"
+    }
+  }
+  spec {
+    selector {
+      match_labels = {
+        "k8s-app" = "kueue-exporter"
+      }
+    }
+    template {
+      metadata {
+        labels = {
+          "name" : "kueue-exporter"
+          "k8s-app" : "kueue-exporter"
+        }
+      }
+      spec {
+        node_selector = {
+          "kubernetes.io/os" : "linux"
+        }
+        container {
+          name  = "kueue-exporter"
+          image = "httpd:2.4-alpine"
+          resources {
+            limits = {
+              "cpu" : "50m",
+              "memory" : "50Mi"
+            }
+            requests = {
+              "cpu" : "50m",
+              "memory" : "50Mi"
+            }
+          }
+          port {
+            name           = "metrics"
+            container_port = 9400
+            host_port      = 9400
+            protocol       = "TCP"
+          }
+          command = [
+            "/bin/sh",
+            "-c",
+          ]
+          # Shell pipeline: write the sample metrics, run the hostname
+          # substitution, then start httpd in the foreground.
+          # Two fixes compared with the previous command line:
+          #  * echo is invoked by absolute path (/bin/echo); the relative
+          #    "bin/echo" was resolved against the container's working directory.
+          #  * the sed expression is wrapped in ordinary shell double quotes; the
+          #    old form reached the shell as \"...\", i.e. with literal quote
+          #    characters inside the sed script.
+          # Either failure stops the && chain before httpd is started.
+          # NOTE(review): the sample text contains no "hostname1" token, so the
+          # sed step currently changes nothing - confirm whether it is needed.
+          args = [
+            "/bin/echo 'kueue_pending_workloads{queue=\"default\",namespace=\"kueue-system\"} 3\nkueue_pending_workloads{queue=\"high-priority\",namespace=\"kueue-system\"} 1\nkueue_evicted_workloads_total{queue=\"default\",namespace=\"kueue-system\"} 5\nkueue_evicted_workloads_total{queue=\"high-priority\",namespace=\"kueue-system\"} 0 \nkueue_admitted_active_workloads{queue=\"default\",namespace=\"kueue-system\"} 7 \nkueue_admitted_active_workloads{queue=\"high-priority\",namespace=\"kueue-system\"} 2 \nkueue_cluster_queue_resource_usage{queue=\"default\",resource=\"cpu\",namespace=\"kueue-system\"} 75 \nkueue_cluster_queue_resource_usage{queue=\"default\",resource=\"memory\",namespace=\"kueue-system\"} 60 \nkueue_cluster_queue_resource_usage{queue=\"high-priority\",resource=\"cpu\",namespace=\"kueue-system\"} 90 \nkueue_cluster_queue_resource_usage{queue=\"high-priority\",resource=\"memory\",namespace=\"kueue-system\"} 80 \nkueue_cluster_queue_nominal_quota{queue=\"default\",resource=\"cpu\",namespace=\"kueue-system\"} 100 \nkueue_cluster_queue_nominal_quota{queue=\"default\",resource=\"memory\",namespace=\"kueue-system\"} 100 \nkueue_cluster_queue_nominal_quota{queue=\"high-priority\",resource=\"cpu\",namespace=\"kueue-system\"} 200 \nkueue_cluster_queue_nominal_quota{queue=\"high-priority\",resource=\"memory\",namespace=\"kueue-system\"} 200' >> /usr/local/apache2/htdocs/metrics && sed -i -e \"s/hostname1/$HOST_NAME/g\" /usr/local/apache2/htdocs/metrics && httpd-foreground -k restart"
+          ]
+          # Server certificate and key from the shared TLS secret.
+          volume_mount {
+            mount_path = "/etc/amazon-cloudwatch-observability-kueue-cert"
+            name       = "kueuetls"
+            read_only  = true
+          }
+          # The two Apache config files are mounted individually from the
+          # httpdconfig config map via sub_path.
+          volume_mount {
+            mount_path = "/usr/local/apache2/conf/httpd.conf"
+            sub_path   = "httpd.conf"
+            name       = "httpdconfig"
+            read_only  = true
+          }
+          volume_mount {
+            mount_path = "/usr/local/apache2/conf/extra/httpd-ssl.conf"
+            sub_path   = "httpd-ssl.conf"
+            name       = "httpdconfig"
+            read_only  = true
+          }
+          # Downward-API values exposed to the container.
+          env {
+            name = "HOST_IP"
+            value_from {
+              field_ref {
+                field_path = "status.hostIP"
+              }
+            }
+          }
+          env {
+            name = "HOST_NAME"
+            value_from {
+              field_ref {
+                field_path = "spec.nodeName"
+              }
+            }
+          }
+          env {
+            name = "K8S_NAMESPACE"
+            value_from {
+              field_ref {
+                field_path = "metadata.namespace"
+              }
+            }
+          }
+        }
+        # tls.crt / tls.key from the secret are exposed as server.crt / server.key.
+        volume {
+          name = "kueuetls"
+          secret {
+            secret_name = "amazon-cloudwatch-observability-agent-cert"
+            items {
+              key  = "tls.crt"
+              path = "server.crt"
+            }
+            items {
+              key  = "tls.key"
+              path = "server.key"
+            }
+          }
+        }
+        volume {
+          name = "httpdconfig"
+          config_map {
+            name = "httpdconfig"
+          }
+        }
+        service_account_name             = "cloudwatch-agent"
+        termination_grace_period_seconds = 60
+      }
+    }
+  }
+}
+
+# ClusterIP service in front of the exporter pods (selected by k8s-app label),
+# exposing the metrics port 9400.
+resource "kubernetes_service" "exporter" {
+  depends_on = [
+    kubernetes_daemonset.exporter,
+    aws_eks_node_group.this,
+    kubernetes_service_account.cwagentservice,
+    kubernetes_namespace.namespace,
+  ]
+
+  metadata {
+    namespace = "amazon-cloudwatch"
+    name      = "kueue-exporter-service"
+    annotations = {
+      "prometheus.io/scrape" : "true"
+    }
+    labels = {
+      "k8s-app" : "kueue-exporter-service"
+    }
+  }
+
+  spec {
+    type = "ClusterIP"
+    selector = {
+      k8s-app = "kueue-exporter"
+    }
+    port {
+      name        = "metrics"
+      protocol    = "TCP"
+      port        = 9400
+      target_port = 9400
+    }
+  }
+}
+
+# CloudWatch agent daemonset. It is created after the exporter service and
+# mounts the agent config map, several host paths, and the CA certificate.
+resource "kubernetes_daemonset" "service" {
+  depends_on = [
+    kubernetes_service.exporter,
+    aws_eks_node_group.this,
+    kubernetes_service_account.cwagentservice,
+    kubernetes_namespace.namespace,
+  ]
+
+  metadata {
+    namespace = "amazon-cloudwatch"
+    name      = "cloudwatch-agent"
+  }
+
+  spec {
+    selector {
+      match_labels = {
+        "name" : "cloudwatch-agent"
+      }
+    }
+
+    template {
+      metadata {
+        labels = {
+          "name" : "cloudwatch-agent"
+        }
+      }
+
+      spec {
+        node_selector = {
+          "kubernetes.io/os" : "linux"
+        }
+
+        container {
+          name              = "cwagent"
+          image             = "${var.cwagent_image_repo}:${var.cwagent_image_tag}"
+          image_pull_policy = "Always"
+
+          # Requests equal limits for both CPU and memory.
+          resources {
+            requests = {
+              "cpu" : "200m",
+              "memory" : "200Mi"
+            }
+            limits = {
+              "cpu" : "200m",
+              "memory" : "200Mi"
+            }
+          }
+
+          port {
+            protocol       = "UDP"
+            container_port = 25888
+            host_port      = 25888
+          }
+
+          # Downward-API values exposed to the agent.
+          env {
+            name = "HOST_IP"
+            value_from {
+              field_ref {
+                field_path = "status.hostIP"
+              }
+            }
+          }
+          env {
+            name = "HOST_NAME"
+            value_from {
+              field_ref {
+                field_path = "spec.nodeName"
+              }
+            }
+          }
+          env {
+            name = "K8S_NAMESPACE"
+            value_from {
+              field_ref {
+                field_path = "metadata.namespace"
+              }
+            }
+          }
+
+          # Agent configuration (the only mount not marked read-only).
+          volume_mount {
+            name       = "cwagentconfig"
+            mount_path = "/etc/cwagentconfig"
+          }
+          # Read-only views of the host.
+          volume_mount {
+            name       = "rootfs"
+            mount_path = "/rootfs"
+            read_only  = true
+          }
+          volume_mount {
+            name       = "dockersock"
+            mount_path = "/var/run/docker.sock"
+            read_only  = true
+          }
+          volume_mount {
+            name       = "varlibdocker"
+            mount_path = "/var/lib/docker"
+            read_only  = true
+          }
+          volume_mount {
+            name       = "containerdsock"
+            mount_path = "/run/containerd/containerd.sock"
+            read_only  = true
+          }
+          volume_mount {
+            name       = "sys"
+            mount_path = "/sys"
+            read_only  = true
+          }
+          volume_mount {
+            name       = "devdisk"
+            mount_path = "/dev/disk"
+            read_only  = true
+          }
+          # CA certificate from the shared TLS secret.
+          volume_mount {
+            name       = "agenttls"
+            mount_path = "/etc/amazon-cloudwatch-observability-agent-cert"
+            read_only  = true
+          }
+        }
+
+        volume {
+          name = "cwagentconfig"
+          config_map {
+            name = "cwagentconfig"
+          }
+        }
+        volume {
+          name = "rootfs"
+          host_path {
+            path = "/"
+          }
+        }
+        volume {
+          name = "dockersock"
+          host_path {
+            path = "/var/run/docker.sock"
+          }
+        }
+        volume {
+          name = "varlibdocker"
+          host_path {
+            path = "/var/lib/docker"
+          }
+        }
+        volume {
+          name = "containerdsock"
+          host_path {
+            path = "/run/containerd/containerd.sock"
+          }
+        }
+        volume {
+          name = "sys"
+          host_path {
+            path = "/sys"
+          }
+        }
+        volume {
+          name = "devdisk"
+          host_path {
+            path = "/dev/disk"
+          }
+        }
+        # Only ca.crt is projected from the secret, as tls-ca.crt.
+        volume {
+          name = "agenttls"
+          secret {
+            secret_name = "amazon-cloudwatch-observability-agent-cert"
+            items {
+              key  = "ca.crt"
+              path = "tls-ca.crt"
+            }
+          }
+        }
+
+        termination_grace_period_seconds = 60
+        service_account_name             = "cloudwatch-agent"
+      }
+    }
+  }
+}
+
+##########################################
+# Template Files
+##########################################
+locals {
+  # Agent config: the test directory's config.json when it exists, otherwise
+  # the shared default agent config.
+  cwagent_config = fileexists("../../../../${var.test_dir}/resources/config.json") ? "../../../../${var.test_dir}/resources/config.json" : "../default_resources/default_amazon_cloudwatch_agent.json"
+
+  # Apache configuration files taken from the test directory's resources.
+  httpd_ssl_config = "../../../../${var.test_dir}/resources/httpd-ssl.conf"
+  httpd_config     = "../../../../${var.test_dir}/resources/httpd.conf"
+}
+
+# Renders the selected agent config file; no template variables are supplied.
+data "template_file" "cwagent_config" {
+  vars     = {}
+  template = file(local.cwagent_config)
+}
+
+# Config map that carries the rendered agent config as cwagentconfig.json.
+resource "kubernetes_config_map" "cwagentconfig" {
+  depends_on = [
+    kubernetes_service_account.cwagentservice,
+    kubernetes_namespace.namespace,
+  ]
+
+  metadata {
+    namespace = "amazon-cloudwatch"
+    name      = "cwagentconfig"
+  }
+
+  data = {
+    "cwagentconfig.json" : data.template_file.cwagent_config.rendered
+  }
+}
+
+# Renders the test's httpd.conf; no template variables are supplied.
+data "template_file" "httpd_config" {
+  vars     = {}
+  template = file(local.httpd_config)
+}
+# Renders the test's httpd-ssl.conf; no template variables are supplied.
+data "template_file" "httpd_ssl_config" {
+  vars     = {}
+  template = file(local.httpd_ssl_config)
+}
+
+# Config map with both rendered Apache config files; the exporter daemonset
+# mounts each key individually.
+resource "kubernetes_config_map" "httpdconfig" {
+  depends_on = [
+    kubernetes_service_account.cwagentservice,
+    kubernetes_namespace.namespace,
+  ]
+
+  metadata {
+    namespace = "amazon-cloudwatch"
+    name      = "httpdconfig"
+  }
+
+  data = {
+    "httpd.conf" : data.template_file.httpd_config.rendered
+    "httpd-ssl.conf" : data.template_file.httpd_ssl_config.rendered
+  }
+}
+
+# Service account used by both daemonsets (service_account_name "cloudwatch-agent").
+resource "kubernetes_service_account" "cwagentservice" {
+  depends_on = [kubernetes_namespace.namespace]
+
+  metadata {
+    namespace = "amazon-cloudwatch"
+    name      = "cloudwatch-agent"
+  }
+}
+
+# Cluster-wide permissions granted to the agent's service account through the
+# cloudwatch-agent-role-binding.
+resource "kubernetes_cluster_role" "clusterrole" {
+  depends_on = [kubernetes_namespace.namespace]
+
+  metadata {
+    name = "cloudwatch-agent-role"
+  }
+
+  # Read access to core objects.
+  # NOTE(review): "pods/logs" is listed here; the Kubernetes log subresource is
+  # normally spelled "pods/log" - confirm which one is intended.
+  rule {
+    api_groups = [""]
+    resources  = ["pods", "pods/logs", "nodes", "nodes/proxy", "namespaces", "endpoints"]
+    verbs      = ["get", "list", "watch"]
+  }
+  rule {
+    api_groups = ["apps"]
+    resources  = ["replicasets"]
+    verbs      = ["list", "watch"]
+  }
+  rule {
+    api_groups = ["batch"]
+    resources  = ["jobs"]
+    verbs      = ["list", "watch"]
+  }
+  # "get" on nodes/proxy is also covered by the first rule.
+  rule {
+    api_groups = [""]
+    resources  = ["nodes/proxy"]
+    verbs      = ["get"]
+  }
+  rule {
+    api_groups = [""]
+    resources  = ["nodes/stats", "configmaps", "events"]
+    verbs      = ["create"]
+  }
+  # get/update is limited to the single config map named cwagent-clusterleader.
+  rule {
+    api_groups     = [""]
+    resources      = ["configmaps"]
+    resource_names = ["cwagent-clusterleader"]
+    verbs          = ["get", "update"]
+  }
+  rule {
+    api_groups = [""]
+    resources  = ["services"]
+    verbs      = ["list", "watch"]
+  }
+  # Non-resource URL access to /metrics.
+  rule {
+    verbs             = ["get", "list", "watch"]
+    non_resource_urls = ["/metrics"]
+  }
+}
+
+# Binds cloudwatch-agent-role to the cloudwatch-agent service account in the
+# amazon-cloudwatch namespace.
+resource "kubernetes_cluster_role_binding" "rolebinding" {
+  depends_on = [kubernetes_namespace.namespace]
+
+  metadata {
+    name = "cloudwatch-agent-role-binding"
+  }
+
+  subject {
+    kind      = "ServiceAccount"
+    namespace = "amazon-cloudwatch"
+    name      = "cloudwatch-agent"
+  }
+
+  role_ref {
+    kind      = "ClusterRole"
+    name      = "cloudwatch-agent-role"
+    api_group = "rbac.authorization.k8s.io"
+  }
+}
+# Runs the Go test suite for var.test_dir once the node group, the agent
+# daemonset, the service account and the role binding exist.
+resource "null_resource" "validator" {
+  depends_on = [
+    kubernetes_service_account.cwagentservice,
+    kubernetes_cluster_role_binding.rolebinding,
+    kubernetes_daemonset.service,
+    aws_eks_node_group.this,
+  ]
+
+  # Up to 10 attempts, 60 seconds apart: the first passing `go test` exits 0,
+  # and the provisioner fails with exit 1 when every attempt fails.
+  # The heredoc lines share one indentation level so the rendered script is unchanged.
+  provisioner "local-exec" {
+    command = <<-EOT
+    cd ../../../..
+    i=0
+    while [ $i -lt 10 ]; do
+    i=$((i+1))
+    go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia && exit 0
+    sleep 60
+    done
+    exit 1
+    EOT
+  }
+} // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+module "common" {
+ source = "../../../common"
+ cwagent_image_repo = var.cwagent_image_repo
+ cwagent_image_tag = var.cwagent_image_tag
+}
+
+module "basic_components" {
+ source = "../../../basic_components"
+
+ region = var.region
+}
+
+data "aws_eks_cluster_auth" "this" {
+ name = aws_eks_cluster.this.name
+}
+
+resource "aws_eks_cluster" "this" {
+ name = "cwagent-eks-integ-${module.common.testing_id}"
+ role_arn = module.basic_components.role_arn
+ version = var.k8s_version
+ enabled_cluster_log_types = [
+ "api",
+ "audit",
+ "authenticator",
+ "controllerManager",
+ "scheduler"
+ ]
+ vpc_config {
+ subnet_ids = module.basic_components.public_subnet_ids
+ security_group_ids = [module.basic_components.security_group]
+ }
+}
+
+# EKS Node Groups
+resource "aws_eks_node_group" "this" {
+ cluster_name = aws_eks_cluster.this.name
+ node_group_name = "cwagent-eks-integ-node"
+ node_role_arn = aws_iam_role.node_role.arn
+ subnet_ids = module.basic_components.public_subnet_ids
+
+ scaling_config {
+ desired_size = 1
+ max_size = 1
+ min_size = 1
+ }
+
+ ami_type = var.ami_type
+ capacity_type = "ON_DEMAND"
+ disk_size = 20
+ instance_types = [var.instance_type]
+
+ depends_on = [
+ aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly,
+ aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy,
+ aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy,
+ aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy
+ ]
+}
+
+# EKS Node IAM Role
+resource "aws_iam_role" "node_role" {
+ name = "cwagent-eks-Worker-Role-${module.common.testing_id}"
+ assume_role_policy = jsonencode({
+ Version = "2012-10-17",
+ Statement = [
+ {
+ Effect = "Allow",
+ Principal = {
+ Service = "ec2.amazonaws.com"
+ },
+ Action = "sts:AssumeRole"
+ }
+ ]
+ })
+
+}
+
+resource "aws_iam_role_policy_attachment" "node_AmazonEKSWorkerNodePolicy" {
+ policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
+ role = aws_iam_role.node_role.name
+}
+
+resource "aws_iam_role_policy_attachment" "node_AmazonEKS_CNI_Policy" {
+ policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"
+ role = aws_iam_role.node_role.name
+}
+
+resource "aws_iam_role_policy_attachment" "node_AmazonEC2ContainerRegistryReadOnly" {
+ policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
+ role = aws_iam_role.node_role.name
+}
+
+resource "aws_iam_role_policy_attachment" "node_CloudWatchAgentServerPolicy" {
+ policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy"
+ role = aws_iam_role.node_role.name
+}
+
+# TODO: these security groups be created once and then reused
+# EKS Cluster Security Group
+resource "aws_security_group" "eks_cluster_sg" {
+ name = "cwagent-eks-cluster-sg-${module.common.testing_id}"
+ description = "Cluster communication with worker nodes"
+ vpc_id = module.basic_components.vpc_id
+}
+
+resource "aws_security_group_rule" "cluster_inbound" {
+ description = "Allow worker nodes to communicate with the cluster API Server"
+ from_port = 443
+ protocol = "tcp"
+ security_group_id = aws_security_group.eks_cluster_sg.id
+ source_security_group_id = aws_security_group.eks_nodes_sg.id
+ to_port = 443
+ type = "ingress"
+}
+
+resource "aws_security_group_rule" "cluster_outbound" {
+ description = "Allow cluster API Server to communicate with the worker nodes"
+ from_port = 1024
+ protocol = "tcp"
+ security_group_id = aws_security_group.eks_cluster_sg.id
+ source_security_group_id = aws_security_group.eks_nodes_sg.id
+ to_port = 65535
+ type = "egress"
+}
+
+
+# EKS Node Security Group
+resource "aws_security_group" "eks_nodes_sg" {
+ name = "cwagent-eks-node-sg-${module.common.testing_id}"
+ description = "Security group for all nodes in the cluster"
+ vpc_id = module.basic_components.vpc_id
+
+ egress {
+ from_port = 0
+ to_port = 0
+ protocol = "-1"
+ cidr_blocks = ["0.0.0.0/0"]
+ }
+}
+
+resource "aws_security_group_rule" "nodes_internal" {
+ description = "Allow nodes to communicate with each other"
+ from_port = 0
+ protocol = "-1"
+ security_group_id = aws_security_group.eks_nodes_sg.id
+ source_security_group_id = aws_security_group.eks_nodes_sg.id
+ to_port = 65535
+ type = "ingress"
+}
+
+resource "aws_security_group_rule" "nodes_cluster_inbound" {
+ description = "Allow worker Kubelets and pods to receive communication from the cluster control plane"
+ from_port = 1025
+ protocol = "tcp"
+ security_group_id = aws_security_group.eks_nodes_sg.id
+ source_security_group_id = aws_security_group.eks_cluster_sg.id
+ to_port = 65535
+ type = "ingress"
+}
+
+
+# create cert for communication between agent and kueue
+resource "tls_private_key" "private_key" {
+ algorithm = "RSA"
+}
+
+resource "local_file" "ca_key" {
+ content = tls_private_key.private_key.private_key_pem
+ filename = "${path.module}/certs/ca.key"
+}
+
+resource "tls_self_signed_cert" "ca_cert" {
+ private_key_pem = tls_private_key.private_key.private_key_pem
+ is_ca_certificate = true
+ subject {
+ common_name = "kueue-exporter-service.amazon-cloudwatch.svc"
+ organization = "Amazon CloudWatch Agent"
+ }
+ validity_period_hours = 24
+ allowed_uses = [
+ "digital_signature",
+ "key_encipherment",
+ "cert_signing",
+ "crl_signing",
+ "server_auth",
+ "client_auth",
+ ]
+}
+
+resource "local_file" "ca_cert_file" {
+ content = tls_self_signed_cert.ca_cert.cert_pem
+ filename = "${path.module}/certs/ca.cert"
+}
+
+resource "tls_private_key" "server_private_key" {
+ algorithm = "RSA"
+}
+
+resource "local_file" "server_key" {
+ content = tls_private_key.server_private_key.private_key_pem
+ filename = "${path.module}/certs/server.key"
+}
+
+resource "tls_cert_request" "local_csr" {
+ private_key_pem = tls_private_key.server_private_key.private_key_pem
+ dns_names = ["localhost", "127.0.0.1", "kueue-exporter-service.amazon-cloudwatch.svc"]
+ subject {
+ common_name = "kueue-exporter-service.amazon-cloudwatch.svc"
+ organization = "Amazon CloudWatch Agent"
+ }
+}
+
+resource "tls_locally_signed_cert" "server_cert" {
+ cert_request_pem = tls_cert_request.local_csr.cert_request_pem
+ ca_private_key_pem = tls_private_key.private_key.private_key_pem
+ ca_cert_pem = tls_self_signed_cert.ca_cert.cert_pem
+ validity_period_hours = 12
+ allowed_uses = [
+ "digital_signature",
+ "key_encipherment",
+ "server_auth",
+ "client_auth",
+ ]
+}
+
+resource "local_file" "server_cert_file" {
+ content = tls_locally_signed_cert.server_cert.cert_pem
+ filename = "${path.module}/certs/server.cert"
+}
+
+resource "kubernetes_secret" "agent_cert" {
+ metadata {
+ name = "amazon-cloudwatch-observability-agent-cert"
+ namespace = "amazon-cloudwatch"
+ }
+ data = {
+ "ca.crt" = tls_self_signed_cert.ca_cert.cert_pem #filebase64(local_file.ca_cert_file.filename)
+ "tls.crt" = tls_locally_signed_cert.server_cert.cert_pem #filebase64(local_file.server_cert_file.filename)
+ "tls.key" = tls_private_key.server_private_key.private_key_pem #filebase64(local_file.server_key.filename)
+ }
+}
+
+
+resource "kubernetes_namespace" "namespace" {
+ metadata {
+ name = "amazon-cloudwatch"
+ }
+}
+
+# dummy daemonset that simulates kueue exporter
+resource "kubernetes_daemonset" "exporter" {
+ depends_on = [
+ kubernetes_namespace.namespace,
+ kubernetes_service_account.cwagentservice,
+ aws_eks_node_group.this,
+ kubernetes_config_map.httpdconfig,
+ ]
+ metadata {
+ name = "kueue-exporter"
+ namespace = "amazon-cloudwatch"
+ labels = {
+ k8s-app = "kueue-exporter"
+ }
+ }
+ spec {
+ selector {
+ match_labels = {
+ "k8s-app" = "kueue-exporter"
+ }
+ }
+ template {
+ metadata {
+ labels = {
+ "name" : "kueue-exporter"
+ "k8s-app" : "kueue-exporter"
+ }
+ }
+ spec {
+ node_selector = {
+ "kubernetes.io/os" : "linux"
+ }
+ container {
+ name = "kueue-exporter"
+ image = "httpd:2.4-alpine"
+ resources {
+ limits = {
+ "cpu" : "50m",
+ "memory" : "50Mi"
+ }
+ requests = {
+ "cpu" : "50m",
+ "memory" : "50Mi"
+ }
+ }
+ port {
+ name = "metrics"
+ container_port = 9400
+ host_port = 9400
+ protocol = "TCP"
+ }
+ command = [
+ "/bin/sh",
+ "-c",
+ ]
+ args = [
+ "bin/echo 'kueue_pending_workloads{queue=\"default\",namespace=\"kueue-system\"} 3\nkueue_pending_workloads{queue=\"high-priority\",namespace=\"kueue-system\"} 1\nkueue_evicted_workloads_total{queue=\"default\",namespace=\"kueue-system\"} 5\nkueue_evicted_workloads_total{queue=\"high-priority\",namespace=\"kueue-system\"} 0 \nkueue_admitted_active_workloads{queue=\"default\",namespace=\"kueue-system\"} 7 \nkueue_admitted_active_workloads{queue=\"high-priority\",namespace=\"kueue-system\"} 2 \nkueue_cluster_queue_resource_usage{queue=\"default\",resource=\"cpu\",namespace=\"kueue-system\"} 75 \nkueue_cluster_queue_resource_usage{queue=\"default\",resource=\"memory\",namespace=\"kueue-system\"} 60 \nkueue_cluster_queue_resource_usage{queue=\"high-priority\",resource=\"cpu\",namespace=\"kueue-system\"} 90 \nkueue_cluster_queue_resource_usage{queue=\"high-priority\",resource=\"memory\",namespace=\"kueue-system\"} 80 \nkueue_cluster_queue_nominal_quota{queue=\"default\",resource=\"cpu\",namespace=\"kueue-system\"} 100 \nkueue_cluster_queue_nominal_quota{queue=\"default\",resource=\"memory\",namespace=\"kueue-system\"} 100 \nkueue_cluster_queue_nominal_quota{queue=\"high-priority\",resource=\"cpu\",namespace=\"kueue-system\"} 200 \nkueue_cluster_queue_nominal_quota{queue=\"high-priority\",resource=\"memory\",namespace=\"kueue-system\"} 200' >> /usr/local/apache2/htdocs/metrics && sed -i -e \\\"s/hostname1/$HOST_NAME/g\\\" /usr/local/apache2/htdocs/metrics && httpd-foreground -k restart"
+ ]
+ volume_mount {
+ mount_path = "/etc/amazon-cloudwatch-observability-kueue-cert"
+ name = "kueuetls"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/usr/local/apache2/conf/httpd.conf"
+ sub_path = "httpd.conf"
+ name = "httpdconfig"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/usr/local/apache2/conf/extra/httpd-ssl.conf"
+ sub_path = "httpd-ssl.conf"
+ name = "httpdconfig"
+ read_only = true
+ }
+ env {
+ name = "HOST_IP"
+ value_from {
+ field_ref {
+ field_path = "status.hostIP"
+ }
+ }
+ }
+ env {
+ name = "HOST_NAME"
+ value_from {
+ field_ref {
+ field_path = "spec.nodeName"
+ }
+ }
+ }
+ env {
+ name = "K8S_NAMESPACE"
+ value_from {
+ field_ref {
+ field_path = "metadata.namespace"
+ }
+ }
+ }
+ }
+ volume {
+ name = "kueuetls"
+ secret {
+ secret_name = "amazon-cloudwatch-observability-agent-cert"
+ items {
+ key = "tls.crt"
+ path = "server.crt"
+ }
+ items {
+ key = "tls.key"
+ path = "server.key"
+ }
+ }
+ }
+ volume {
+ name = "httpdconfig"
+ config_map {
+ name = "httpdconfig"
+ }
+ }
+ service_account_name = "cloudwatch-agent"
+ termination_grace_period_seconds = 60
+ }
+ }
+ }
+}
+
+resource "kubernetes_service" "exporter" {
+ depends_on = [
+ kubernetes_namespace.namespace,
+ kubernetes_service_account.cwagentservice,
+ aws_eks_node_group.this,
+ kubernetes_daemonset.exporter
+ ]
+ metadata {
+ name = "kueue-exporter-service"
+ namespace = "amazon-cloudwatch"
+ labels = {
+ "k8s-app" : "kueue-exporter-service"
+ }
+ annotations = {
+ "prometheus.io/scrape" : "true"
+ }
+ }
+ spec {
+ type = "ClusterIP"
+ selector = {
+ k8s-app = "kueue-exporter"
+ }
+ port {
+ name = "metrics"
+ port = 9400
+ target_port = 9400
+ protocol = "TCP"
+ }
+ }
+}
+
+resource "kubernetes_daemonset" "service" {
+ depends_on = [
+ kubernetes_namespace.namespace,
+ kubernetes_service_account.cwagentservice,
+ aws_eks_node_group.this,
+ kubernetes_service.exporter
+ ]
+ metadata {
+ name = "cloudwatch-agent"
+ namespace = "amazon-cloudwatch"
+ }
+ spec {
+ selector {
+ match_labels = {
+ "name" : "cloudwatch-agent"
+ }
+ }
+ template {
+ metadata {
+ labels = {
+ "name" : "cloudwatch-agent"
+ }
+ }
+ spec {
+ node_selector = {
+ "kubernetes.io/os" : "linux"
+ }
+ container {
+ name = "cwagent"
+ image = "${var.cwagent_image_repo}:${var.cwagent_image_tag}"
+ image_pull_policy = "Always"
+ resources {
+ limits = {
+ "cpu" : "200m",
+ "memory" : "200Mi"
+ }
+ requests = {
+ "cpu" : "200m",
+ "memory" : "200Mi"
+ }
+ }
+ port {
+ container_port = 25888
+ host_port = 25888
+ protocol = "UDP"
+ }
+ env {
+ name = "HOST_IP"
+ value_from {
+ field_ref {
+ field_path = "status.hostIP"
+ }
+ }
+ }
+ env {
+ name = "HOST_NAME"
+ value_from {
+ field_ref {
+ field_path = "spec.nodeName"
+ }
+ }
+ }
+ env {
+ name = "K8S_NAMESPACE"
+ value_from {
+ field_ref {
+ field_path = "metadata.namespace"
+ }
+ }
+ }
+ volume_mount {
+ mount_path = "/etc/cwagentconfig"
+ name = "cwagentconfig"
+ }
+ volume_mount {
+ mount_path = "/rootfs"
+ name = "rootfs"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/var/run/docker.sock"
+ name = "dockersock"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/var/lib/docker"
+ name = "varlibdocker"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/run/containerd/containerd.sock"
+ name = "containerdsock"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/sys"
+ name = "sys"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/dev/disk"
+ name = "devdisk"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/etc/amazon-cloudwatch-observability-agent-cert"
+ name = "agenttls"
+ read_only = true
+ }
+ }
+ volume {
+ name = "cwagentconfig"
+ config_map {
+ name = "cwagentconfig"
+ }
+ }
+ volume {
+ name = "rootfs"
+ host_path {
+ path = "/"
+ }
+ }
+ volume {
+ name = "dockersock"
+ host_path {
+ path = "/var/run/docker.sock"
+ }
+ }
+ volume {
+ name = "varlibdocker"
+ host_path {
+ path = "/var/lib/docker"
+ }
+ }
+ volume {
+ name = "containerdsock"
+ host_path {
+ path = "/run/containerd/containerd.sock"
+ }
+ }
+ volume {
+ name = "sys"
+ host_path {
+ path = "/sys"
+ }
+ }
+ volume {
+ name = "devdisk"
+ host_path {
+ path = "/dev/disk"
+ }
+ }
+ volume {
+ name = "agenttls"
+ secret {
+ secret_name = "amazon-cloudwatch-observability-agent-cert"
+ items {
+ key = "ca.crt"
+ path = "tls-ca.crt"
+ }
+ }
+ }
+ service_account_name = "cloudwatch-agent"
+ termination_grace_period_seconds = 60
+ }
+ }
+ }
+}
+
+##########################################
+# Template Files
+##########################################
+# Relative paths to the resource files of the test selected by var.test_dir.
+locals {
+ httpd_config = "../../../../${var.test_dir}/resources/httpd.conf"
+ httpd_ssl_config = "../../../../${var.test_dir}/resources/httpd-ssl.conf"
+ # Use the test's own config.json when it exists; otherwise fall back to the
+ # default agent config under ../default_resources.
+ cwagent_config = fileexists("../../../../${var.test_dir}/resources/config.json") ? "../../../../${var.test_dir}/resources/config.json" : "../default_resources/default_amazon_cloudwatch_agent.json"
+}
+
+# Renders the agent config file chosen in local.cwagent_config. No template
+# variables are supplied.
+# NOTE(review): the template_file data source belongs to the legacy
+# hashicorp/template provider; consider the built-in templatefile() function.
+data "template_file" "cwagent_config" {
+ template = file(local.cwagent_config)
+ vars = {
+ }
+}
+
+# ConfigMap carrying the rendered agent configuration. The daemonset mounts it
+# by name ("cwagentconfig") at /etc/cwagentconfig.
+resource "kubernetes_config_map" "cwagentconfig" {
+  metadata {
+    namespace = "amazon-cloudwatch"
+    name      = "cwagentconfig"
+  }
+
+  data = {
+    "cwagentconfig.json" = data.template_file.cwagent_config.rendered
+  }
+
+  depends_on = [
+    kubernetes_namespace.namespace,
+    kubernetes_service_account.cwagentservice,
+  ]
+}
+
+# Renders the test's httpd.conf (path from local.httpd_config) with no
+# template variables.
+data "template_file" "httpd_config" {
+ template = file(local.httpd_config)
+ vars = {}
+}
+# Renders the test's httpd-ssl.conf (path from local.httpd_ssl_config) with no
+# template variables.
+data "template_file" "httpd_ssl_config" {
+ template = file(local.httpd_ssl_config)
+ vars = {}
+}
+
+# ConfigMap holding the two rendered Apache httpd configuration files.
+resource "kubernetes_config_map" "httpdconfig" {
+  metadata {
+    namespace = "amazon-cloudwatch"
+    name      = "httpdconfig"
+  }
+
+  data = {
+    "httpd.conf"     = data.template_file.httpd_config.rendered
+    "httpd-ssl.conf" = data.template_file.httpd_ssl_config.rendered
+  }
+
+  depends_on = [
+    kubernetes_namespace.namespace,
+    kubernetes_service_account.cwagentservice,
+  ]
+}
+
+# Service account the agent daemonset runs as (service_account_name =
+# "cloudwatch-agent") and the subject of the cluster role binding.
+resource "kubernetes_service_account" "cwagentservice" {
+  metadata {
+    namespace = "amazon-cloudwatch"
+    name      = "cloudwatch-agent"
+  }
+
+  depends_on = [
+    kubernetes_namespace.namespace,
+  ]
+}
+
+# Cluster-wide permissions for the CloudWatch agent service account.
+resource "kubernetes_cluster_role" "clusterrole" {
+  depends_on = [
+    kubernetes_namespace.namespace,
+  ]
+
+  metadata {
+    name = "cloudwatch-agent-role"
+  }
+
+  # Read access to core workload and node objects.
+  rule {
+    api_groups = [""]
+    resources  = ["pods", "pods/logs", "nodes", "nodes/proxy", "namespaces", "endpoints"]
+    verbs      = ["get", "list", "watch"]
+  }
+
+  # List/watch on replica sets.
+  rule {
+    api_groups = ["apps"]
+    resources  = ["replicasets"]
+    verbs      = ["list", "watch"]
+  }
+
+  # List/watch on batch jobs.
+  rule {
+    api_groups = ["batch"]
+    resources  = ["jobs"]
+    verbs      = ["list", "watch"]
+  }
+
+  # Get on nodes/proxy (also covered by the first rule).
+  rule {
+    api_groups = [""]
+    resources  = ["nodes/proxy"]
+    verbs      = ["get"]
+  }
+
+  # Create on node stats, config maps and events.
+  rule {
+    api_groups = [""]
+    resources  = ["nodes/stats", "configmaps", "events"]
+    verbs      = ["create"]
+  }
+
+  # Get/update restricted to the cwagent-clusterleader config map.
+  rule {
+    api_groups     = [""]
+    resources      = ["configmaps"]
+    resource_names = ["cwagent-clusterleader"]
+    verbs          = ["get", "update"]
+  }
+
+  # List/watch on services.
+  rule {
+    api_groups = [""]
+    resources  = ["services"]
+    verbs      = ["list", "watch"]
+  }
+
+  # Access to the non-resource /metrics endpoint.
+  rule {
+    non_resource_urls = ["/metrics"]
+    verbs             = ["get", "list", "watch"]
+  }
+}
+
+# Binds the agent's ClusterRole to the agent's service account.
+#
+# The role and subject names are read from the managed resources instead of
+# being repeated as string literals. The resolved values are unchanged
+# ("cloudwatch-agent-role", "cloudwatch-agent", "amazon-cloudwatch"), but the
+# references give Terraform an implicit dependency on both resources, so the
+# binding (and the validator that waits on it) is only created after the
+# ClusterRole and ServiceAccount exist.
+resource "kubernetes_cluster_role_binding" "rolebinding" {
+  depends_on = [kubernetes_namespace.namespace]
+  metadata {
+    name = "cloudwatch-agent-role-binding"
+  }
+  role_ref {
+    api_group = "rbac.authorization.k8s.io"
+    kind      = "ClusterRole"
+    name      = kubernetes_cluster_role.clusterrole.metadata[0].name
+  }
+  subject {
+    kind      = "ServiceAccount"
+    name      = kubernetes_service_account.cwagentservice.metadata[0].name
+    namespace = kubernetes_service_account.cwagentservice.metadata[0].namespace
+  }
+}
+# Runs the Go integration test against the cluster once the node group,
+# agent daemonset, role binding and service account have been created.
+resource "null_resource" "validator" {
+ depends_on = [
+ aws_eks_node_group.this,
+ kubernetes_daemonset.service,
+ kubernetes_cluster_role_binding.rolebinding,
+ kubernetes_service_account.cwagentservice,
+ ]
+
+ # The script changes to the directory four levels up and runs `go test` on
+ # var.test_dir up to 10 times, sleeping 60 seconds after each failure. It
+ # exits 0 on the first passing run and 1 if every attempt fails.
+ # NOTE(review): -eksGpuType=nvidia is passed although this module sets up a
+ # kueue test; confirm whether the flag is needed here.
+ provisioner "local-exec" {
+ command = <<-EOT
+ cd ../../../..
+ i=0
+ while [ $i -lt 10 ]; do
+ i=$((i+1))
+ go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia && exit 0
+ sleep 60
+ done
+ exit 1
+ EOT
+ }
+}
\ No newline at end of file
diff --git a/terraform/eks/daemon/kueue/providers.tf b/terraform/eks/daemon/kueue/providers.tf
new file mode 100644
index 000000000..9bd2885f5
--- /dev/null
+++ b/terraform/eks/daemon/kueue/providers.tf
@@ -0,0 +1,17 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+# AWS provider configured for the region given by var.region.
+provider "aws" {
+ region = var.region
+}
+
+# Kubernetes provider pointed at the EKS cluster created by this module.
+# Credentials are supplied both as a token from the aws_eks_cluster_auth data
+# source and through an `aws eks get-token` exec plugin.
+provider "kubernetes" {
+  host                   = aws_eks_cluster.this.endpoint
+  cluster_ca_certificate = base64decode(aws_eks_cluster.this.certificate_authority[0].data)
+  token                  = data.aws_eks_cluster_auth.this.token
+
+  exec {
+    api_version = "client.authentication.k8s.io/v1beta1"
+    command     = "aws"
+    args        = ["eks", "get-token", "--cluster-name", aws_eks_cluster.this.name]
+  }
+}
\ No newline at end of file
diff --git a/terraform/eks/daemon/kueue/variables.tf b/terraform/eks/daemon/kueue/variables.tf
new file mode 100644
index 000000000..6ffbea6ec
--- /dev/null
+++ b/terraform/eks/daemon/kueue/variables.tf
@@ -0,0 +1,32 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+# AWS region passed to the aws provider and the basic_components module.
+variable "region" {
+ type = string
+ default = "us-west-2"
+}
+# Go test package to run; also used to locate the test's resources directory.
+variable "test_dir" {
+ type = string
+ default = "./test/kueue"
+}
+# Image repository of the CloudWatch agent container.
+variable "cwagent_image_repo" {
+ type = string
+ default = "public.ecr.aws/cloudwatch-agent/cloudwatch-agent"
+}
+# Image tag of the CloudWatch agent container.
+variable "cwagent_image_tag" {
+ type = string
+ default = "latest"
+}
+# Kubernetes version of the EKS cluster.
+variable "k8s_version" {
+ type = string
+ default = "1.31"
+}
+
+# presumably the AMI type of the EKS node group; verify against its usage
+variable "ami_type" {
+ type = string
+ default = "AL2_x86_64"
+}
+
+# presumably the EC2 instance type of the EKS node group; verify against its usage
+variable "instance_type" {
+ type = string
+ default = "m5.large"
+}
\ No newline at end of file
diff --git a/test/kueue/kueue-metrics-test.go b/test/kueue/kueue-metrics-test.go
new file mode 100644
index 000000000..63926a088
--- /dev/null
+++ b/test/kueue/kueue-metrics-test.go
@@ -0,0 +1,95 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+//go:build !windows
+
+package keu
+
+import (
+ "time"
+
+ "github.com/aws/amazon-cloudwatch-agent-test/environment"
+ . "github.com/aws/amazon-cloudwatch-agent-test/test/kueue/resources"
+ "github.com/aws/amazon-cloudwatch-agent-test/test/metric"
+ "github.com/aws/amazon-cloudwatch-agent-test/test/status"
+ "github.com/aws/amazon-cloudwatch-agent-test/test/test_runner"
+)
+
+const (
+	// awsKueueMetricIndicator is the metric filter handed to
+	// metric.ValidateMetrics. Every metric this test expects is named
+	// "kueue_..."; the previous value "_neuron" occurs in none of those names.
+	// NOTE(review): assumes the filter is matched against metric names;
+	// confirm against metric.ValidateMetrics.
+	awsKueueMetricIndicator = "kueue_"
+)
+
+// expectedDimsToMetrics maps a dash-joined dimension set to the kueue metrics
+// that Validate passes to metric.ValidateMetrics for that set. Entries are
+// listed from the coarsest dimension set to the finest.
+var expectedDimsToMetrics = map[string][]string{
+	// Cluster-level dimension sets.
+	"ClusterName": {
+		KueuePendingWorkloads,
+		KueueEvictedWorkloadsTotal,
+		KueueAdmittedActiveWorkloads,
+		KueueClusterQueueResourceUsage,
+		KueueClusterQueueNominalQuota,
+	},
+	"ClusterName-Status": {
+		KueuePendingWorkloads,
+	},
+	"ClusterName-Reason": {
+		KueueEvictedWorkloadsTotal,
+	},
+
+	// Per-cluster-queue dimension sets.
+	"ClusterName-ClusterQueue": {
+		KueuePendingWorkloads,
+		KueueEvictedWorkloadsTotal,
+		KueueAdmittedActiveWorkloads,
+		KueueClusterQueueResourceUsage,
+	},
+	"ClusterName-ClusterQueue-Status": {
+		KueuePendingWorkloads,
+	},
+	"ClusterName-ClusterQueue-Reason": {
+		KueueEvictedWorkloadsTotal,
+	},
+
+	// Per-cluster-queue sets split by resource and/or flavor.
+	"ClusterName-ClusterQueue-Resource": {
+		KueueClusterQueueResourceUsage,
+		KueueClusterQueueNominalQuota,
+	},
+	"ClusterName-ClusterQueue-Flavor": {
+		KueueClusterQueueResourceUsage,
+		KueueClusterQueueNominalQuota,
+	},
+	"ClusterName-ClusterQueue-Resource-Flavor": {
+		KueueClusterQueueResourceUsage,
+		KueueClusterQueueNominalQuota,
+	},
+}
+
+// AwsKueueTestRunner is the test runner for the kueue metrics test. It embeds
+// test_runner.BaseTestRunner and adds the test name and environment metadata
+// used by its methods.
+type AwsKueueTestRunner struct {
+ test_runner.BaseTestRunner
+ // testName is the value returned by GetTestName.
+ testName string
+ // env is handed to the metric and log validators in Validate.
+ env *environment.MetaData
+}
+
+// Compile-time check that *AwsKueueTestRunner satisfies test_runner.ITestRunner.
+var _ test_runner.ITestRunner = (*AwsKueueTestRunner)(nil)
+
+func (t *AwsKueueTestRunner) Validate() status.TestGroupResult {
+ var testResults []status.TestResult
+ testResults = append(testResults, metric.ValidateMetrics(t.env, awsKueueMetricIndicator, expectedDimsToMetrics)...)
+ testResults = append(testResults, metric.ValidateLogs(t.env))
+ testResults = append(testResults, metric.ValidateLogsFrequency(t.env))
+ return status.TestGroupResult{
+ Name: t.GetTestName(),
+ TestResults: testResults,
+ }
+}
+
+// GetTestName returns the runner's test name; Validate uses it as the name of
+// the returned test group.
+func (t *AwsKueueTestRunner) GetTestName() string {
+ return t.testName
+}
+
+// GetAgentConfigFileName returns an empty string: this runner names no agent
+// config file.
+// NOTE(review): presumably the agent config comes from the Terraform-managed
+// config map instead; confirm.
+func (t *AwsKueueTestRunner) GetAgentConfigFileName() string {
+ return ""
+}
+
+// GetAgentRunDuration returns the agent run duration for this test, fixed at
+// 25 minutes.
+func (t *AwsKueueTestRunner) GetAgentRunDuration() time.Duration {
+ return 25 * time.Minute
+}
+
+// GetMeasuredMetrics returns nil; this runner does not list measured metrics
+// here (the expected metrics are in expectedDimsToMetrics).
+func (t *AwsKueueTestRunner) GetMeasuredMetrics() []string {
+ return nil
+}
diff --git a/test/kueue/resources/config.json b/test/kueue/resources/config.json
new file mode 100644
index 000000000..8c4153be5
--- /dev/null
+++ b/test/kueue/resources/config.json
@@ -0,0 +1,17 @@
+{
+ "agent": {
+ "metrics_collection_interval": 15,
+ "run_as_user": "root",
+ "debug": true,
+ "logfile": ""
+ },
+ "logs": {
+ "metrics_collected": {
+ "kubernetes": {
+ "enhanced_container_insights": true,
+ "kueue_container_insights":true
+ }
+ },
+ "force_flush_interval": 5
+ }
+}
\ No newline at end of file
diff --git a/test/kueue/resources/httpd.conf b/test/kueue/resources/httpd.conf
new file mode 100644
index 000000000..058db5063
--- /dev/null
+++ b/test/kueue/resources/httpd.conf
@@ -0,0 +1,101 @@
+
+ServerRoot "/usr/local/apache2"
+
+#Listen 9400
+
+LoadModule mpm_event_module modules/mod_mpm_event.so
+LoadModule authn_file_module modules/mod_authn_file.so
+LoadModule authn_core_module modules/mod_authn_core.so
+LoadModule authz_host_module modules/mod_authz_host.so
+LoadModule authz_groupfile_module modules/mod_authz_groupfile.so
+LoadModule authz_user_module modules/mod_authz_user.so
+LoadModule authz_core_module modules/mod_authz_core.so
+LoadModule access_compat_module modules/mod_access_compat.so
+LoadModule auth_basic_module modules/mod_auth_basic.so
+LoadModule socache_shmcb_module modules/mod_socache_shmcb.so
+LoadModule reqtimeout_module modules/mod_reqtimeout.so
+LoadModule filter_module modules/mod_filter.so
+LoadModule mime_module modules/mod_mime.so
+LoadModule log_config_module modules/mod_log_config.so
+LoadModule env_module modules/mod_env.so
+LoadModule headers_module modules/mod_headers.so
+LoadModule setenvif_module modules/mod_setenvif.so
+LoadModule version_module modules/mod_version.so
+LoadModule ssl_module modules/mod_ssl.so
+LoadModule unixd_module modules/mod_unixd.so
+LoadModule status_module modules/mod_status.so
+LoadModule autoindex_module modules/mod_autoindex.so
+LoadModule dir_module modules/mod_dir.so
+LoadModule alias_module modules/mod_alias.so
+
+
+User www-data
+Group www-data
+
+
+
+ AllowOverride none
+ Require all denied
+
+
+DocumentRoot "/usr/local/apache2/htdocs"
+
+ Options Indexes FollowSymLinks
+ AllowOverride None
+ Require all granted
+
+
+
+ DirectoryIndex index.html
+
+
+
+ Require all denied
+
+
+ErrorLog /proc/self/fd/2
+
+LogLevel warn
+
+
+ LogFormat "%h %l %u %t \"%r\" %>s %b \"%%{Referer}i\" \"%%{User-Agent}i\"" combined
+ LogFormat "%h %l %u %t \"%r\" %>s %b" common
+
+
+ # You need to enable mod_logio.c to use %I and %O
+ LogFormat "%h %l %u %t \"%r\" %>s %b \"%%{Referer}i\" \"%%{User-Agent}i\" %I %O" combinedio
+
+
+ CustomLog /proc/self/fd/1 common
+
+
+
+ ScriptAlias /cgi-bin/ "/usr/local/apache2/cgi-bin/"
+
+
+
+ AllowOverride None
+ Options None
+ Require all granted
+
+
+
+ RequestHeader unset Proxy early
+
+
+
+ TypesConfig conf/mime.types
+ AddType application/x-compress .Z
+ AddType application/x-gzip .gz .tgz
+
+
+
+Include conf/extra/proxy-html.conf
+
+
+# Secure (SSL/TLS) connections
+Include conf/extra/httpd-ssl.conf
+
+SSLRandomSeed startup builtin
+SSLRandomSeed connect builtin
+
\ No newline at end of file
diff --git a/test/kueue/resources/metrics_list.go b/test/kueue/resources/metrics_list.go
new file mode 100644
index 000000000..18e06c661
--- /dev/null
+++ b/test/kueue/resources/metrics_list.go
@@ -0,0 +1,12 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+package resources
+
+// Names of the kueue metrics referenced by the kueue test's expected
+// dimension-to-metric table.
+const (
+ // KueuePendingWorkloads is the "kueue_pending_workloads" metric name.
+ KueuePendingWorkloads = "kueue_pending_workloads"
+ // KueueEvictedWorkloadsTotal is the "kueue_evicted_workloads_total" metric name.
+ KueueEvictedWorkloadsTotal = "kueue_evicted_workloads_total"
+ // KueueAdmittedActiveWorkloads is the "kueue_admitted_active_workloads" metric name.
+ KueueAdmittedActiveWorkloads = "kueue_admitted_active_workloads"
+ // KueueClusterQueueResourceUsage is the "kueue_cluster_queue_resource_usage" metric name.
+ KueueClusterQueueResourceUsage = "kueue_cluster_queue_resource_usage"
+ // KueueClusterQueueNominalQuota is the "kueue_cluster_queue_nominal_quota" metric name.
+ KueueClusterQueueNominalQuota = "kueue_cluster_queue_nominal_quota"
+)
diff --git a/test/metric/container_insights_util.go b/test/metric/container_insights_util.go
index a61a3daf5..d35b0d616 100644
--- a/test/metric/container_insights_util.go
+++ b/test/metric/container_insights_util.go
@@ -181,6 +181,7 @@ func ValidateLogs(env *environment.MetaData) status.TestResult {
}
end := time.Now()
+
start := end.Add(time.Duration(-3) * time.Minute)
group := fmt.Sprintf("/aws/containerinsights/%s/performance", env.EKSClusterName)
@@ -225,6 +226,41 @@ func ValidateLogs(env *environment.MetaData) status.TestResult {
}
}
+	// Kueue logs are validated separately: their log stream is named
+	// "kubernetes-kueue" rather than after a node DNS name. The check runs only
+	// when the cluster name contains "kueue"; ideally it would key off whether
+	// kueue is actually enabled.
+	if strings.Contains(env.EKSClusterName, "kueue") {
+		const kueueStream = "kubernetes-kueue"
+		err = awsservice.ValidateLogs(
+			group,
+			kueueStream,
+			&start,
+			&end,
+			awsservice.AssertLogsNotEmpty(),
+			//awsservice.AssertNoDuplicateLogs(),
+			awsservice.AssertPerLog(
+				awsservice.AssertLogSchema(func(message string) (string, error) {
+					var fields map[string]json.RawMessage
+					if innerErr := json.Unmarshal([]byte(message), &fields); innerErr != nil {
+						return "", fmt.Errorf("failed to unmarshal log file: %w", innerErr)
+					}
+
+					// Choose the schema from the metric the entry carries. Entries
+					// with kueue_pending_workloads use the usage schema; everything
+					// else uses the quota/usage schema. The keys are the ones under
+					// which the schemas are registered in EksClusterValidationMap.
+					// NOTE(review): entries carrying only other kueue metrics would
+					// need schemas of their own; confirm against real log output.
+					schemaKey := "NodeKueue"
+					if _, hasPending := fields["kueue_pending_workloads"]; hasPending {
+						schemaKey = "NodeKueueUsage"
+					}
+					jsonSchema, ok := eks_resources.EksClusterValidationMap[schemaKey]
+					if !ok {
+						return "", errors.New("invalid cluster type provided")
+					}
+					return jsonSchema, nil
+				}),
+				awsservice.AssertLogContainsSubstring(fmt.Sprintf("\"ClusterName\":\"%s\"", env.EKSClusterName)),
+			),
+		)
+		// Do not fall through on failure: returning here skips the assignment
+		// that marks the result SUCCESSFUL.
+		if err != nil {
+			log.Printf("log validation (%s/%s) failed: %v", group, kueueStream, err)
+			return testResult
+		}
+	}
+
testResult.Status = status.SUCCESSFUL
return testResult
}
diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/cluster_kueue.json b/test/metric_value_benchmark/eks_resources/test_schemas/cluster_kueue.json
new file mode 100644
index 000000000..1f65ef83a
--- /dev/null
+++ b/test/metric_value_benchmark/eks_resources/test_schemas/cluster_kueue.json
@@ -0,0 +1,28 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "title": "structured log schema",
+ "description": "json schema for the cloudwatch agent k8s structured log",
+ "type": "object",
+ "properties": {
+ "ClusterName": {},
+ "ClusterQueue": {},
+ "Flavor": {},
+ "NodeName": {},
+ "Resource": {},
+ "Timestamp": {},
+ "Version": {},
+ "kueue_cluster_queue_nominal_quota": {},
+ "kueue_cluster_queue_resource_usage": {}
+ },
+ "required": [
+ "ClusterName",
+ "ClusterQueue",
+ "Flavor",
+ "NodeName",
+ "Resource",
+ "Timestamp",
+ "Version",
+ "kueue_cluster_queue_nominal_quota",
+ "kueue_cluster_queue_resource_usage"
+ ]
+}
\ No newline at end of file
diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/cluster_kueue_usage.json b/test/metric_value_benchmark/eks_resources/test_schemas/cluster_kueue_usage.json
new file mode 100644
index 000000000..e5c73217c
--- /dev/null
+++ b/test/metric_value_benchmark/eks_resources/test_schemas/cluster_kueue_usage.json
@@ -0,0 +1,24 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "title": "structured log schema",
+ "description": "json schema for the cloudwatch agent k8s structured log",
+ "type": "object",
+ "properties": {
+ "ClusterName": {},
+ "ClusterQueue": {},
+ "NodeName": {},
+ "Status": {},
+ "Timestamp": {},
+ "Version": {},
+ "kueue_pending_workloads": {}
+ },
+ "required": [
+ "ClusterName",
+ "ClusterQueue",
+ "NodeName",
+ "Status",
+ "Timestamp",
+ "Version",
+ "kueue_pending_workloads"
+ ]
+}
\ No newline at end of file
diff --git a/test/metric_value_benchmark/eks_resources/util.go b/test/metric_value_benchmark/eks_resources/util.go
index bd737f85b..d44685333 100644
--- a/test/metric_value_benchmark/eks_resources/util.go
+++ b/test/metric_value_benchmark/eks_resources/util.go
@@ -56,6 +56,10 @@ var (
eksNodeNeuronDeviceSchema string
//go:embed test_schemas/node_neuron.json
eksNodeNeuronSchema string
+ //go:embed test_schemas/cluster_kueue.json
+ eksNodeKueueSchema string
+ //go:embed test_schemas/cluster_kueue_usage.json
+ eksNodeKueueUsageSchema string
EksClusterValidationMap = map[string]string{
"Cluster": eksClusterSchema,
@@ -81,6 +85,8 @@ var (
"NodeAWSNeuronCore": eksNodeNeuronCoreSchema,
"NodeAWSNeuronDevice": eksNodeNeuronDeviceSchema,
"NodeAWSNeuron": eksNodeNeuronSchema,
+ "NodeKueue": eksNodeKueueSchema,
+ "NodeKueueUsage": eksNodeKueueUsageSchema,
}
EksClusterFrequencyValidationMap = map[string]int{