From 69d7756414fc49fe3ecfce2e2ba633bda316e556 Mon Sep 17 00:00:00 2001 From: Srikanth Nemana Date: Fri, 8 Nov 2024 08:40:51 -0800 Subject: [PATCH] kueue metrics integration tests --- generator/test_case_generator.go | 4 + terraform/eks/daemon/kueue/main.tf | 738 ++++++++++++++++++ terraform/eks/daemon/kueue/providers.tf | 17 + terraform/eks/daemon/kueue/variables.tf | 32 + test/awsneuron/resources/metrics_list.go | 2 +- test/kueue/kueue-metrics-test.go | 94 +++ test/kueue/resources/config.json | 17 + test/kueue/resources/httpd.conf | 101 +++ test/kueue/resources/metrics_list.go | 12 + test/metric/container_insights_util.go | 35 + .../test_schemas/cluster_kueue.json | 28 + .../test_schemas/cluster_kueue_usage.json | 24 + .../eks_resources/util.go | 7 + util/awsservice/eks.go | 1 + 14 files changed, 1111 insertions(+), 1 deletion(-) create mode 100644 terraform/eks/daemon/kueue/main.tf create mode 100644 terraform/eks/daemon/kueue/providers.tf create mode 100644 terraform/eks/daemon/kueue/variables.tf create mode 100644 test/kueue/kueue-metrics-test.go create mode 100644 test/kueue/resources/config.json create mode 100644 test/kueue/resources/httpd.conf create mode 100644 test/kueue/resources/metrics_list.go create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/cluster_kueue.json create mode 100644 test/metric_value_benchmark/eks_resources/test_schemas/cluster_kueue_usage.json diff --git a/generator/test_case_generator.go b/generator/test_case_generator.go index a0021e438..69df18dd0 100644 --- a/generator/test_case_generator.go +++ b/generator/test_case_generator.go @@ -227,6 +227,10 @@ var testTypeToTestConfig = map[string][]testConfig{ testDir: "./test/awsneuron", terraformDir: "terraform/eks/daemon/awsneuron", targets: map[string]map[string]struct{}{"arc": {"amd64": {}}}, }, + { + testDir: "./test/kueue", terraformDir: "terraform/eks/daemon/kueue", + targets: map[string]map[string]struct{}{"arc": {"amd64": {}}}, + }, }, "eks_deployment": { 
{testDir: "./test/metric_value_benchmark"}, diff --git a/terraform/eks/daemon/kueue/main.tf b/terraform/eks/daemon/kueue/main.tf new file mode 100644 index 000000000..3ab43c063 --- /dev/null +++ b/terraform/eks/daemon/kueue/main.tf @@ -0,0 +1,738 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +module "common" { + source = "../../../common" + cwagent_image_repo = var.cwagent_image_repo + cwagent_image_tag = var.cwagent_image_tag +} + +module "basic_components" { + source = "../../../basic_components" + + region = var.region +} + +data "aws_eks_cluster_auth" "this" { + name = aws_eks_cluster.this.name +} + +resource "aws_eks_cluster" "this" { + name = "cwagent-eks-integ-${module.common.testing_id}" + role_arn = module.basic_components.role_arn + version = var.k8s_version + enabled_cluster_log_types = [ + "api", + "audit", + "authenticator", + "controllerManager", + "scheduler" + ] + vpc_config { + subnet_ids = module.basic_components.public_subnet_ids + security_group_ids = [module.basic_components.security_group] + } +} + +# EKS Node Groups +resource "aws_eks_node_group" "this" { + cluster_name = aws_eks_cluster.this.name + node_group_name = "cwagent-eks-integ-node" + node_role_arn = aws_iam_role.node_role.arn + subnet_ids = module.basic_components.public_subnet_ids + + scaling_config { + desired_size = 1 + max_size = 1 + min_size = 1 + } + + ami_type = var.ami_type + capacity_type = "ON_DEMAND" + disk_size = 20 + instance_types = [var.instance_type] + + depends_on = [ + aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly, + aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy, + aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy, + aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy + ] +} + +# EKS Node IAM Role +resource "aws_iam_role" "node_role" { + name = "cwagent-eks-Worker-Role-${module.common.testing_id}" + assume_role_policy = jsonencode({ + Version = 
"2012-10-17", + Statement = [ + { + Effect = "Allow", + Principal = { + Service = "ec2.amazonaws.com" + }, + Action = "sts:AssumeRole" + } + ] + }) + +} + +resource "aws_iam_role_policy_attachment" "node_AmazonEKSWorkerNodePolicy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" + role = aws_iam_role.node_role.name +} + +resource "aws_iam_role_policy_attachment" "node_AmazonEKS_CNI_Policy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy" + role = aws_iam_role.node_role.name +} + +resource "aws_iam_role_policy_attachment" "node_AmazonEC2ContainerRegistryReadOnly" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" + role = aws_iam_role.node_role.name +} + +resource "aws_iam_role_policy_attachment" "node_CloudWatchAgentServerPolicy" { + policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy" + role = aws_iam_role.node_role.name +} + +# TODO: these security groups be created once and then reused +# EKS Cluster Security Group +resource "aws_security_group" "eks_cluster_sg" { + name = "cwagent-eks-cluster-sg-${module.common.testing_id}" + description = "Cluster communication with worker nodes" + vpc_id = module.basic_components.vpc_id +} + +resource "aws_security_group_rule" "cluster_inbound" { + description = "Allow worker nodes to communicate with the cluster API Server" + from_port = 443 + protocol = "tcp" + security_group_id = aws_security_group.eks_cluster_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 443 + type = "ingress" +} + +resource "aws_security_group_rule" "cluster_outbound" { + description = "Allow cluster API Server to communicate with the worker nodes" + from_port = 1024 + protocol = "tcp" + security_group_id = aws_security_group.eks_cluster_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 65535 + type = "egress" +} + + +# EKS Node Security Group +resource "aws_security_group" "eks_nodes_sg" { + name = 
"cwagent-eks-node-sg-${module.common.testing_id}" + description = "Security group for all nodes in the cluster" + vpc_id = module.basic_components.vpc_id + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_security_group_rule" "nodes_internal" { + description = "Allow nodes to communicate with each other" + from_port = 0 + protocol = "-1" + security_group_id = aws_security_group.eks_nodes_sg.id + source_security_group_id = aws_security_group.eks_nodes_sg.id + to_port = 65535 + type = "ingress" +} + +resource "aws_security_group_rule" "nodes_cluster_inbound" { + description = "Allow worker Kubelets and pods to receive communication from the cluster control plane" + from_port = 1025 + protocol = "tcp" + security_group_id = aws_security_group.eks_nodes_sg.id + source_security_group_id = aws_security_group.eks_cluster_sg.id + to_port = 65535 + type = "ingress" +} + + +# create cert for communication between agent and dcgm +resource "tls_private_key" "private_key" { + algorithm = "RSA" +} + +resource "local_file" "ca_key" { + content = tls_private_key.private_key.private_key_pem + filename = "${path.module}/certs/ca.key" +} + +resource "tls_self_signed_cert" "ca_cert" { + private_key_pem = tls_private_key.private_key.private_key_pem + is_ca_certificate = true + subject { + common_name = "dcgm-exporter-service.amazon-cloudwatch.svc" + organization = "Amazon CloudWatch Agent" + } + validity_period_hours = 24 + allowed_uses = [ + "digital_signature", + "key_encipherment", + "cert_signing", + "crl_signing", + "server_auth", + "client_auth", + ] +} + +resource "local_file" "ca_cert_file" { + content = tls_self_signed_cert.ca_cert.cert_pem + filename = "${path.module}/certs/ca.cert" +} + +resource "tls_private_key" "server_private_key" { + algorithm = "RSA" +} + +resource "local_file" "server_key" { + content = tls_private_key.server_private_key.private_key_pem + filename = "${path.module}/certs/server.key" +} + 
+resource "tls_cert_request" "local_csr" { + private_key_pem = tls_private_key.server_private_key.private_key_pem + dns_names = ["localhost", "127.0.0.1", "kueue-exporter-service.amazon-cloudwatch.svc"] + subject { + common_name = "kueue-exporter-service.amazon-cloudwatch.svc" + organization = "Amazon CloudWatch Agent" + } +} + +resource "tls_locally_signed_cert" "server_cert" { + cert_request_pem = tls_cert_request.local_csr.cert_request_pem + ca_private_key_pem = tls_private_key.private_key.private_key_pem + ca_cert_pem = tls_self_signed_cert.ca_cert.cert_pem + validity_period_hours = 12 + allowed_uses = [ + "digital_signature", + "key_encipherment", + "server_auth", + "client_auth", + ] +} + +resource "local_file" "server_cert_file" { + content = tls_locally_signed_cert.server_cert.cert_pem + filename = "${path.module}/certs/server.cert" +} + +resource "kubernetes_secret" "agent_cert" { + metadata { + name = "amazon-cloudwatch-observability-agent-cert" + namespace = "amazon-cloudwatch" + } + data = { + "ca.crt" = tls_self_signed_cert.ca_cert.cert_pem #filebase64(local_file.ca_cert_file.filename) + "tls.crt" = tls_locally_signed_cert.server_cert.cert_pem #filebase64(local_file.server_cert_file.filename) + "tls.key" = tls_private_key.server_private_key.private_key_pem #filebase64(local_file.server_key.filename) + } +} + + +resource "kubernetes_namespace" "namespace" { + metadata { + name = "amazon-cloudwatch" + } +} + +# dummy daemonset that simulates a kueue metrics exporter: httpd serves a static set of kueue_* metrics on port 9400 +resource "kubernetes_daemonset" "exporter" { + depends_on = [ + kubernetes_namespace.namespace, + kubernetes_service_account.cwagentservice, + aws_eks_node_group.this, + kubernetes_config_map.httpdconfig, + ] + metadata { + name = "kueue-exporter" + namespace = "amazon-cloudwatch" + labels = { + k8s-app = "kueue-exporter" + } + } + spec { + selector { + match_labels = { + "k8s-app" = "kueue-exporter" + } + } + template { + metadata { + labels = { + "name" : 
"kueue-exporter" + "k8s-app" : "kueue-exporter" + } + } + spec { + node_selector = { + "kubernetes.io/os" : "linux" + } + container { + name = "kueue-exporter" + image = "httpd:2.4-alpine" + resources { + limits = { + "cpu" : "50m", + "memory" : "50Mi" + } + requests = { + "cpu" : "50m", + "memory" : "50Mi" + } + } + port { + name = "metrics" + container_port = 9400 + host_port = 9400 + protocol = "TCP" + } + command = [ + "/bin/sh", + "-c", + ] + args = [ + "bin/echo 'kueue_pending_workloads{queue="default",namespace="kueue-system"} 3 + kueue_pending_workloads{queue="high-priority",namespace="kueue-system"} 1 + kueue_evicted_workloads_total{queue="default",namespace="kueue-system"} 5 + kueue_evicted_workloads_total{queue="high-priority",namespace="kueue-system"} 0 + kueue_admitted_active_workloads{queue="default",namespace="kueue-system"} 7 + kueue_admitted_active_workloads{queue="high-priority",namespace="kueue-system"} 2 + kueue_cluster_queue_resource_usage{queue="default",resource="cpu",namespace="kueue-system"} 75 + kueue_cluster_queue_resource_usage{queue="default",resource="memory",namespace="kueue-system"} 60 + kueue_cluster_queue_resource_usage{queue="high-priority",resource="cpu",namespace="kueue-system"} 90 + kueue_cluster_queue_resource_usage{queue="high-priority",resource="memory",namespace="kueue-system"} 80 + kueue_cluster_queue_nominal_quota{queue="default",resource="cpu",namespace="kueue-system"} 100 + kueue_cluster_queue_nominal_quota{queue="default",resource="memory",namespace="kueue-system"} 100 + kueue_cluster_queue_nominal_quota{queue="high-priority",resource="cpu",namespace="kueue-system"} 200 + kueue_cluster_queue_nominal_quota{queue="high-priority",resource="memory",namespace="kueue-system"} 200' + >> /usr/local/apache2/htdocs/metrics && sed -i -e \"s/hostname1/$HOST_NAME/g\" /usr/local/apache2/htdocs/metrics && httpd-foreground -k restart" + ] + volume_mount { + mount_path = "/etc/amazon-cloudwatch-observability-kueue-cert" + name = 
"kueuetls" + read_only = true + } + volume_mount { + mount_path = "/usr/local/apache2/conf/httpd.conf" + sub_path = "httpd.conf" + name = "httpdconfig" + read_only = true + } + volume_mount { + mount_path = "/usr/local/apache2/conf/extra/httpd-ssl.conf" + sub_path = "httpd-ssl.conf" + name = "httpdconfig" + read_only = true + } + env { + name = "HOST_IP" + value_from { + field_ref { + field_path = "status.hostIP" + } + } + } + env { + name = "HOST_NAME" + value_from { + field_ref { + field_path = "spec.nodeName" + } + } + } + env { + name = "K8S_NAMESPACE" + value_from { + field_ref { + field_path = "metadata.namespace" + } + } + } + } + volume { + name = "kueuetls" # must match the name referenced by the container's TLS volume_mount + secret { + secret_name = "amazon-cloudwatch-observability-agent-cert" + items { + key = "tls.crt" + path = "server.crt" + } + items { + key = "tls.key" + path = "server.key" + } + } + } + volume { + name = "httpdconfig" + config_map { + name = "httpdconfig" + } + } + service_account_name = "cloudwatch-agent" + termination_grace_period_seconds = 60 + } + } + } +} + +resource "kubernetes_service" "exporter" { + depends_on = [ + kubernetes_namespace.namespace, + kubernetes_service_account.cwagentservice, + aws_eks_node_group.this, + kubernetes_daemonset.exporter + ] + metadata { + name = "kueue-exporter-service" + namespace = "amazon-cloudwatch" + labels = { + "k8s-app" : "kueue-exporter-service" + } + annotations = { + "prometheus.io/scrape" : "true" + } + } + spec { + type = "ClusterIP" + selector = { + k8s-app = "kueue-exporter" + } + port { + name = "metrics" + port = 9400 + target_port = 9400 + protocol = "TCP" + } + } +} + +resource "kubernetes_daemonset" "service" { + depends_on = [ + kubernetes_namespace.namespace, + kubernetes_service_account.cwagentservice, + aws_eks_node_group.this, + kubernetes_service.exporter + ] + metadata { + name = "cloudwatch-agent" + namespace = "amazon-cloudwatch" + } + spec { + selector { + match_labels = { + 
"name" : "cloudwatch-agent" + } + } + template { + metadata 
{ + labels = { + "name" : "cloudwatch-agent" + } + } + spec { + node_selector = { + "kubernetes.io/os" : "linux" + } + container { + name = "cwagent" + image = "${var.cwagent_image_repo}:${var.cwagent_image_tag}" + image_pull_policy = "Always" + resources { + limits = { + "cpu" : "200m", + "memory" : "200Mi" + } + requests = { + "cpu" : "200m", + "memory" : "200Mi" + } + } + port { + container_port = 25888 + host_port = 25888 + protocol = "UDP" + } + env { + name = "HOST_IP" + value_from { + field_ref { + field_path = "status.hostIP" + } + } + } + env { + name = "HOST_NAME" + value_from { + field_ref { + field_path = "spec.nodeName" + } + } + } + env { + name = "K8S_NAMESPACE" + value_from { + field_ref { + field_path = "metadata.namespace" + } + } + } + volume_mount { + mount_path = "/etc/cwagentconfig" + name = "cwagentconfig" + } + volume_mount { + mount_path = "/rootfs" + name = "rootfs" + read_only = true + } + volume_mount { + mount_path = "/var/run/docker.sock" + name = "dockersock" + read_only = true + } + volume_mount { + mount_path = "/var/lib/docker" + name = "varlibdocker" + read_only = true + } + volume_mount { + mount_path = "/run/containerd/containerd.sock" + name = "containerdsock" + read_only = true + } + volume_mount { + mount_path = "/sys" + name = "sys" + read_only = true + } + volume_mount { + mount_path = "/dev/disk" + name = "devdisk" + read_only = true + } + volume_mount { + mount_path = "/etc/amazon-cloudwatch-observability-agent-cert" + name = "agenttls" + read_only = true + } + } + volume { + name = "cwagentconfig" + config_map { + name = "cwagentconfig" + } + } + volume { + name = "rootfs" + host_path { + path = "/" + } + } + volume { + name = "dockersock" + host_path { + path = "/var/run/docker.sock" + } + } + volume { + name = "varlibdocker" + host_path { + path = "/var/lib/docker" + } + } + volume { + name = "containerdsock" + host_path { + path = "/run/containerd/containerd.sock" + } + } + volume { + name = "sys" + host_path { + path 
= "/sys" + } + } + volume { + name = "devdisk" + host_path { + path = "/dev/disk" + } + } + volume { + name = "agenttls" + secret { + secret_name = "amazon-cloudwatch-observability-agent-cert" + items { + key = "ca.crt" + path = "tls-ca.crt" + } + } + } + service_account_name = "cloudwatch-agent" + termination_grace_period_seconds = 60 + } + } + } +} + +########################################## +# Template Files +########################################## +locals { + httpd_config = "../../../../${var.test_dir}/resources/httpd.conf" + httpd_ssl_config = "../../../../${var.test_dir}/resources/httpd-ssl.conf" + cwagent_config = fileexists("../../../../${var.test_dir}/resources/config.json") ? "../../../../${var.test_dir}/resources/config.json" : "../default_resources/default_amazon_cloudwatch_agent.json" +} + +data "template_file" "cwagent_config" { + template = file(local.cwagent_config) + vars = { + } +} + +resource "kubernetes_config_map" "cwagentconfig" { + depends_on = [ + kubernetes_namespace.namespace, + kubernetes_service_account.cwagentservice + ] + metadata { + name = "cwagentconfig" + namespace = "amazon-cloudwatch" + } + data = { + "cwagentconfig.json" : data.template_file.cwagent_config.rendered + } +} + +data "template_file" "httpd_config" { + template = file(local.httpd_config) + vars = {} +} +data "template_file" "httpd_ssl_config" { + template = file(local.httpd_ssl_config) + vars = {} +} + +resource "kubernetes_config_map" "httpdconfig" { + depends_on = [ + kubernetes_namespace.namespace, + kubernetes_service_account.cwagentservice + ] + metadata { + name = "httpdconfig" + namespace = "amazon-cloudwatch" + } + data = { + "httpd.conf" : data.template_file.httpd_config.rendered + "httpd-ssl.conf" : data.template_file.httpd_ssl_config.rendered + } +} + +resource "kubernetes_service_account" "cwagentservice" { + depends_on = [kubernetes_namespace.namespace] + metadata { + name = "cloudwatch-agent" + namespace = "amazon-cloudwatch" + } +} + +resource 
"kubernetes_cluster_role" "clusterrole" { + depends_on = [kubernetes_namespace.namespace] + metadata { + name = "cloudwatch-agent-role" + } + rule { + verbs = ["get", "list", "watch"] + resources = ["pods", "pods/logs", "nodes", "nodes/proxy", "namespaces", "endpoints"] + api_groups = [""] + } + rule { + verbs = ["list", "watch"] + resources = ["replicasets"] + api_groups = ["apps"] + } + rule { + verbs = ["list", "watch"] + resources = ["jobs"] + api_groups = ["batch"] + } + rule { + verbs = ["get"] + resources = ["nodes/proxy"] + api_groups = [""] + } + rule { + verbs = ["create"] + resources = ["nodes/stats", "configmaps", "events"] + api_groups = [""] + } + rule { + verbs = ["get", "update"] + resource_names = ["cwagent-clusterleader"] + resources = ["configmaps"] + api_groups = [""] + } + rule { + verbs = ["list", "watch"] + resources = ["services"] + api_groups = [""] + } + rule { + non_resource_urls = ["/metrics"] + verbs = ["get", "list", "watch"] + } +} + +resource "kubernetes_cluster_role_binding" "rolebinding" { + depends_on = [kubernetes_namespace.namespace] + metadata { + name = "cloudwatch-agent-role-binding" + } + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = "cloudwatch-agent-role" + } + subject { + kind = "ServiceAccount" + name = "cloudwatch-agent" + namespace = "amazon-cloudwatch" + } +} +resource "null_resource" "validator" { + depends_on = [ + aws_eks_node_group.this, + kubernetes_daemonset.service, + kubernetes_cluster_role_binding.rolebinding, + kubernetes_service_account.cwagentservice, + ] + + provisioner "local-exec" { + command = <<-EOT + cd ../../../.. 
+ i=0 + while [ $i -lt 10 ]; do + i=$((i+1)) + go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia && exit 0 + sleep 60 + done + exit 1 + EOT + } +} \ No newline at end of file diff --git a/terraform/eks/daemon/kueue/providers.tf b/terraform/eks/daemon/kueue/providers.tf new file mode 100644 index 000000000..9bd2885f5 --- /dev/null +++ b/terraform/eks/daemon/kueue/providers.tf @@ -0,0 +1,17 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +provider "aws" { + region = var.region +} + +provider "kubernetes" { + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + args = ["eks", "get-token", "--cluster-name", aws_eks_cluster.this.name] + } + host = aws_eks_cluster.this.endpoint + cluster_ca_certificate = base64decode(aws_eks_cluster.this.certificate_authority.0.data) + token = data.aws_eks_cluster_auth.this.token +} \ No newline at end of file diff --git a/terraform/eks/daemon/kueue/variables.tf b/terraform/eks/daemon/kueue/variables.tf new file mode 100644 index 000000000..dd54f92fb --- /dev/null +++ b/terraform/eks/daemon/kueue/variables.tf @@ -0,0 +1,32 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ // SPDX-License-Identifier: MIT + variable "region" { + type = string + default = "us-west-2" + } + variable "test_dir" { + type = string + default = "./test/kueue" + } + variable "cwagent_image_repo" { + type = string + default = "public.ecr.aws/cloudwatch-agent/cloudwatch-agent" + } + variable "cwagent_image_tag" { + type = string + default = "latest" + } + variable "k8s_version" { + type = string + default = "1.31" + } + + variable "ami_type" { + type = string + default = "AL2_x86_64" + } + + variable "instance_type" { + type = string + default = "m5.large" + } \ No newline at end of file diff --git a/test/awsneuron/resources/metrics_list.go b/test/awsneuron/resources/metrics_list.go index ce4094597..b679a5828 100644 --- a/test/awsneuron/resources/metrics_list.go +++ b/test/awsneuron/resources/metrics_list.go @@ -45,4 +45,4 @@ const ( NodeExecutionStatusFailedToQueue = "node_neuron_execution_status_failed_to_queue" NodeNeuronDeviceRuntimeMemoryUsed = "node_neurondevice_runtime_memory_used_bytes" NodeNeuronExecutionLatency = "node_neuron_execution_latency" -) +) \ No newline at end of file diff --git a/test/kueue/kueue-metrics-test.go b/test/kueue/kueue-metrics-test.go new file mode 100644 index 000000000..42f3d4ae2 --- /dev/null +++ b/test/kueue/kueue-metrics-test.go @@ -0,0 +1,94 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +//go:build !windows + +package keu + +import ( + "time" + "github.com/aws/amazon-cloudwatch-agent-test/environment" + . 
"github.com/aws/amazon-cloudwatch-agent-test/test/kueue/resources" + "github.com/aws/amazon-cloudwatch-agent-test/test/metric" + "github.com/aws/amazon-cloudwatch-agent-test/test/status" + "github.com/aws/amazon-cloudwatch-agent-test/test/test_runner" +) + +const ( + awsKueueMetricIndicator = "_neuron" +) + +var expectedDimsToMetrics = map[string][]string{ + "ClusterName-ClusterQueue-Status": { + KueuePendingWorkloads, + }, + "ClusterName-ClusterQueue": { + KueuePendingWorkloads, + KueueEvictedWorkloadsTotal, + KueueAdmittedActiveWorkloads, + KueueClusterQueueResourceUsage, + }, + "ClusterName-Status": { + KueuePendingWorkloads, + }, + "ClusterName": { + KueuePendingWorkloads, + KueueEvictedWorkloadsTotal, + KueueAdmittedActiveWorkloads, + KueueClusterQueueResourceUsage, + KueueClusterQueueNominalQuota, + }, + "ClusterName-Reason": { + KueueEvictedWorkloadsTotal, + }, + "ClusterName-ClusterQueue-Reason": { + KueueEvictedWorkloadsTotal, + }, + "ClusterName-ClusterQueue-Resource-Flavor": { + KueueClusterQueueResourceUsage, + KueueClusterQueueNominalQuota, + }, + "ClusterName-ClusterQueue-Resource": { + KueueClusterQueueResourceUsage, + KueueClusterQueueNominalQuota, + }, + "ClusterName-ClusterQueue-Flavor": { + KueueClusterQueueResourceUsage, + KueueClusterQueueNominalQuota, + }, +} + +type AwsKueueTestRunner struct { + test_runner.BaseTestRunner + testName string + env *environment.MetaData +} + +var _ test_runner.ITestRunner = (*AwsKueueTestRunner)(nil) + +func (t *AwsKueueTestRunner) Validate() status.TestGroupResult { + var testResults []status.TestResult + testResults = append(testResults, metric.ValidateMetrics(t.env, awsKueueMetricIndicator, expectedDimsToMetrics)...) 
+ testResults = append(testResults, metric.ValidateLogs(t.env)) + testResults = append(testResults, metric.ValidateLogsFrequency(t.env)) + return status.TestGroupResult{ + Name: t.GetTestName(), + TestResults: testResults, + } +} + +func (t *AwsKueueTestRunner) GetTestName() string { + return t.testName +} + +func (t *AwsKueueTestRunner) GetAgentConfigFileName() string { + return "" +} + +func (t *AwsKueueTestRunner) GetAgentRunDuration() time.Duration { + return 25 * time.Minute +} + +func (t *AwsKueueTestRunner) GetMeasuredMetrics() []string { + return nil +} diff --git a/test/kueue/resources/config.json b/test/kueue/resources/config.json new file mode 100644 index 000000000..8c4153be5 --- /dev/null +++ b/test/kueue/resources/config.json @@ -0,0 +1,17 @@ +{ + "agent": { + "metrics_collection_interval": 15, + "run_as_user": "root", + "debug": true, + "logfile": "" + }, + "logs": { + "metrics_collected": { + "kubernetes": { + "enhanced_container_insights": true, + "kueue_container_insights":true + } + }, + "force_flush_interval": 5 + } +} \ No newline at end of file diff --git a/test/kueue/resources/httpd.conf b/test/kueue/resources/httpd.conf new file mode 100644 index 000000000..058db5063 --- /dev/null +++ b/test/kueue/resources/httpd.conf @@ -0,0 +1,101 @@ + +ServerRoot "/usr/local/apache2" + +#Listen 9400 + +LoadModule mpm_event_module modules/mod_mpm_event.so +LoadModule authn_file_module modules/mod_authn_file.so +LoadModule authn_core_module modules/mod_authn_core.so +LoadModule authz_host_module modules/mod_authz_host.so +LoadModule authz_groupfile_module modules/mod_authz_groupfile.so +LoadModule authz_user_module modules/mod_authz_user.so +LoadModule authz_core_module modules/mod_authz_core.so +LoadModule access_compat_module modules/mod_access_compat.so +LoadModule auth_basic_module modules/mod_auth_basic.so +LoadModule socache_shmcb_module modules/mod_socache_shmcb.so +LoadModule reqtimeout_module modules/mod_reqtimeout.so +LoadModule filter_module 
modules/mod_filter.so +LoadModule mime_module modules/mod_mime.so +LoadModule log_config_module modules/mod_log_config.so +LoadModule env_module modules/mod_env.so +LoadModule headers_module modules/mod_headers.so +LoadModule setenvif_module modules/mod_setenvif.so +LoadModule version_module modules/mod_version.so +LoadModule ssl_module modules/mod_ssl.so +LoadModule unixd_module modules/mod_unixd.so +LoadModule status_module modules/mod_status.so +LoadModule autoindex_module modules/mod_autoindex.so +LoadModule dir_module modules/mod_dir.so +LoadModule alias_module modules/mod_alias.so + + +User www-data +Group www-data + + + + AllowOverride none + Require all denied + + +DocumentRoot "/usr/local/apache2/htdocs" + + Options Indexes FollowSymLinks + AllowOverride None + Require all granted + + + + DirectoryIndex index.html + + + + Require all denied + + +ErrorLog /proc/self/fd/2 + +LogLevel warn + + + LogFormat "%h %l %u %t \"%r\" %>s %b \"%%{Referer}i\" \"%%{User-Agent}i\"" combined + LogFormat "%h %l %u %t \"%r\" %>s %b" common + + + # You need to enable mod_logio.c to use %I and %O + LogFormat "%h %l %u %t \"%r\" %>s %b \"%%{Referer}i\" \"%%{User-Agent}i\" %I %O" combinedio + + + CustomLog /proc/self/fd/1 common + + + + ScriptAlias /cgi-bin/ "/usr/local/apache2/cgi-bin/" + + + + AllowOverride None + Options None + Require all granted + + + + RequestHeader unset Proxy early + + + + TypesConfig conf/mime.types + AddType application/x-compress .Z + AddType application/x-gzip .gz .tgz + + + +Include conf/extra/proxy-html.conf + + +# Secure (SSL/TLS) connections +Include conf/extra/httpd-ssl.conf + +SSLRandomSeed startup builtin +SSLRandomSeed connect builtin + \ No newline at end of file diff --git a/test/kueue/resources/metrics_list.go b/test/kueue/resources/metrics_list.go new file mode 100644 index 000000000..d2894d136 --- /dev/null +++ b/test/kueue/resources/metrics_list.go @@ -0,0 +1,12 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT + +package resources + +const ( + KueuePendingWorkloads = "kueue_pending_workloads" + KueueEvictedWorkloadsTotal = "kueue_evicted_workloads_total" + KueueAdmittedActiveWorkloads = "kueue_admitted_active_workloads" + KueueClusterQueueResourceUsage = "kueue_cluster_queue_resource_usage" + KueueClusterQueueNominalQuota = "kueue_cluster_queue_nominal_quota" +) \ No newline at end of file diff --git a/test/metric/container_insights_util.go b/test/metric/container_insights_util.go index a61a3daf5..a05801e39 100644 --- a/test/metric/container_insights_util.go +++ b/test/metric/container_insights_util.go @@ -225,6 +225,41 @@ func ValidateLogs(env *environment.MetaData) status.TestResult { } } + // test for kueue metrics (the log group name is not a DNS address, it is kubernetes-kueue) + // ideally we only want to do this if kueue is enabled + if strings.EqualFold(env.EKSClusterName, "cwagent-eks-integ-kueue") { + err = awsservice.ValidateLogs( + group, + "kubernetes-kueue", + &start, + &end, + awsservice.AssertLogsNotEmpty(), + //awsservice.AssertNoDuplicateLogs(), + awsservice.AssertPerLog( + awsservice.AssertLogSchema(func(message string) (string, error) { + var eksClusterType awsservice.EKSClusterType + innerErr := json.Unmarshal([]byte(message), &eksClusterType) + if innerErr != nil { + return "", fmt.Errorf("failed to unmarshal log file: %w", innerErr) + } + + log.Printf("eksClusterType is: %s", eksClusterType.Type) + jsonSchema1, ok1 := eks_resources.EksClusterValidationMap["NodeKueue"] // keys are the names registered in EksClusterValidationMap, not the Go variable names + jsonSchema2, ok2 := eks_resources.EksClusterValidationMap["NodeKueueUsage"] + if !(ok1 || ok2) { + return "", errors.New("invalid cluster type provided") + } + if ok1 { + return jsonSchema1, nil + } else { + return jsonSchema2, nil + } + }), + awsservice.AssertLogContainsSubstring(fmt.Sprintf("\"ClusterName\":\"%s\"", env.EKSClusterName)), + ), + ) + } + testResult.Status = status.SUCCESSFUL return testResult } diff --git 
a/test/metric_value_benchmark/eks_resources/test_schemas/cluster_kueue.json b/test/metric_value_benchmark/eks_resources/test_schemas/cluster_kueue.json new file mode 100644 index 000000000..1f65ef83a --- /dev/null +++ b/test/metric_value_benchmark/eks_resources/test_schemas/cluster_kueue.json @@ -0,0 +1,28 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "structured log schema", + "description": "json schema for the cloudwatch agent k8s structured log", + "type": "object", + "properties": { + "ClusterName": {}, + "ClusterQueue": {}, + "Flavor": {}, + "NodeName": {}, + "Resource": {}, + "Timestamp": {}, + "Version": {}, + "kueue_cluster_queue_nominal_quota": {}, + "kueue_cluster_queue_resource_usage": {} + }, + "required": [ + "ClusterName", + "ClusterQueue", + "Flavor", + "NodeName", + "Resource", + "Timestamp", + "Version", + "kueue_cluster_queue_nominal_quota", + "kueue_cluster_queue_resource_usage" + ] +} \ No newline at end of file diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/cluster_kueue_usage.json b/test/metric_value_benchmark/eks_resources/test_schemas/cluster_kueue_usage.json new file mode 100644 index 000000000..e5c73217c --- /dev/null +++ b/test/metric_value_benchmark/eks_resources/test_schemas/cluster_kueue_usage.json @@ -0,0 +1,24 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "title": "structured log schema", + "description": "json schema for the cloudwatch agent k8s structured log", + "type": "object", + "properties": { + "ClusterName": {}, + "ClusterQueue": {}, + "NodeName": {}, + "Status": {}, + "Timestamp": {}, + "Version": {}, + "kueue_pending_workloads": {} + }, + "required": [ + "ClusterName", + "ClusterQueue", + "NodeName", + "Status", + "Timestamp", + "Version", + "kueue_pending_workloads" + ] +} \ No newline at end of file diff --git a/test/metric_value_benchmark/eks_resources/util.go b/test/metric_value_benchmark/eks_resources/util.go index bd737f85b..c8e1df081 100644 --- 
a/test/metric_value_benchmark/eks_resources/util.go +++ b/test/metric_value_benchmark/eks_resources/util.go @@ -56,6 +56,11 @@ var ( eksNodeNeuronDeviceSchema string //go:embed test_schemas/node_neuron.json eksNodeNeuronSchema string + //go:embed test_schemas/cluster_kueue.json + eksNodeKueueSchema string + //go:embed test_schemas/cluster_kueue_usage.json + eksNodeKueueUsageSchema string + EksClusterValidationMap = map[string]string{ "Cluster": eksClusterSchema, @@ -81,6 +86,8 @@ var ( "NodeAWSNeuronCore": eksNodeNeuronCoreSchema, "NodeAWSNeuronDevice": eksNodeNeuronDeviceSchema, "NodeAWSNeuron": eksNodeNeuronSchema, + "NodeKueue": eksNodeKueueSchema, + "NodeKueueUsage": eksNodeKueueUsageSchema, } EksClusterFrequencyValidationMap = map[string]int{ diff --git a/util/awsservice/eks.go b/util/awsservice/eks.go index c88578f08..0db5927df 100644 --- a/util/awsservice/eks.go +++ b/util/awsservice/eks.go @@ -46,3 +46,4 @@ func describeEksInstances(clusterName string) (*ec2.DescribeInstancesOutput, err }, }) } +