diff --git a/environment/metadata.go b/environment/metadata.go
index 7a0e1e965..128fa290d 100644
--- a/environment/metadata.go
+++ b/environment/metadata.go
@@ -96,7 +96,7 @@ func registerECSData(dataString *MetaDataStrings) {
func registerEKSData(d *MetaDataStrings) {
flag.StringVar(&(d.EKSClusterName), "eksClusterName", "", "EKS cluster name")
flag.StringVar(&(d.EksDeploymentStrategy), "eksDeploymentStrategy", "", "Daemon/Replica/Sidecar")
- flag.StringVar(&(d.EksGpuType), "eksGpuType", "", "nvidia/inferentia")
+ flag.StringVar(&(d.EksGpuType), "eksGpuType", "", "nvidia/awsneuron")
}
func registerPluginTestsToExecute(dataString *MetaDataStrings) {
diff --git a/generator/test_case_generator.go b/generator/test_case_generator.go
index a9fee398f..5f9395ae7 100644
--- a/generator/test_case_generator.go
+++ b/generator/test_case_generator.go
@@ -219,6 +219,10 @@ var testTypeToTestConfig = map[string][]testConfig{
testDir: "./test/gpu", terraformDir: "terraform/eks/daemon/gpu",
targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
},
+ {
+ testDir: "./test/awsneuron", terraformDir: "terraform/eks/daemon/awsneuron",
+ targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
+ },
},
"eks_deployment": {
{testDir: "./test/metric_value_benchmark"},
diff --git a/terraform/eks/daemon/awsneuron/main.tf b/terraform/eks/daemon/awsneuron/main.tf
new file mode 100644
index 000000000..89668ea69
--- /dev/null
+++ b/terraform/eks/daemon/awsneuron/main.tf
@@ -0,0 +1,826 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+module "common" {
+ source = "../../../common"
+ cwagent_image_repo = var.cwagent_image_repo
+ cwagent_image_tag = var.cwagent_image_tag
+}
+
+module "basic_components" {
+ source = "../../../basic_components"
+
+ region = var.region
+}
+
+data "aws_eks_cluster_auth" "this" {
+ name = aws_eks_cluster.this.name
+}
+
+resource "aws_eks_cluster" "this" {
+ name = "cwagent-eks-integ-${module.common.testing_id}"
+ role_arn = module.basic_components.role_arn
+ version = var.k8s_version
+ enabled_cluster_log_types = [
+ "api",
+ "audit",
+ "authenticator",
+ "controllerManager",
+ "scheduler"
+ ]
+ vpc_config {
+ subnet_ids = module.basic_components.public_subnet_ids
+ security_group_ids = [module.basic_components.security_group]
+ }
+}
+
+# EKS Node Groups
+resource "aws_eks_node_group" "this" {
+  cluster_name    = aws_eks_cluster.this.name
+  node_group_name = "cwagent-eks-integ-node"
+  node_role_arn   = aws_iam_role.node_role.arn
+  subnet_ids      = module.basic_components.public_subnet_ids
+
+  scaling_config {
+    desired_size = 1 # the group is pinned to exactly one node
+    max_size     = 1
+    min_size     = 1
+  }
+
+  ami_type       = var.ami_type # declared in variables.tf; its default equals the previously hard-coded "AL2_x86_64"
+  capacity_type  = "ON_DEMAND"
+  disk_size      = 20
+  instance_types = ["t3.medium"] # NOTE(review): var.instance_type (default "trn1.2xlarge") is declared but unused — confirm which is intended
+
+  depends_on = [
+    aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly, # all four node-role policies are attached before the group is created
+    aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy,
+    aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy,
+    aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy
+  ]
+}
+
+# EKS Node IAM Role
+resource "aws_iam_role" "node_role" {
+ name = "cwagent-eks-Worker-Role-${module.common.testing_id}"
+ assume_role_policy = jsonencode({
+ Version = "2012-10-17",
+ Statement = [
+ {
+ Effect = "Allow",
+ Principal = {
+ Service = "ec2.amazonaws.com"
+ },
+ Action = "sts:AssumeRole"
+ }
+ ]
+ })
+
+}
+
+resource "aws_iam_role_policy_attachment" "node_AmazonEKSWorkerNodePolicy" {
+ policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
+ role = aws_iam_role.node_role.name
+}
+
+resource "aws_iam_role_policy_attachment" "node_AmazonEKS_CNI_Policy" {
+ policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"
+ role = aws_iam_role.node_role.name
+}
+
+resource "aws_iam_role_policy_attachment" "node_AmazonEC2ContainerRegistryReadOnly" {
+ policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
+ role = aws_iam_role.node_role.name
+}
+
+resource "aws_iam_role_policy_attachment" "node_CloudWatchAgentServerPolicy" {
+ policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy"
+ role = aws_iam_role.node_role.name
+}
+
+# TODO: these security groups should be created once and then reused
+# EKS Cluster Security Group
+resource "aws_security_group" "eks_cluster_sg" {
+ name = "cwagent-eks-cluster-sg-${module.common.testing_id}"
+ description = "Cluster communication with worker nodes"
+ vpc_id = module.basic_components.vpc_id
+}
+
+resource "aws_security_group_rule" "cluster_inbound" {
+ description = "Allow worker nodes to communicate with the cluster API Server"
+ from_port = 443
+ protocol = "tcp"
+ security_group_id = aws_security_group.eks_cluster_sg.id
+ source_security_group_id = aws_security_group.eks_nodes_sg.id
+ to_port = 443
+ type = "ingress"
+}
+
+resource "aws_security_group_rule" "cluster_outbound" {
+ description = "Allow cluster API Server to communicate with the worker nodes"
+ from_port = 1024
+ protocol = "tcp"
+ security_group_id = aws_security_group.eks_cluster_sg.id
+ source_security_group_id = aws_security_group.eks_nodes_sg.id
+ to_port = 65535
+ type = "egress"
+}
+
+
+# EKS Node Security Group
+resource "aws_security_group" "eks_nodes_sg" {
+ name = "cwagent-eks-node-sg-${module.common.testing_id}"
+ description = "Security group for all nodes in the cluster"
+ vpc_id = module.basic_components.vpc_id
+
+ egress {
+ from_port = 0
+ to_port = 0
+ protocol = "-1"
+ cidr_blocks = ["0.0.0.0/0"]
+ }
+}
+
+resource "aws_security_group_rule" "nodes_internal" {
+ description = "Allow nodes to communicate with each other"
+ from_port = 0
+ protocol = "-1"
+ security_group_id = aws_security_group.eks_nodes_sg.id
+ source_security_group_id = aws_security_group.eks_nodes_sg.id
+ to_port = 65535
+ type = "ingress"
+}
+
+resource "aws_security_group_rule" "nodes_cluster_inbound" {
+ description = "Allow worker Kubelets and pods to receive communication from the cluster control plane"
+ from_port = 1025
+ protocol = "tcp"
+ security_group_id = aws_security_group.eks_nodes_sg.id
+ source_security_group_id = aws_security_group.eks_cluster_sg.id
+ to_port = 65535
+ type = "ingress"
+}
+
+
+# create cert for communication between agent and neuron monitor
+resource "tls_private_key" "private_key" {
+ algorithm = "RSA"
+}
+
+resource "local_sensitive_file" "ca_key" {
+  content  = tls_private_key.private_key.private_key_pem # CA private key: use the sensitive file resource instead of local_file
+  filename = "${path.module}/certs/ca.key"
+}
+
+resource "tls_self_signed_cert" "ca_cert" {
+ private_key_pem = tls_private_key.private_key.private_key_pem
+ is_ca_certificate = true
+ subject {
+ common_name = "neuron-monitor-service.amazon-cloudwatch.svc"
+ organization = "Amazon CloudWatch Agent"
+ }
+ validity_period_hours = 24
+ allowed_uses = [
+ "digital_signature",
+ "key_encipherment",
+ "cert_signing",
+ "crl_signing",
+ "server_auth",
+ "client_auth",
+ ]
+}
+
+resource "local_file" "ca_cert_file" {
+ content = tls_self_signed_cert.ca_cert.cert_pem
+ filename = "${path.module}/certs/ca.cert"
+}
+
+resource "tls_private_key" "server_private_key" {
+ algorithm = "RSA"
+}
+
+resource "local_sensitive_file" "server_key" {
+  content  = tls_private_key.server_private_key.private_key_pem # server private key: use the sensitive file resource instead of local_file
+  filename = "${path.module}/certs/server.key"
+}
+
+resource "tls_cert_request" "local_csr" {
+  private_key_pem = tls_private_key.server_private_key.private_key_pem
+  dns_names       = ["localhost", "neuron-monitor-service.amazon-cloudwatch.svc"]
+  ip_addresses    = ["127.0.0.1"] # an IP literal is requested as an IP SAN, not as a DNS name
+  subject {
+    common_name  = "neuron-monitor-service.amazon-cloudwatch.svc"
+    organization = "Amazon CloudWatch Agent"
+  }
+}
+resource "tls_locally_signed_cert" "server_cert" {
+ cert_request_pem = tls_cert_request.local_csr.cert_request_pem
+ ca_private_key_pem = tls_private_key.private_key.private_key_pem
+ ca_cert_pem = tls_self_signed_cert.ca_cert.cert_pem
+ validity_period_hours = 12
+ allowed_uses = [
+ "digital_signature",
+ "key_encipherment",
+ "server_auth",
+ "client_auth",
+ ]
+}
+
+resource "local_file" "server_cert_file" {
+ content = tls_locally_signed_cert.server_cert.cert_pem
+ filename = "${path.module}/certs/server.cert"
+}
+
+resource "kubernetes_secret" "agent_cert" {
+  metadata {
+    name      = "amazon-cloudwatch-observability-agent-cert"
+    namespace = kubernetes_namespace.namespace.metadata[0].name # reference (not a literal) so the namespace is created before the secret
+  }
+  data = {
+    "ca.crt"  = tls_self_signed_cert.ca_cert.cert_pem              # self-signed CA certificate
+    "tls.crt" = tls_locally_signed_cert.server_cert.cert_pem       # server certificate signed by that CA
+    "tls.key" = tls_private_key.server_private_key.private_key_pem # private key matching tls.crt
+  }
+}
+
+
+resource "kubernetes_namespace" "namespace" {
+ metadata {
+ name = "amazon-cloudwatch"
+ }
+}
+
+resource "kubernetes_config_map" "neuron_monitor_config_map" {
+ depends_on = [
+ kubernetes_namespace.namespace
+ ]
+
+ metadata {
+ name = "neuron-monitor-config-map"
+ namespace = "amazon-cloudwatch"
+ }
+
+ data = {
+ "monitor.json" = jsonencode({
+ period = "5s"
+ neuron_runtimes = [
+ {
+ tag_filter : ".*"
+ metrics = [
+ {
+ type = "neuroncore_counters"
+ },
+ {
+ type = "memory_used"
+ },
+ {
+ type = "neuron_runtime_vcpu_usage"
+ },
+ {
+ type = "execution_stats"
+ }
+ ]
+ }
+ ]
+ system_metrics = [
+ {
+ type = "memory_info"
+ },
+ {
+ period = "5s"
+ type = "neuron_hw_counters"
+ }
+ ]
+ })
+ }
+}
+
+resource "kubernetes_service_account" "neuron_monitor_service_account" {
+ depends_on = [
+ kubernetes_namespace.namespace
+ ]
+ metadata {
+ name = "neuron-monitor-service-acct"
+ namespace = "amazon-cloudwatch"
+ }
+}
+
+resource "kubernetes_role" "neuron_monitor_role" {
+ depends_on = [
+ kubernetes_namespace.namespace,
+ kubernetes_service_account.neuron_monitor_service_account,
+ kubernetes_config_map.neuron_monitor_config_map
+ ]
+ metadata {
+ name = "neuron-monitor-role"
+ namespace = "amazon-cloudwatch"
+ }
+
+ rule {
+ api_groups = [""]
+ resources = ["configmaps"]
+ resource_names = ["neuron-monitor-config-map"]
+ verbs = ["get"]
+ }
+}
+
+resource "kubernetes_role_binding" "neuron_monitor_role_binding" {
+ depends_on = [
+ kubernetes_namespace.namespace,
+ kubernetes_service_account.neuron_monitor_service_account,
+ kubernetes_role.neuron_monitor_role
+ ]
+
+ metadata {
+ namespace = "amazon-cloudwatch"
+ name = "neuron-monitor-role-binding"
+ }
+
+ role_ref {
+ kind = "Role"
+ name = "neuron-monitor-role"
+ api_group = "rbac.authorization.k8s.io"
+ }
+
+ subject {
+ kind = "ServiceAccount"
+ name = "neuron-monitor-service-acct"
+ namespace = "amazon-cloudwatch"
+ }
+}
+
+resource "kubernetes_daemonset" "neuron_monitor" {
+ depends_on = [
+ kubernetes_namespace.namespace,
+ kubernetes_service_account.neuron_monitor_service_account,
+ kubernetes_role.neuron_monitor_role,
+ kubernetes_role_binding.neuron_monitor_role_binding,
+ kubernetes_config_map.neuron_monitor_config_map
+ ]
+
+ metadata {
+ name = "neuron-monitor"
+ namespace = "amazon-cloudwatch"
+ labels = {
+ k8s-app = "neuron-monitor"
+ version = "v1"
+ }
+ }
+ spec {
+ selector {
+ match_labels = {
+ k8s-app = "neuron-monitor"
+ }
+ }
+ template {
+ metadata {
+ labels = {
+ k8s-app = "neuron-monitor"
+ version = "v1"
+ }
+ }
+ spec {
+ affinity {
+ node_affinity {
+ required_during_scheduling_ignored_during_execution {
+ node_selector_term {
+ match_expressions {
+ key = "kubernetes.io/os"
+ operator = "In"
+ values = ["linux"]
+ }
+ }
+ }
+ }
+ }
+ container {
+ name = "neuron-monitor-prometheus"
+ image = "506463145083.dkr.ecr.us-west-2.amazonaws.com/mocked-neuron-monitor:latest"
+ port {
+ container_port = 8000
+ }
+ command = [
+ "/bin/sh",
+ "-c",
+ "/opt/aws/neuron/bin/dummy_neuron_monitor.py --port 8000 --cert-file /etc/amazon-cloudwatch-observability-neuron-cert/server.crt --key-file /etc/amazon-cloudwatch-observability-neuron-cert/server.key"
+ ]
+ resources {
+ limits = {
+ cpu = "500m"
+ memory = "256Mi"
+ }
+ requests = {
+ cpu = "256m"
+ memory = "128Mi"
+ }
+ }
+ security_context {
+ privileged = true
+ }
+ env {
+ name = "NODE_NAME"
+ value_from {
+ field_ref {
+ field_path = "spec.nodeName"
+ }
+ }
+ }
+ env {
+ name = "PATH"
+ value = "/usr/local/bin:/usr/bin:/bin:/opt/aws/neuron/bin"
+ }
+ volume_mount {
+ mount_path = "/etc/amazon-cloudwatch-observability-neuron-cert/"
+ name = "neurontls"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/etc/neuron-monitor-config/"
+ name = "neuron-monitor-config"
+ read_only = true
+ }
+ }
+ volume {
+ name = "neurontls"
+ secret {
+ secret_name = "amazon-cloudwatch-observability-agent-cert"
+ items {
+ key = "tls.crt"
+ path = "server.crt"
+ }
+ items {
+ key = "tls.key"
+ path = "server.key"
+ }
+ }
+ }
+ volume {
+ name = "neuron-monitor-config"
+ config_map {
+ name = "neuron-monitor-config-map"
+ }
+ }
+ service_account_name = "neuron-monitor-service-acct"
+ }
+ }
+ }
+}
+
+resource "kubernetes_service" "neuron_monitor_service" {
+ depends_on = [
+ kubernetes_namespace.namespace,
+ kubernetes_service_account.cwagentservice,
+ aws_eks_node_group.this,
+ kubernetes_daemonset.neuron_monitor
+ ]
+ metadata {
+ name = "neuron-monitor-service"
+ namespace = "amazon-cloudwatch"
+ labels = {
+ "k8s-app" : "neuron-monitor-service"
+ }
+ annotations = {
+ "prometheus.io/scrape" : "true"
+ }
+ }
+ spec {
+ type = "ClusterIP"
+ selector = {
+ k8s-app = "neuron-monitor"
+ }
+ port {
+ name = "metrics"
+ port = 8000
+ target_port = 8000
+ protocol = "TCP"
+ }
+ internal_traffic_policy = "Local"
+ }
+}
+
+resource "kubernetes_daemonset" "service" {
+ depends_on = [
+ kubernetes_namespace.namespace,
+ kubernetes_service_account.cwagentservice,
+ aws_eks_node_group.this,
+ kubernetes_daemonset.neuron_monitor
+ ]
+ metadata {
+ name = "cloudwatch-agent"
+ namespace = "amazon-cloudwatch"
+ }
+ spec {
+ selector {
+ match_labels = {
+ "name" : "cloudwatch-agent"
+ }
+ }
+ template {
+ metadata {
+ labels = {
+ "name" : "cloudwatch-agent"
+ }
+ }
+ spec {
+ node_selector = {
+ "kubernetes.io/os" : "linux"
+ }
+ container {
+ name = "cwagent"
+ image = "${var.cwagent_image_repo}:${var.cwagent_image_tag}"
+ image_pull_policy = "Always"
+ resources {
+ limits = {
+ "cpu" : "200m",
+ "memory" : "200Mi"
+ }
+ requests = {
+ "cpu" : "200m",
+ "memory" : "200Mi"
+ }
+ }
+ port {
+ container_port = 25888
+ host_port = 25888
+ protocol = "UDP"
+ }
+ env {
+ name = "HOST_IP"
+ value_from {
+ field_ref {
+ field_path = "status.hostIP"
+ }
+ }
+ }
+ env {
+ name = "HOST_NAME"
+ value_from {
+ field_ref {
+ field_path = "spec.nodeName"
+ }
+ }
+ }
+ env {
+ name = "K8S_NAMESPACE"
+ value_from {
+ field_ref {
+ field_path = "metadata.namespace"
+ }
+ }
+ }
+ volume_mount {
+ mount_path = "/etc/cwagentconfig"
+ name = "cwagentconfig"
+ }
+ volume_mount {
+ mount_path = "/rootfs"
+ name = "rootfs"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/var/run/docker.sock"
+ name = "dockersock"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/var/lib/docker"
+ name = "varlibdocker"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/run/containerd/containerd.sock"
+ name = "containerdsock"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/sys"
+ name = "sys"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/dev/disk"
+ name = "devdisk"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/etc/amazon-cloudwatch-observability-agent-cert"
+ name = "agenttls"
+ read_only = true
+ }
+ volume_mount {
+ mount_path = "/var/lib/kubelet/pod-resources"
+ name = "kubelet-podresources"
+ read_only = true
+ }
+ }
+ volume {
+ name = "cwagentconfig"
+ config_map {
+ name = "cwagentconfig"
+ }
+ }
+ volume {
+ name = "rootfs"
+ host_path {
+ path = "/"
+ }
+ }
+ volume {
+ name = "dockersock"
+ host_path {
+ path = "/var/run/docker.sock"
+ }
+ }
+ volume {
+ name = "varlibdocker"
+ host_path {
+ path = "/var/lib/docker"
+ }
+ }
+ volume {
+ name = "containerdsock"
+ host_path {
+ path = "/run/containerd/containerd.sock"
+ }
+ }
+ volume {
+ name = "sys"
+ host_path {
+ path = "/sys"
+ }
+ }
+ volume {
+ name = "devdisk"
+ host_path {
+ path = "/dev/disk"
+ }
+ }
+ volume {
+ name = "kubelet-podresources"
+ host_path {
+ path = "/var/lib/kubelet/pod-resources"
+ }
+ }
+ volume {
+ name = "agenttls"
+ secret {
+ secret_name = "amazon-cloudwatch-observability-agent-cert"
+ items {
+ key = "ca.crt"
+ path = "tls-ca.crt"
+ }
+ }
+ }
+ service_account_name = "cloudwatch-agent"
+ termination_grace_period_seconds = 60
+ }
+ }
+ }
+}
+
+##########################################
+# Template Files
+##########################################
+locals {
+ httpd_config = "../../../../${var.test_dir}/resources/httpd.conf"
+ httpd_ssl_config = "../../../../${var.test_dir}/resources/httpd-ssl.conf"
+ cwagent_config = fileexists("../../../../${var.test_dir}/resources/config.json") ? "../../../../${var.test_dir}/resources/config.json" : "../default_resources/default_amazon_cloudwatch_agent.json"
+}
+
+data "template_file" "cwagent_config" {
+ template = file(local.cwagent_config)
+ vars = {
+ }
+}
+
+resource "kubernetes_config_map" "cwagentconfig" {
+ depends_on = [
+ kubernetes_namespace.namespace,
+ kubernetes_service_account.cwagentservice
+ ]
+ metadata {
+ name = "cwagentconfig"
+ namespace = "amazon-cloudwatch"
+ }
+ data = {
+ "cwagentconfig.json" : data.template_file.cwagent_config.rendered
+ }
+}
+
+data "template_file" "httpd_config" {
+ template = file(local.httpd_config)
+ vars = {}
+}
+data "template_file" "httpd_ssl_config" {
+ template = file(local.httpd_ssl_config)
+ vars = {}
+}
+
+resource "kubernetes_config_map" "httpdconfig" {
+ depends_on = [
+ kubernetes_namespace.namespace,
+ kubernetes_service_account.cwagentservice
+ ]
+ metadata {
+ name = "httpdconfig"
+ namespace = "amazon-cloudwatch"
+ }
+ data = {
+ "httpd.conf" : data.template_file.httpd_config.rendered
+ "httpd-ssl.conf" : data.template_file.httpd_ssl_config.rendered
+ }
+}
+
+resource "kubernetes_service_account" "cwagentservice" {
+ depends_on = [kubernetes_namespace.namespace]
+ metadata {
+ name = "cloudwatch-agent"
+ namespace = "amazon-cloudwatch"
+ }
+}
+
+resource "kubernetes_cluster_role" "clusterrole" {
+ depends_on = [kubernetes_namespace.namespace]
+ metadata {
+ name = "cloudwatch-agent-role"
+ }
+ rule {
+ verbs = ["get", "list", "watch"]
+ resources = ["pods", "pods/logs", "nodes", "nodes/proxy", "namespaces", "endpoints"]
+ api_groups = [""]
+ }
+ rule {
+ verbs = ["list", "watch"]
+ resources = ["replicasets"]
+ api_groups = ["apps"]
+ }
+ rule {
+ verbs = ["list", "watch"]
+ resources = ["jobs"]
+ api_groups = ["batch"]
+ }
+ rule {
+ verbs = ["get"]
+ resources = ["nodes/proxy"]
+ api_groups = [""]
+ }
+ rule {
+ verbs = ["create"]
+ resources = ["nodes/stats", "configmaps", "events"]
+ api_groups = [""]
+ }
+ rule {
+ verbs = ["get", "update"]
+ resource_names = ["cwagent-clusterleader"]
+ resources = ["configmaps"]
+ api_groups = [""]
+ }
+ rule {
+ verbs = ["get"]
+ resource_names = ["neuron-monitor-config-map"]
+ resources = ["configmaps"]
+ api_groups = [""]
+ }
+ rule {
+ verbs = ["list", "watch"]
+ resources = ["services"]
+ api_groups = [""]
+ }
+ rule {
+ non_resource_urls = ["/metrics"]
+ verbs = ["get", "list", "watch"]
+ }
+}
+
+resource "kubernetes_cluster_role_binding" "rolebinding" {
+  depends_on = [kubernetes_namespace.namespace]
+  metadata {
+    name = "cloudwatch-agent-role-binding"
+  }
+  role_ref {
+    api_group = "rbac.authorization.k8s.io"
+    kind      = "ClusterRole"
+    name      = kubernetes_cluster_role.clusterrole.metadata[0].name # reference orders the ClusterRole before this binding
+  }
+  subject {
+    kind      = "ServiceAccount"
+    name      = kubernetes_service_account.cwagentservice.metadata[0].name # same values as before, now with an ordering edge
+    namespace = kubernetes_service_account.cwagentservice.metadata[0].namespace
+  }
+}
+
+resource "null_resource" "validator" {
+  depends_on = [
+    aws_eks_node_group.this,
+    kubernetes_daemonset.service, # already depends on kubernetes_service_account.cwagentservice
+    kubernetes_cluster_role_binding.rolebinding,
+    kubernetes_service.neuron_monitor_service, # not implied by the entries above, so it is listed explicitly
+  ]
+  provisioner "local-exec" {
+    command = <<-EOT
+      echo "Validating EKS metrics/logs for AWS Neuron"
+      cd ../../../..
+      go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=awsneuron
+    EOT
+  }
+}
diff --git a/terraform/eks/daemon/awsneuron/providers.tf b/terraform/eks/daemon/awsneuron/providers.tf
new file mode 100644
index 000000000..9bd2885f5
--- /dev/null
+++ b/terraform/eks/daemon/awsneuron/providers.tf
@@ -0,0 +1,17 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+provider "aws" {
+ region = var.region
+}
+
+provider "kubernetes" {
+ exec {
+ api_version = "client.authentication.k8s.io/v1beta1"
+ command = "aws"
+ args = ["eks", "get-token", "--cluster-name", aws_eks_cluster.this.name]
+ }
+ host = aws_eks_cluster.this.endpoint
+ cluster_ca_certificate = base64decode(aws_eks_cluster.this.certificate_authority.0.data)
+ token = data.aws_eks_cluster_auth.this.token
+}
\ No newline at end of file
diff --git a/terraform/eks/daemon/awsneuron/variables.tf b/terraform/eks/daemon/awsneuron/variables.tf
new file mode 100644
index 000000000..11d3f87c2
--- /dev/null
+++ b/terraform/eks/daemon/awsneuron/variables.tf
@@ -0,0 +1,37 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+variable "region" {
+ type = string
+ default = "us-west-2"
+}
+
+variable "test_dir" {
+ type = string
+ default = "./test/awsneuron"
+}
+
+variable "cwagent_image_repo" {
+ type = string
+ default = "public.ecr.aws/cloudwatch-agent/cloudwatch-agent"
+}
+
+variable "cwagent_image_tag" {
+ type = string
+ default = "latest"
+}
+
+variable "k8s_version" {
+ type = string
+ default = "1.28"
+}
+
+variable "ami_type" {
+ type = string
+ default = "AL2_x86_64"
+}
+
+variable "instance_type" {
+ type = string
+ default = "trn1.2xlarge"
+}
\ No newline at end of file
diff --git a/test/awsneuron/neuron_metrics_test.go b/test/awsneuron/neuron_metrics_test.go
new file mode 100644
index 000000000..85e85e3c7
--- /dev/null
+++ b/test/awsneuron/neuron_metrics_test.go
@@ -0,0 +1,74 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+//go:build !windows
+
+package emf
+
+import (
+ "time"
+
+ "github.com/aws/amazon-cloudwatch-agent-test/environment"
+ . "github.com/aws/amazon-cloudwatch-agent-test/test/awsneuron/resources"
+ "github.com/aws/amazon-cloudwatch-agent-test/test/metric"
+ "github.com/aws/amazon-cloudwatch-agent-test/test/status"
+ "github.com/aws/amazon-cloudwatch-agent-test/test/test_runner"
+)
+
+const (
+ awsNeuronMetricIndicator = "_neuron"
+)
+
+var expectedDimsToMetrics = map[string][]string{ // passed to metric.ValidateMetrics by Validate
+	"ClusterName": { // NOTE(review): keys read as hyphen-joined dimension names — confirm against metric.ValidateMetrics
+		NodeNeuronCoreUtil, NodeNeuronCoreMemUsageConstants, NodeNeuronCoreMemUsageModel, NodeNeuronCoreMemUsageScratchpad,
+		NodeNeuronCoreMemUsageRuntime, NodeNeuronCoreMemUsageTensors, NodeNeuronCoreMemUsageTotal, NodeNeuronDeviceHwEccEvents,
+		NodeExecutionErrorsTotal, NodeNeuronDeviceRuntimeMemoryUsed, NodeNeuronExecutionLatency,
+	},
+	"ClusterName-InstanceId-NodeName": { // same metric list as the "ClusterName" entry
+		NodeNeuronCoreUtil, NodeNeuronCoreMemUsageConstants, NodeNeuronCoreMemUsageModel, NodeNeuronCoreMemUsageScratchpad,
+		NodeNeuronCoreMemUsageRuntime, NodeNeuronCoreMemUsageTensors, NodeNeuronCoreMemUsageTotal, NodeNeuronDeviceHwEccEvents,
+		NodeExecutionErrorsTotal, NodeNeuronDeviceRuntimeMemoryUsed, NodeNeuronExecutionLatency,
+	},
+	"ClusterName-InstanceId-NodeName-NeuronDevice": { // only the hardware ECC events metric is listed here
+		NodeNeuronDeviceHwEccEvents,
+	},
+	"ClusterName-InstanceId-NodeName-NeuronDevice-NeuronCore-InstanceType": { // the seven NodeNeuronCore* metrics only
+		NodeNeuronCoreUtil, NodeNeuronCoreMemUsageConstants, NodeNeuronCoreMemUsageModel, NodeNeuronCoreMemUsageScratchpad,
+		NodeNeuronCoreMemUsageRuntime, NodeNeuronCoreMemUsageTensors, NodeNeuronCoreMemUsageTotal,
+	},
+}
+
+type AwsNeuronTestRunner struct {
+	test_runner.BaseTestRunner
+	testName string                // returned by GetTestName
+	env      *environment.MetaData // passed to metric.ValidateMetrics and metric.ValidateLogs in Validate
+}
+
+var _ test_runner.ITestRunner = (*AwsNeuronTestRunner)(nil)
+
+func (t *AwsNeuronTestRunner) Validate() status.TestGroupResult {
+ var testResults []status.TestResult
+ testResults = append(testResults, metric.ValidateMetrics(t.env, awsNeuronMetricIndicator, expectedDimsToMetrics)...)
+ testResults = append(testResults, metric.ValidateLogs(t.env))
+ return status.TestGroupResult{
+ Name: t.GetTestName(),
+ TestResults: testResults,
+ }
+}
+
+// GetTestName returns the test name stored on the runner; Validate uses it
+// as the name of the group result.
+func (t *AwsNeuronTestRunner) GetTestName() string { return t.testName }
+
+// GetAgentConfigFileName always returns the empty string: this runner names
+// no agent config file.
+func (t *AwsNeuronTestRunner) GetAgentConfigFileName() string { return "" }
+
+// GetAgentRunDuration returns the fixed three-minute run duration reported
+// by this runner.
+func (t *AwsNeuronTestRunner) GetAgentRunDuration() time.Duration { return 3 * time.Minute }
+
+// GetMeasuredMetrics always returns nil; the metric names checked by
+// Validate come from expectedDimsToMetrics.
+func (t *AwsNeuronTestRunner) GetMeasuredMetrics() []string { return nil }
diff --git a/test/awsneuron/neuron_test.go b/test/awsneuron/neuron_test.go
new file mode 100644
index 000000000..81e0352df
--- /dev/null
+++ b/test/awsneuron/neuron_test.go
@@ -0,0 +1,78 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+//go:build !windows
+
+package emf
+
+import (
+	"fmt"
+	"log"
+	"testing"
+
+	"github.com/aws/amazon-cloudwatch-agent-test/environment"
+	"github.com/aws/amazon-cloudwatch-agent-test/environment/computetype"
+	"github.com/aws/amazon-cloudwatch-agent-test/test/metric/dimension"
+	"github.com/aws/amazon-cloudwatch-agent-test/test/status"
+	"github.com/aws/amazon-cloudwatch-agent-test/test/test_runner"
+	"github.com/stretchr/testify/suite"
+)
+
+type AwsNeuronTestSuite struct {
+	suite.Suite           // embedded from the imported github.com/stretchr/testify/suite package
+	test_runner.TestSuite // NOTE(review): presumably the source of the Result field used by the methods below — confirm
+}
+
+func (s *AwsNeuronTestSuite) SetupSuite() {
+	fmt.Println(">>>> Starting AWS Neuron Container Insights TestSuite") // start-of-suite banner on stdout
+}
+
+func (s *AwsNeuronTestSuite) TearDownSuite() {
+	s.Result.Print() // accumulated results are printed before the closing banner
+	fmt.Println(">>>> Finished AWS Neuron Container Insights TestSuite")
+}
+
+// init calls environment.RegisterEnvironmentMetaDataFlags once, at package
+// initialization.
+func init() { environment.RegisterEnvironmentMetaDataFlags() }
+
+var (
+ eksTestRunners []*test_runner.EKSTestRunner
+)
+
+// getEksTestRunners builds the package-level runner list on first use and
+// returns that same cached slice on every later call.
+func getEksTestRunners(env *environment.MetaData) []*test_runner.EKSTestRunner {
+	if eksTestRunners != nil {
+		return eksTestRunners
+	}
+
+	base := test_runner.BaseTestRunner{DimensionFactory: dimension.GetDimensionFactory(*env)}
+	neuron := &AwsNeuronTestRunner{BaseTestRunner: base, testName: "EKS_AWS_NEURON", env: env}
+	eksTestRunners = []*test_runner.EKSTestRunner{{Runner: neuron, Env: *env}}
+
+	return eksTestRunners
+}
+
+// TestAllInSuite runs every EKS runner and asserts the aggregated status is
+// status.SUCCESSFUL; any other compute type returns without running anything.
+func (s *AwsNeuronTestSuite) TestAllInSuite() {
+	env := environment.GetEnvironmentMetaData()
+	if env.ComputeType != computetype.EKS {
+		return
+	}
+	log.Println("Environment compute type is EKS")
+	for _, runner := range getEksTestRunners(env) {
+		runner.Run(s, env)
+	}
+
+	s.Assert().Equal(status.SUCCESSFUL, s.Result.GetStatus(), "AWS Neuron Container Test Suite Failed")
+}
+
+func (s *AwsNeuronTestSuite) AddToSuiteResult(r status.TestGroupResult) {
+	s.Result.TestGroupResults = append(s.Result.TestGroupResults, r) // group results accumulate in call order
+}
+
+// TestAWSNeuronSuite is the go test entry point; it hands a new
+// AwsNeuronTestSuite to suite.Run.
+func TestAWSNeuronSuite(t *testing.T) { suite.Run(t, new(AwsNeuronTestSuite)) }
diff --git a/test/awsneuron/resources/config.json b/test/awsneuron/resources/config.json
new file mode 100644
index 000000000..6f37e43ed
--- /dev/null
+++ b/test/awsneuron/resources/config.json
@@ -0,0 +1,16 @@
+{
+ "agent": {
+ "metrics_collection_interval": 15,
+ "run_as_user": "root",
+ "debug": true,
+ "logfile": ""
+ },
+ "logs": {
+ "metrics_collected": {
+ "kubernetes": {
+ "enhanced_container_insights": true
+ }
+ },
+ "force_flush_interval": 5
+ }
+}
\ No newline at end of file
diff --git a/test/awsneuron/resources/httpd-ssl.conf b/test/awsneuron/resources/httpd-ssl.conf
new file mode 100644
index 000000000..18c33f0bd
--- /dev/null
+++ b/test/awsneuron/resources/httpd-ssl.conf
@@ -0,0 +1,43 @@
+Listen 8000
+
+SSLCipherSuite HIGH:MEDIUM:!MD5:!RC4:!3DES
+SSLProxyCipherSuite HIGH:MEDIUM:!MD5:!RC4:!3DES
+
+SSLHonorCipherOrder on
+
+SSLProtocol all -SSLv3
+SSLProxyProtocol all -SSLv3
+
+SSLPassPhraseDialog builtin
+
+SSLSessionCache "shmcb:/usr/local/apache2/logs/ssl_scache(512000)"
+SSLSessionCacheTimeout 300
+
+
+
+
+DocumentRoot "/usr/local/apache2/htdocs"
+ServerName neuron-monitor-service.amazon-cloudwatch.svc:8000
+ServerAdmin you@example.com
+ErrorLog /proc/self/fd/2
+TransferLog /proc/self/fd/1
+
+SSLEngine on
+SSLCertificateFile "/etc/amazon-cloudwatch-observability-neuron-cert/server.crt"
+SSLCertificateKeyFile "/etc/amazon-cloudwatch-observability-neuron-cert/server.key"
+
+
+ SSLOptions +StdEnvVars
+
+
+ SSLOptions +StdEnvVars
+
+
+BrowserMatch "MSIE [2-5]" \
+ nokeepalive ssl-unclean-shutdown \
+ downgrade-1.0 force-response-1.0
+
+CustomLog /proc/self/fd/1 \
+ "%t %h %%{SSL_PROTOCOL}x $%{SSL_CIPHER}x \"%r\" %b"
+
+
\ No newline at end of file
diff --git a/test/awsneuron/resources/httpd.conf b/test/awsneuron/resources/httpd.conf
new file mode 100644
index 000000000..122b16b17
--- /dev/null
+++ b/test/awsneuron/resources/httpd.conf
@@ -0,0 +1,101 @@
+
+ServerRoot "/usr/local/apache2"
+
+#Listen 8000
+
+LoadModule mpm_event_module modules/mod_mpm_event.so
+LoadModule authn_file_module modules/mod_authn_file.so
+LoadModule authn_core_module modules/mod_authn_core.so
+LoadModule authz_host_module modules/mod_authz_host.so
+LoadModule authz_groupfile_module modules/mod_authz_groupfile.so
+LoadModule authz_user_module modules/mod_authz_user.so
+LoadModule authz_core_module modules/mod_authz_core.so
+LoadModule access_compat_module modules/mod_access_compat.so
+LoadModule auth_basic_module modules/mod_auth_basic.so
+LoadModule socache_shmcb_module modules/mod_socache_shmcb.so
+LoadModule reqtimeout_module modules/mod_reqtimeout.so
+LoadModule filter_module modules/mod_filter.so
+LoadModule mime_module modules/mod_mime.so
+LoadModule log_config_module modules/mod_log_config.so
+LoadModule env_module modules/mod_env.so
+LoadModule headers_module modules/mod_headers.so
+LoadModule setenvif_module modules/mod_setenvif.so
+LoadModule version_module modules/mod_version.so
+LoadModule ssl_module modules/mod_ssl.so
+LoadModule unixd_module modules/mod_unixd.so
+LoadModule status_module modules/mod_status.so
+LoadModule autoindex_module modules/mod_autoindex.so
+LoadModule dir_module modules/mod_dir.so
+LoadModule alias_module modules/mod_alias.so
+
+
+User www-data
+Group www-data
+
+
+
+ AllowOverride none
+ Require all denied
+
+
+DocumentRoot "/usr/local/apache2/htdocs"
+
+ Options Indexes FollowSymLinks
+ AllowOverride None
+ Require all granted
+
+
+
+ DirectoryIndex index.html
+
+
+
+ Require all denied
+
+
+ErrorLog /proc/self/fd/2
+
+LogLevel warn
+
+
+ LogFormat "%h %l %u %t \"%r\" %>s %b \"%%{Referer}i\" \"%%{User-Agent}i\"" combined
+ LogFormat "%h %l %u %t \"%r\" %>s %b" common
+
+
+ # You need to enable mod_logio.c to use %I and %O
+ LogFormat "%h %l %u %t \"%r\" %>s %b \"%%{Referer}i\" \"%%{User-Agent}i\" %I %O" combinedio
+
+
+ CustomLog /proc/self/fd/1 common
+
+
+
+ ScriptAlias /cgi-bin/ "/usr/local/apache2/cgi-bin/"
+
+
+
+ AllowOverride None
+ Options None
+ Require all granted
+
+
+
+ RequestHeader unset Proxy early
+
+
+
+ TypesConfig conf/mime.types
+ AddType application/x-compress .Z
+ AddType application/x-gzip .gz .tgz
+
+
+
+Include conf/extra/proxy-html.conf
+
+
+# Secure (SSL/TLS) connections
+Include conf/extra/httpd-ssl.conf
+
+SSLRandomSeed startup builtin
+SSLRandomSeed connect builtin
+
\ No newline at end of file
diff --git a/test/awsneuron/resources/metrics_list.go b/test/awsneuron/resources/metrics_list.go
new file mode 100644
index 000000000..ce4094597
--- /dev/null
+++ b/test/awsneuron/resources/metrics_list.go
@@ -0,0 +1,48 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+package resources
+
+const ( // Metric-name strings for the awsneuron test, in three groups by prefix: container_, pod_, node_.
+ ContainerNeuronCoreUtil = "container_neuroncore_utilization" // container_ group: utilization, six memory_usage_* names, and one hw_ecc_events name
+ ContainerNeuronCoreMemUsageConstants = "container_neuroncore_memory_usage_constants"
+ ContainerNeuronCoreMemUsageModel = "container_neuroncore_memory_usage_model_code" // identifier says "Model"; the string suffix is model_code
+ ContainerNeuronCoreMemUsageScratchpad = "container_neuroncore_memory_usage_model_shared_scratchpad" // identifier says "Scratchpad"; the string suffix is model_shared_scratchpad
+ ContainerNeuronCoreMemUsageRuntime = "container_neuroncore_memory_usage_runtime_memory"
+ ContainerNeuronCoreMemUsageTensors = "container_neuroncore_memory_usage_tensors"
+ ContainerNeuronCoreMemUsageTotal = "container_neuroncore_memory_usage_total"
+ ContainerNeuronDeviceHwEccEvents = "container_neurondevice_hw_ecc_events_total" // note the neurondevice (not neuroncore) segment
+
+ PodNeuronCoreUtil = "pod_neuroncore_utilization" // pod_ group: the same eight suffixes as the container_ group
+ PodNeuronCoreMemUsageConstants = "pod_neuroncore_memory_usage_constants"
+ PodNeuronCoreMemUsageModel = "pod_neuroncore_memory_usage_model_code"
+ PodNeuronCoreMemUsageScratchpad = "pod_neuroncore_memory_usage_model_shared_scratchpad"
+ PodNeuronCoreMemUsageRuntime = "pod_neuroncore_memory_usage_runtime_memory"
+ PodNeuronCoreMemUsageTensors = "pod_neuroncore_memory_usage_tensors"
+ PodNeuronCoreMemUsageTotal = "pod_neuroncore_memory_usage_total"
+ PodNeuronDeviceHwEccEvents = "pod_neurondevice_hw_ecc_events_total"
+
+ NodeNeuronCoreUtil = "node_neuroncore_utilization" // node_ group: starts with the same eight suffixes as the two groups above
+ NodeNeuronCoreMemUsageConstants = "node_neuroncore_memory_usage_constants"
+ NodeNeuronCoreMemUsageModel = "node_neuroncore_memory_usage_model_code"
+ NodeNeuronCoreMemUsageScratchpad = "node_neuroncore_memory_usage_model_shared_scratchpad"
+ NodeNeuronCoreMemUsageRuntime = "node_neuroncore_memory_usage_runtime_memory"
+ NodeNeuronCoreMemUsageTensors = "node_neuroncore_memory_usage_tensors"
+ NodeNeuronCoreMemUsageTotal = "node_neuroncore_memory_usage_total"
+ NodeNeuronDeviceHwEccEvents = "node_neurondevice_hw_ecc_events_total"
+ NodeExecutionErrorsTotal = "node_neuron_execution_errors_total" // from here on the names exist only with the node_ prefix in this list
+ NodeExecutionErrorsGeneric = "node_neuron_execution_errors_generic"
+ NodeExecutionErrorsNumerical = "node_neuron_execution_errors_numerical"
+ NodeExecutionErrorsTransient = "node_neuron_execution_errors_transient"
+ NodeExecutionErrorsModel = "node_neuron_execution_errors_model"
+ NodeExecutionErrorsRuntime = "node_neuron_execution_errors_runtime"
+ NodeExecutionErrorsHardware = "node_neuron_execution_errors_hardware"
+ NodeExecutionStatusCompleted = "node_neuron_execution_status_completed" // execution_status_* names: six entries
+ NodeExecutionStatusTimedOut = "node_neuron_execution_status_timed_out"
+ NodeExecutionStatusCompletedWithErr = "node_neuron_execution_status_completed_with_err"
+ NodeExecutionStatusCompletedWithNumErr = "node_neuron_execution_status_completed_with_num_err"
+ NodeExecutionStatusIncorrectInput = "node_neuron_execution_status_incorrect_input"
+ NodeExecutionStatusFailedToQueue = "node_neuron_execution_status_failed_to_queue"
+ NodeNeuronDeviceRuntimeMemoryUsed = "node_neurondevice_runtime_memory_used_bytes" // the only name in this list ending in _bytes
+ NodeNeuronExecutionLatency = "node_neuron_execution_latency"
+)
diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/node_neuron.json b/test/metric_value_benchmark/eks_resources/test_schemas/node_neuron.json
new file mode 100644
index 000000000..049396574
--- /dev/null
+++ b/test/metric_value_benchmark/eks_resources/test_schemas/node_neuron.json
@@ -0,0 +1,42 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "title": "structured log schema",
+ "description": "json schema for the cloudwatch agent k8s structured log",
+ "type": "object",
+ "properties": {
+ "CloudWatchMetrics": {},
+ "ClusterName": {},
+ "InstanceId": {},
+ "InstanceType": {},
+ "NodeName": {},
+ "OTelLib": {},
+ "Timestamp": {},
+ "Type": {},
+ "Version": {},
+ "availability_zone": {},
+ "http.scheme": {},
+ "k8s.namespace.name": {},
+ "kubernetes": {},
+ "net.host.name": {},
+ "net.host.port": {},
+ "region": {},
+ "runtime_tag": {},
+ "service.instance.id": {},
+ "service.name": {},
+ "subnet_id": {},
+ "node_neuron_execution_errors_total": {},
+ "node_neuron_execution_latency": {},
+ "node_neurondevice_runtime_memory_used_bytes": {}
+ },
+ "required": [
+ "ClusterName",
+ "InstanceId",
+ "InstanceType",
+ "NodeName",
+ "Timestamp",
+ "Type",
+ "Version",
+ "CloudWatchMetrics",
+ "runtime_tag"
+ ]
+}
\ No newline at end of file
diff --git a/test/metric_value_benchmark/eks_resources/test_schemas/node_neuroncore.json b/test/metric_value_benchmark/eks_resources/test_schemas/node_neuroncore.json
new file mode 100644
index 000000000..6c37ead8f
--- /dev/null
+++ b/test/metric_value_benchmark/eks_resources/test_schemas/node_neuroncore.json
@@ -0,0 +1,56 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "title": "structured log schema",
+ "description": "json schema for the cloudwatch agent k8s structured log",
+ "type": "object",
+ "properties": {
+ "CloudWatchMetrics": {},
+ "ClusterName": {},
+ "ContainerName": {},
+ "FullPodName": {},
+ "InstanceId": {},
+ "InstanceType": {},
+ "K8sPodName": {},
+ "Namespace": {},
+ "NeuronCore": {},
+ "NeuronDevice": {},
+ "NodeName": {},
+ "OTelLib": {},
+ "PodName": {},
+ "Timestamp": {},
+ "Type": {},
+ "Version": {},
+ "availability_zone": {},
+ "http.scheme": {},
+ "k8s.namespace.name": {},
+ "kubernetes": {},
+ "net.host.name": {},
+ "net.host.port": {},
+ "region": {},
+ "runtime_tag": {},
+ "service.instance.id": {},
+ "service.name": {},
+ "subnet_id": {},
+ "node_neuroncore_memory_usage_constants": {},
+ "node_neuroncore_memory_usage_model_code": {},
+ "node_neuroncore_memory_usage_model_shared_scratchpad": {},
+ "node_neuroncore_memory_usage_runtime_memory": {},
+ "node_neuroncore_memory_usage_tensors": {},
+ "node_neuroncore_memory_usage_total": {},
+ "node_neuroncore_utilization": {}
+ },
+ "required": [
+ "ClusterName",
+ "InstanceId",
+ "InstanceType",
+ "Namespace",
+ "NeuronCore",
+ "NeuronDevice",
+ "NodeName",
+ "Timestamp",
+ "Type",
+ "Version",
+ "CloudWatchMetrics",
+ "runtime_tag"
+ ]
+}
\ No newline at end of file
diff --git a/test/metric_value_benchmark/eks_resources/util.go b/test/metric_value_benchmark/eks_resources/util.go
index bff8892a7..65c89a6ce 100644
--- a/test/metric_value_benchmark/eks_resources/util.go
+++ b/test/metric_value_benchmark/eks_resources/util.go
@@ -46,6 +46,10 @@ var (
eksNodeGpuSchema string
//go:embed test_schemas/cluster_gpu.json
eksClusterGpuSchema string
+ //go:embed test_schemas/node_neuroncore.json
+ eksNodeNeuronCoreSchema string
+ //go:embed test_schemas/node_neuron.json
+ eksNodeNeuronSchema string
EksClusterValidationMap = map[string]string{
"Cluster": eksClusterSchema,
@@ -66,6 +70,8 @@ var (
"PodGPU": eksPodGpuSchema,
"NodeGPU": eksNodeGpuSchema,
"ClusterGPU": eksClusterGpuSchema,
+ "NodeNeuronCore": eksNodeNeuronCoreSchema,
+ "NodeNeuron": eksNodeNeuronSchema,
}
)