diff --git a/.gitignore b/.gitignore index ed0aa65ed..214284244 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ terraform.* **/.terraform/* /test/**/final_*.yml coverage.txt -generator/resources/*complete*.json \ No newline at end of file +generator/resources/*complete*.json +terraform/eks/e2e/helm-charts \ No newline at end of file diff --git a/environment/metadata.go b/environment/metadata.go index 2cf99ad48..f7c03fc77 100644 --- a/environment/metadata.go +++ b/environment/metadata.go @@ -47,6 +47,7 @@ type MetaData struct { AmpWorkspaceId string Region string K8sVersion string + Destroy bool HelmChartsBranch string CloudwatchAgentRepository string CloudwatchAgentTag string @@ -87,6 +88,7 @@ type MetaDataStrings struct { AmpWorkspaceId string Region string K8sVersion string + Destroy string HelmChartsBranch string CloudwatchAgentRepository string CloudwatchAgentTag string @@ -136,6 +138,7 @@ func registerEKSData(d *MetaDataStrings) { func registerEKSE2ETestData(dataString *MetaDataStrings) { flag.StringVar(&(dataString.Region), "region", "", "AWS region") flag.StringVar(&(dataString.K8sVersion), "k8s_version", "", "Kubernetes version") + flag.StringVar(&(dataString.Destroy), "destroy", "false", "Whether to run in destroy mode (true/false)") flag.StringVar(&(dataString.HelmChartsBranch), "helm_charts_branch", "", "Helm charts branch") flag.StringVar(&(dataString.CloudwatchAgentRepository), "cloudwatch_agent_repository", "", "CloudWatch Agent repository") flag.StringVar(&(dataString.CloudwatchAgentTag), "cloudwatch_agent_tag", "", "CloudWatch Agent tag") @@ -318,6 +321,7 @@ func GetEnvironmentMetaData() *MetaData { metaDataStorage.AmpWorkspaceId = registeredMetaDataStrings.AmpWorkspaceId metaDataStorage.Region = registeredMetaDataStrings.Region metaDataStorage.K8sVersion = registeredMetaDataStrings.K8sVersion + metaDataStorage.Destroy = registeredMetaDataStrings.Destroy == "true" metaDataStorage.HelmChartsBranch = 
registeredMetaDataStrings.HelmChartsBranch metaDataStorage.CloudwatchAgentRepository = registeredMetaDataStrings.CloudwatchAgentRepository metaDataStorage.CloudwatchAgentTag = registeredMetaDataStrings.CloudwatchAgentTag diff --git a/generator/resources/eks_e2e_jmx_test_matrix.json b/generator/resources/eks_e2e_jmx_test_matrix.json new file mode 100644 index 000000000..d484997f7 --- /dev/null +++ b/generator/resources/eks_e2e_jmx_test_matrix.json @@ -0,0 +1,5 @@ +[ + { + "nodes": 1 + } +] \ No newline at end of file diff --git a/generator/resources/eks_e2e_test_matrix.json b/generator/resources/eks_e2e_test_matrix.json deleted file mode 100644 index 81ed61090..000000000 --- a/generator/resources/eks_e2e_test_matrix.json +++ /dev/null @@ -1,5 +0,0 @@ -[ - { - - } -] \ No newline at end of file diff --git a/generator/test_case_generator.go b/generator/test_case_generator.go index dc9a5d308..4daaae1d2 100644 --- a/generator/test_case_generator.go +++ b/generator/test_case_generator.go @@ -30,6 +30,7 @@ type matrixRow struct { CaCertPath string `json:"caCertPath"` ValuesPerMinute int `json:"values_per_minute"` // Number of metrics to be sent or number of log lines to write K8sVersion string `json:"k8sVersion"` + Nodes int `json:"nodes"` TerraformDir string `json:"terraform_dir"` UseSSM bool `json:"useSSM"` ExcludedTests string `json:"excludedTests"` @@ -247,7 +248,7 @@ var testTypeToTestConfig = map[string][]testConfig{ } var testTypeToTestConfigE2E = map[string][]testConfig{ - "eks_e2e": { + "eks_e2e_jmx": { {testDir: "../../../test/e2e/jmx"}, }, } diff --git a/terraform/eks/e2e/main.tf b/terraform/eks/e2e/main.tf index 064e0b21b..bfd0c8dfe 100644 --- a/terraform/eks/e2e/main.tf +++ b/terraform/eks/e2e/main.tf @@ -39,10 +39,10 @@ resource "aws_eks_node_group" "this" { min_size = var.nodes } - ami_type = "AL2_x86_64" + ami_type = var.ami_type capacity_type = "ON_DEMAND" disk_size = 20 - instance_types = ["t3a.medium"] + instance_types = [var.instance_type] depends_on = 
[ aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy, @@ -52,6 +52,15 @@ resource "aws_eks_node_group" "this" { ] } +resource "aws_security_group_rule" "nodeport_inbound" { + type = "ingress" + from_port = 30080 + to_port = 30080 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + security_group_id = aws_eks_cluster.this.vpc_config[0].cluster_security_group_id +} + resource "aws_iam_role" "node_role" { name = "${local.cluster_name}-Worker-Role-${module.common.testing_id}" @@ -109,6 +118,12 @@ resource "null_resource" "helm_charts" { resource "null_resource" "validator" { depends_on = [aws_eks_cluster.this, aws_eks_node_group.this, null_resource.helm_charts] + triggers = { + cluster_name = aws_eks_cluster.this.name + region = var.region + test_dir = var.test_dir + } + provisioner "local-exec" { command = <<-EOT echo "Validating K8s resources and metrics" @@ -134,4 +149,17 @@ resource "null_resource" "validator" { -sample_app="${var.test_dir}/${var.sample_app}" EOT } + + provisioner "local-exec" { + when = destroy + command = <<-EOT + echo "Running cleanup for K8s resources" + go test -timeout 30m -v ${self.triggers.test_dir} \ + -destroy=true \ + -region=${self.triggers.region} \ + -eksClusterName=${self.triggers.cluster_name} \ + -computeType=EKS \ + -eksDeploymentStrategy=DAEMON + EOT + } } diff --git a/terraform/eks/e2e/variables.tf b/terraform/eks/e2e/variables.tf index ac0d38ebc..016bbb318 100644 --- a/terraform/eks/e2e/variables.tf +++ b/terraform/eks/e2e/variables.tf @@ -21,6 +21,16 @@ variable "nodes" { default = 1 } +variable "ami_type" { + type = string + default = "AL2_x86_64" +} + +variable "instance_type" { + type = string + default = "t3a.medium" +} + variable "helm_charts_branch" { type = string default = "main" diff --git a/test/e2e/jmx/jmx_test.go b/test/e2e/jmx/jmx_test.go new file mode 100644 index 000000000..a2d5ccb2b --- /dev/null +++ b/test/e2e/jmx/jmx_test.go @@ -0,0 +1,364 @@ +// Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved. +// SPDX-License-Identifier: MIT + +package main + +import ( + "context" + "flag" + "fmt" + "net/http" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" + + "github.com/aws/amazon-cloudwatch-agent-test/environment" + "github.com/aws/amazon-cloudwatch-agent-test/util/awsservice" + "github.com/aws/amazon-cloudwatch-agent-test/util/common" +) + +//------------------------------------------------------------------------------ +// Constants and Variables +//------------------------------------------------------------------------------ + +const ( + wait = 5 * time.Minute + interval = 30 * time.Second +) + +var ( + nodeNames []string + env *environment.MetaData +) + +//------------------------------------------------------------------------------ +// Test Registry Maps +//------------------------------------------------------------------------------ + +var testMetricsRegistry = map[string][]func(*testing.T){ + "jvm_tomcat.json": { + testTomcatMetrics, + testTomcatSessions, + }, + "kafka.json": { + testKafkaMetrics, + }, + "containerinsights.json": { + testContainerInsightsMetrics, + testTomcatRejectedSessions, + }, +} + +var testResourcesRegistry = []func(*testing.T, *kubernetes.Clientset){ + testAgentResources, + testJMXResources, +} + +//------------------------------------------------------------------------------ +// Test Setup +//------------------------------------------------------------------------------ + +func init() { + environment.RegisterEnvironmentMetaDataFlags() +} + +func TestMain(m *testing.M) { + flag.Parse() + + // Added this to prevent running tests when we pass in "NO_MATCH" + if flag.Lookup("test.run").Value.String() == "NO_MATCH" { + os.Exit(0) + } + env = environment.GetEnvironmentMetaData() + + // Destroy K8s resources if terraform destroy + if 
env.Destroy { + if err := common.DestroyResources(env); err != nil { + fmt.Printf("Failed to delete kubernetes resources: %v\n", err) + } + os.Exit(0) + } + + // Configure AWS clients and create K8s resources + if err := common.InitializeEnvironment(env); err != nil { + fmt.Printf("Failed to initialize environment: %v\n", err) + os.Exit(1) + } + + // Get names of nodes so they can be used as dimensions to check for metrics + eksInstances, err := awsservice.GetEKSInstances(env.EKSClusterName) + if err != nil || len(eksInstances) == 0 { + fmt.Printf("Failed to get EKS instances: %v\n", err) + os.Exit(1) + } + + for _, instance := range eksInstances { + if instance.InstanceName != nil { + nodeNames = append(nodeNames, *instance.InstanceName) + } + } + + os.Exit(m.Run()) +} + +//------------------------------------------------------------------------------ +// Main Test Functions +//------------------------------------------------------------------------------ + +func TestAll(t *testing.T) { + t.Run("Resources", func(t *testing.T) { + testResources(t) + }) + + // Don't run metric tests if resource tests fail + if !t.Failed() { + t.Run("Metrics", func(t *testing.T) { + testMetrics(t) + }) + } +} + +func testResources(t *testing.T) { + tests := testResourcesRegistry + + config, err := clientcmd.BuildConfigFromFlags("", filepath.Join(os.Getenv("HOME"), ".kube", "config")) + require.NoError(t, err, "Error building kubeconfig") + + clientset, err := kubernetes.NewForConfig(config) + require.NoError(t, err, "Error creating clientset") + + for _, testFunc := range tests { + testFunc(t, clientset) + } +} + +func testMetrics(t *testing.T) { + configFile := filepath.Base(env.AgentConfig) + tests := testMetricsRegistry[configFile] + + fmt.Println("waiting for metrics to propagate...") + time.Sleep(wait) + + for _, testFunc := range tests { + testFunc(t) + } +} + +//------------------------------------------------------------------------------ +// Resource Test Functions 
+//------------------------------------------------------------------------------ + +func testAgentResources(t *testing.T, clientset *kubernetes.Clientset) { + t.Run("verify_agent_resources", func(t *testing.T) { + daemonSet, err := clientset.AppsV1().DaemonSets("amazon-cloudwatch").Get(context.TODO(), "cloudwatch-agent", metav1.GetOptions{}) + require.NoError(t, err, "Error getting CloudWatch Agent DaemonSet") + require.NotNil(t, daemonSet, "CloudWatch Agent DaemonSet not found") + + configMap, err := clientset.CoreV1().ConfigMaps("amazon-cloudwatch").Get(context.TODO(), "cloudwatch-agent", metav1.GetOptions{}) + require.NoError(t, err, "Error getting CloudWatch Agent ConfigMap") + require.NotNil(t, configMap, "CloudWatch Agent ConfigMap not found") + + cwConfig, exists := configMap.Data["cwagentconfig.json"] + require.True(t, exists, "cwagentconfig.json not found in ConfigMap") + require.Contains(t, cwConfig, `jmx`, "JMX configuration not found in cwagentconfig.json") + + service, err := clientset.CoreV1().Services("amazon-cloudwatch").Get(context.TODO(), "cloudwatch-agent", metav1.GetOptions{}) + require.NoError(t, err, "Error getting CloudWatch Agent Service") + require.NotNil(t, service, "CloudWatch Agent Service not found") + + serviceAccount, err := clientset.CoreV1().ServiceAccounts("amazon-cloudwatch").Get(context.TODO(), "cloudwatch-agent", metav1.GetOptions{}) + require.NoError(t, err, "Error getting CloudWatch Agent Service Account") + require.NotNil(t, serviceAccount, "CloudWatch Agent Service Account not found") + }) +} + +func testJMXResources(t *testing.T, clientset *kubernetes.Clientset) { + t.Run("verify_jmx_resources", func(t *testing.T) { + deploymentName := strings.TrimSuffix(filepath.Base(env.SampleApp), ".yaml") + pods, err := clientset.CoreV1().Pods("test").List(context.TODO(), metav1.ListOptions{ + LabelSelector: fmt.Sprintf("app=%s", deploymentName), + FieldSelector: "status.phase=Running", + }) + require.NoError(t, err, "Error getting 
pods for deployment") + require.NotEmpty(t, pods.Items, "No pods found for deployment") + + var jmxTargetSystem string + switch filepath.Base(env.AgentConfig) { + case "jvm_tomcat.json", "containerinsights.json": + jmxTargetSystem = "jvm,tomcat" + case "kafka.json": + jmxTargetSystem = "kafka" + } + + requiredEnvVars := map[string]string{ + "OTEL_EXPORTER_OTLP_PROTOCOL": "http/protobuf", + "OTEL_METRICS_EXPORTER": "none", + "OTEL_LOGS_EXPORTER": "none", + "OTEL_TRACES_EXPORTER": "none", + "OTEL_AWS_JMX_EXPORTER_METRICS_ENDPOINT": "http://cloudwatch-agent.amazon-cloudwatch:4314/v1/metrics", + "OTEL_JMX_TARGET_SYSTEM": jmxTargetSystem, + "JAVA_TOOL_OPTIONS": " -javaagent:/otel-auto-instrumentation-java/javaagent.jar", + } + + for _, container := range pods.Items[0].Spec.Containers { + for _, envVar := range container.Env { + if expectedValue, exists := requiredEnvVars[envVar.Name]; exists { + require.Equal(t, expectedValue, envVar.Value, fmt.Sprintf("Unexpected value for environment variable %s in container %s", envVar.Name, container.Name)) + delete(requiredEnvVars, envVar.Name) + } + } + } + + require.Empty(t, requiredEnvVars, "Not all required environment variables were found in the pod") + }) +} + +//------------------------------------------------------------------------------ +// Metric Test Functions +//------------------------------------------------------------------------------ + +func testTomcatMetrics(t *testing.T) { + t.Run("verify_jvm_tomcat_metrics", func(t *testing.T) { + validateMetrics(t, []string{ + "jvm.classes.loaded", + "jvm.gc.collections.count", + "jvm.gc.collections.elapsed", + "jvm.memory.heap.init", + "jvm.memory.heap.max", + "jvm.memory.heap.used", + "jvm.memory.heap.committed", + "jvm.memory.nonheap.init", + "jvm.memory.nonheap.max", + "jvm.memory.nonheap.used", + "jvm.memory.nonheap.committed", + "jvm.memory.pool.init", + "jvm.memory.pool.max", + "jvm.memory.pool.used", + "jvm.memory.pool.committed", + "jvm.threads.count", + 
"tomcat.traffic", + "tomcat.sessions", + "tomcat.errors", + "tomcat.request_count", + "tomcat.max_time", + "tomcat.processing_time", + "tomcat.threads", + }, "JVM_TOMCAT_E2E") + }) +} + +func testTomcatSessions(t *testing.T) { + t.Run("verify_tomcat_sessions", func(t *testing.T) { + generateTraffic(t) + time.Sleep(wait) + verifyMetricAboveZero(t, "tomcat.sessions", "JVM_TOMCAT_E2E", false) + }) +} + +func testKafkaMetrics(t *testing.T) { + t.Run("verify_kafka_metrics", func(t *testing.T) { + validateMetrics(t, []string{ + "kafka.message.count", + "kafka.request.count", + "kafka.request.failed", + "kafka.request.time.total", + "kafka.request.time.50p", + "kafka.request.time.99p", + "kafka.request.time.avg", + "kafka.consumer.fetch-rate", + "kafka.consumer.total.bytes-consumed-rate", + "kafka.consumer.total.records-consumed-rate", + "kafka.producer.io-wait-time-ns-avg", + "kafka.producer.outgoing-byte-rate", + "kafka.producer.response-rate", + }, "KAFKA_E2E") + }) +} + +func testContainerInsightsMetrics(t *testing.T) { + t.Run("verify_containerinsights_metrics", func(t *testing.T) { + validateMetrics(t, []string{ + "jvm_classes_loaded", + "jvm_threads_current", + "jvm_threads_daemon", + "java_lang_operatingsystem_totalswapspacesize", + "java_lang_operatingsystem_systemcpuload", + "java_lang_operatingsystem_processcpuload", + "java_lang_operatingsystem_freeswapspacesize", + "java_lang_operatingsystem_totalphysicalmemorysize", + "java_lang_operatingsystem_freephysicalmemorysize", + "java_lang_operatingsystem_openfiledescriptorcount", + "java_lang_operatingsystem_availableprocessors", + "jvm_memory_bytes_used", + "jvm_memory_pool_bytes_used", + "catalina_manager_activesessions", + "catalina_manager_rejectedsessions", + "catalina_globalrequestprocessor_requestcount", + "catalina_globalrequestprocessor_errorcount", + "catalina_globalrequestprocessor_processingtime", + }, "ContainerInsights/Prometheus") + }) +} + +func testTomcatRejectedSessions(t *testing.T) { + 
t.Run("verify_catalina_manager_rejectedsessions", func(t *testing.T) { + generateTraffic(t) + time.Sleep(wait) + verifyMetricAboveZero(t, "catalina_manager_rejectedsessions", "ContainerInsights/Prometheus", true) + }) +} + +//------------------------------------------------------------------------------ +// Helper Functions +//------------------------------------------------------------------------------ + +func validateMetrics(t *testing.T, metrics []string, namespace string) { + for _, metric := range metrics { + t.Run(metric, func(t *testing.T) { + awsservice.ValidateMetricWithTest(t, metric, namespace, nil, 5, interval) + }) + } +} + +func generateTraffic(t *testing.T) { + cmd := exec.Command("kubectl", "get", "nodes", "-o", "jsonpath='{.items[0].status.addresses[?(@.type==\"ExternalIP\")].address}'") + output, err := cmd.CombinedOutput() + require.NoError(t, err, "Error getting node external IP") + + nodeIP := strings.Trim(string(output), "'") + require.NotEmpty(t, nodeIP, "Node IP failed to format") + + for i := 0; i < 5; i++ { + resp, err := http.Get(fmt.Sprintf("http://%s:30080/webapp/index.jsp", nodeIP)) + if err != nil { + t.Logf("Request attempt %d failed: %v", i+1, err) + continue + } + require.NoError(t, resp.Body.Close(), "Failed to close response body") + } +} + +func verifyMetricAboveZero(t *testing.T, metricName, namespace string, containerInsights bool) { + startTime := time.Now().Add(-wait) + endTime := time.Now() + + aboveZero, err := awsservice.CheckMetricAboveZero( + metricName, + namespace, + startTime, + endTime, + 60, + nodeNames, + containerInsights, + ) + require.NoError(t, err, "Failed to check metric above zero") + require.True(t, aboveZero, fmt.Sprintf("Expected non-zero %s after applying traffic", metricName)) +} diff --git a/test/e2e/jmx/resources/cwagent_configs/containerinsights.json b/test/e2e/jmx/resources/cwagent_configs/containerinsights.json new file mode 100644 index 000000000..9ee4ef445 --- /dev/null +++ 
b/test/e2e/jmx/resources/cwagent_configs/containerinsights.json @@ -0,0 +1,10 @@ +{ + "logs": { + "metrics_collected": { + "kubernetes": { + "cluster_name": "TestCluster", + "jmx_container_insights": true + } + } + } +} \ No newline at end of file diff --git a/test/e2e/jmx/resources/cwagent_configs/jvm_tomcat.json b/test/e2e/jmx/resources/cwagent_configs/jvm_tomcat.json new file mode 100644 index 000000000..ec03d2641 --- /dev/null +++ b/test/e2e/jmx/resources/cwagent_configs/jvm_tomcat.json @@ -0,0 +1,40 @@ +{ + "metrics": { + "namespace": "JVM_TOMCAT_E2E", + "metrics_collected": { + "jmx": { + "jvm": { + "measurement": [ + "jvm.classes.loaded", + "jvm.gc.collections.count", + "jvm.gc.collections.elapsed", + "jvm.memory.heap.init", + "jvm.memory.heap.max", + "jvm.memory.heap.used", + "jvm.memory.heap.committed", + "jvm.memory.nonheap.init", + "jvm.memory.nonheap.max", + "jvm.memory.nonheap.used", + "jvm.memory.nonheap.committed", + "jvm.memory.pool.init", + "jvm.memory.pool.max", + "jvm.memory.pool.used", + "jvm.memory.pool.committed", + "jvm.threads.count" + ] + }, + "tomcat": { + "measurement": [ + "tomcat.sessions", + "tomcat.errors", + "tomcat.request_count", + "tomcat.max_time", + "tomcat.processing_time", + "tomcat.traffic", + "tomcat.threads" + ] + } + } + } + } +} \ No newline at end of file diff --git a/test/e2e/jmx/resources/cwagent_configs/kafka.json b/test/e2e/jmx/resources/cwagent_configs/kafka.json new file mode 100644 index 000000000..d70fb47f0 --- /dev/null +++ b/test/e2e/jmx/resources/cwagent_configs/kafka.json @@ -0,0 +1,34 @@ +{ + "metrics": { + "namespace": "KAFKA_E2E", + "metrics_collected": { + "jmx": { + "kafka": { + "measurement": [ + "kafka.message.count", + "kafka.request.count", + "kafka.request.failed", + "kafka.request.time.total", + "kafka.request.time.50p", + "kafka.request.time.99p", + "kafka.request.time.avg" + ] + }, + "kafka-producer": { + "measurement": [ + "kafka.producer.io-wait-time-ns-avg", + 
"kafka.producer.outgoing-byte-rate", + "kafka.producer.response-rate" + ] + }, + "kafka-consumer": { + "measurement": [ + "kafka.consumer.total.records-consumed-rate", + "kafka.consumer.total.bytes-consumed-rate", + "kafka.consumer.fetch-rate" + ] + } + } + } + } +} diff --git a/test/e2e/jmx/resources/sample_apps/kafka.yaml b/test/e2e/jmx/resources/sample_apps/kafka.yaml new file mode 100644 index 000000000..1ef7fcaaa --- /dev/null +++ b/test/e2e/jmx/resources/sample_apps/kafka.yaml @@ -0,0 +1,168 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: test +--- +apiVersion: v1 +kind: Service +metadata: + name: zookeeper-service + namespace: test +spec: + selector: + app: zookeeper + ports: + - port: 2181 + targetPort: 2181 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: zookeeper + namespace: test +spec: + replicas: 1 + selector: + matchLabels: + app: zookeeper + template: + metadata: + labels: + app: zookeeper + spec: + containers: + - name: zookeeper + image: public.ecr.aws/l9b8e0i6/zookeeper:latest + ports: + - containerPort: 2181 + env: + - name: ZOOKEEPER_CLIENT_PORT + value: "2181" + readinessProbe: + tcpSocket: + port: 2181 + initialDelaySeconds: 10 + periodSeconds: 10 + livenessProbe: + tcpSocket: + port: 2181 + initialDelaySeconds: 10 + periodSeconds: 10 +--- +apiVersion: v1 +kind: Service +metadata: + name: kafka-service + namespace: test +spec: + selector: + app: kafka + ports: + - port: 9092 + targetPort: 9092 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kafka + namespace: test +spec: + replicas: 1 + selector: + matchLabels: + app: kafka + template: + metadata: + labels: + app: kafka + annotations: + instrumentation.opentelemetry.io/inject-java: "true" + cloudwatch.aws.amazon.com/inject-jmx-kafka: "true" + spec: + initContainers: + - name: wait-for-zookeeper + image: busybox + command: ["/bin/sh"] + args: ["-c", "until nc -z zookeeper-service 2181; do echo waiting for zookeeper; sleep 2; done"] + containers: + - name: 
kafka + image: public.ecr.aws/l9b8e0i6/kafka:latest + ports: + - containerPort: 9092 + env: + - name: KAFKA_ZOOKEEPER_CONNECT + value: "zookeeper-service:2181" + - name: KAFKA_LISTENERS + value: "PLAINTEXT://:9092" + - name: KAFKA_ADVERTISED_LISTENERS + value: "PLAINTEXT://kafka-service:9092" + - name: KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR + value: "1" + readinessProbe: + tcpSocket: + port: 9092 + initialDelaySeconds: 30 + periodSeconds: 10 + livenessProbe: + tcpSocket: + port: 9092 + initialDelaySeconds: 30 + periodSeconds: 10 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kafka-producer + namespace: test +spec: + replicas: 1 + selector: + matchLabels: + app: kafka-producer + template: + metadata: + labels: + app: kafka-producer + annotations: + instrumentation.opentelemetry.io/inject-java: "true" + cloudwatch.aws.amazon.com/inject-jmx-kafka-producer: "true" + spec: + initContainers: + - name: wait-for-kafka + image: busybox + command: ["/bin/sh"] + args: ["-c", "until nc -z kafka-service 9092; do echo waiting for kafka; sleep 2; done"] + containers: + - name: kafka-producer + image: public.ecr.aws/l9b8e0i6/kafka:latest + command: ["/bin/sh"] + args: ["-c", "yes | nohup kafka-console-producer.sh --topic quickstart-events --bootstrap-server kafka-service:9092"] +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kafka-consumer + namespace: test +spec: + replicas: 1 + selector: + matchLabels: + app: kafka-consumer + template: + metadata: + labels: + app: kafka-consumer + annotations: + instrumentation.opentelemetry.io/inject-java: "true" + cloudwatch.aws.amazon.com/inject-jmx-kafka-consumer: "true" + spec: + initContainers: + - name: wait-for-kafka + image: busybox + command: ["/bin/sh"] + args: ["-c", "until nc -z kafka-service 9092; do echo waiting for kafka; sleep 2; done"] + containers: + - name: kafka-consumer + image: public.ecr.aws/l9b8e0i6/kafka:latest + command: ["/bin/sh"] + args: ["-c", "kafka-console-consumer.sh --topic 
quickstart-events --from-beginning --bootstrap-server kafka-service:9092"] \ No newline at end of file diff --git a/test/e2e/jmx/resources/sample_apps/tomcat.yaml b/test/e2e/jmx/resources/sample_apps/tomcat.yaml new file mode 100644 index 000000000..1ddb8dc4a --- /dev/null +++ b/test/e2e/jmx/resources/sample_apps/tomcat.yaml @@ -0,0 +1,43 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: test +--- +apiVersion: v1 +kind: Service +metadata: + name: tomcat-service + namespace: test +spec: + type: NodePort + selector: + app: tomcat + ports: + - port: 80 + targetPort: 8080 + nodePort: 30080 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tomcat + namespace: test +spec: + replicas: 1 + selector: + matchLabels: + app: tomcat + template: + metadata: + labels: + app: tomcat + annotations: + instrumentation.opentelemetry.io/inject-java: "true" + cloudwatch.aws.amazon.com/inject-jmx-jvm: "true" + cloudwatch.aws.amazon.com/inject-jmx-tomcat: "true" + spec: + containers: + - name: tomcat-container + image: public.ecr.aws/l9b8e0i6/tomcat:latest + ports: + - containerPort: 8080 \ No newline at end of file diff --git a/util/awsservice/cloudwatchmetrics.go b/util/awsservice/cloudwatchmetrics.go index 59ef886b2..2b94e2696 100644 --- a/util/awsservice/cloudwatchmetrics.go +++ b/util/awsservice/cloudwatchmetrics.go @@ -134,6 +134,81 @@ func GetMetricStatistics( return CwmClient.GetMetricStatistics(ctx, &metricStatsInput) } +func CheckMetricAboveZero( + metricName string, + namespace string, + startTime time.Time, + endTime time.Time, + periodInSeconds int32, + nodeNames []string, + containerInsights bool, +) (bool, error) { + metrics, err := CwmClient.ListMetrics(ctx, &cloudwatch.ListMetricsInput{ + MetricName: aws.String(metricName), + Namespace: aws.String(namespace), + RecentlyActive: "PT3H", + }) + + if err != nil { + return false, err + } + + if len(metrics.Metrics) == 0 { + return false, fmt.Errorf("no metrics found for %s", metricName) + } + + for _, metric 
:= range metrics.Metrics { + // Skip node name check if containerInsights is true + if !containerInsights { + var nodeNameMatch bool + var nodeName string + for _, dim := range metric.Dimensions { + if *dim.Name == "k8s.node.name" { + nodeName = *dim.Value + for _, name := range nodeNames { + if nodeName == name { + nodeNameMatch = true + break + } + } + break + } + } + if !nodeNameMatch { + continue + } + log.Printf("Checking metric: %s for node: %s", *metric.MetricName, nodeName) + } + + data, err := GetMetricStatistics( + metricName, + namespace, + metric.Dimensions, + startTime, + endTime, + periodInSeconds, + []types.Statistic{types.StatisticMaximum}, + nil, + ) + + if err != nil { + log.Printf("Error getting statistics for metric with dimensions %v: %v", metric.Dimensions, err) + continue + } + + for _, datapoint := range data.Datapoints { + if *datapoint.Maximum > 0 { + if !containerInsights { + log.Printf("Found value above zero") + } + return true, nil + } + } + } + + return false, nil +} + // GetMetricData takes the metric name, metric dimension and metric namespace and return the query metrics func GetMetricData(metricDataQueries []types.MetricDataQuery, startTime, endTime time.Time) (*cloudwatch.GetMetricDataOutput, error) { getMetricDataInput := cloudwatch.GetMetricDataInput{ diff --git a/util/common/kubernetes.go b/util/common/kubernetes.go new file mode 100644 index 000000000..5645894e2 --- /dev/null +++ b/util/common/kubernetes.go @@ -0,0 +1,141 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT + +package common + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + + "github.com/aws/amazon-cloudwatch-agent-test/environment" + "github.com/aws/amazon-cloudwatch-agent-test/util/awsservice" +) + +//------------------------------------------------------------------------------ +// Environment Setup +//------------------------------------------------------------------------------ + +func InitializeEnvironment(env *environment.MetaData) error { + if env.Region != "us-west-2" { + if err := awsservice.ConfigureAWSClients(env.Region); err != nil { + return fmt.Errorf("failed to reconfigure AWS clients: %v", err) + } + fmt.Printf("AWS clients reconfigured to use region: %s\n", env.Region) + } else { + fmt.Printf("Using default testing region: us-west-2\n") + } + + fmt.Println("Applying K8s resources...") + if err := ApplyResources(env); err != nil { + return fmt.Errorf("failed to apply K8s resources: %v", err) + } + + return nil +} + +//------------------------------------------------------------------------------ +// K8s Resource Management Functions +//------------------------------------------------------------------------------ + +func ApplyResources(env *environment.MetaData) error { + updateKubeconfig := exec.Command("aws", "eks", "update-kubeconfig", "--name", env.EKSClusterName) + output, err := updateKubeconfig.CombinedOutput() + if err != nil { + return fmt.Errorf("failed to update kubeconfig: %w\nOutput: %s", err, output) + } + + fmt.Println("Installing Helm release...") + helm := []string{ + "helm", "upgrade", "--install", "amazon-cloudwatch-observability", + filepath.Join("..", "..", "..", "terraform", "eks", "e2e", "helm-charts", "charts", "amazon-cloudwatch-observability"), + "--set", fmt.Sprintf("clusterName=%s", env.EKSClusterName), + "--set", fmt.Sprintf("region=%s", env.Region), + "--set", fmt.Sprintf("agent.image.repository=%s", env.CloudwatchAgentRepository), + "--set", 
fmt.Sprintf("agent.image.tag=%s", env.CloudwatchAgentTag), + "--set", fmt.Sprintf("agent.image.repositoryDomainMap.public=%s", env.CloudwatchAgentRepositoryURL), + "--set", fmt.Sprintf("manager.image.repository=%s", env.CloudwatchAgentOperatorRepository), + "--set", fmt.Sprintf("manager.image.tag=%s", env.CloudwatchAgentOperatorTag), + "--set", fmt.Sprintf("manager.image.repositoryDomainMap.public=%s", env.CloudwatchAgentOperatorRepositoryURL), + "--namespace", "amazon-cloudwatch", + "--create-namespace", + } + + if env.AgentConfig != "" { + agentConfigContent, err := os.ReadFile(env.AgentConfig) + if err != nil { + return fmt.Errorf("failed to read agent config file: %w", err) + } + helm = append(helm, "--set-json", fmt.Sprintf("agent.config=%s", string(agentConfigContent))) + } + + helmUpgrade := exec.Command(helm[0], helm[1:]...) + helmUpgrade.Stdout = os.Stdout + helmUpgrade.Stderr = os.Stderr + if err := helmUpgrade.Run(); err != nil { + return fmt.Errorf("failed to install Helm release: %w", err) + } + + fmt.Println("Waiting for CloudWatch Agent Operator to initialize...") + wait := exec.Command("kubectl", "wait", "--for=condition=available", "--timeout=60s", "deployment/amazon-cloudwatch-observability-controller-manager", "-n", "amazon-cloudwatch") + output, err = wait.CombinedOutput() + if err != nil { + return fmt.Errorf("failed to wait for operator deployment: %w\nOutput: %s", err, output) + } + + deploymentName := strings.TrimSuffix(filepath.Base(env.SampleApp), ".yaml") + + apply := exec.Command("kubectl", "apply", "-f", env.SampleApp) + output, err = apply.CombinedOutput() + if err != nil { + return fmt.Errorf("failed to apply sample app: %w\nOutput: %s", err, output) + } + + fmt.Println("Waiting for Sample Application to initialize...") + wait = exec.Command("kubectl", "wait", "--for=condition=available", "--timeout=300s", fmt.Sprintf("deployment/%s", deploymentName), "-n", "test") + output, err = wait.CombinedOutput() + if err != nil { + return 
fmt.Errorf("failed to wait for deployment %s: %w\nOutput: %s", deploymentName, err, output) + } + + return nil +} + +func DestroyResources(env *environment.MetaData) error { + updateKubeconfig := exec.Command("aws", "eks", "update-kubeconfig", "--name", env.EKSClusterName) + output, err := updateKubeconfig.CombinedOutput() + if err != nil { + return fmt.Errorf("failed to update kubeconfig: %w\nOutput: %s", err, output) + } + + var errors []error + + fmt.Println("Deleting test namespace...") + deleteCmd := exec.Command("kubectl", "delete", "namespace", "test", "--timeout=60s") + output, err = deleteCmd.CombinedOutput() + + // We don't want to consider not finding the namespace to be an error since that's the outcome we want + if err != nil && !strings.Contains(string(output), "not found") { + errors = append(errors, fmt.Errorf("failed to delete test namespace: %w\nOutput: %s", err, output)) + } + + fmt.Println("Uninstalling Helm release...") + helm := []string{ + "helm", "uninstall", "amazon-cloudwatch-observability", "--namespace", "amazon-cloudwatch", + } + + helmUninstall := exec.Command(helm[0], helm[1:]...) + helmUninstall.Stdout = os.Stdout + helmUninstall.Stderr = os.Stderr + if err := helmUninstall.Run(); err != nil { + errors = append(errors, fmt.Errorf("failed to uninstall Helm release: %w", err)) + } + + if len(errors) > 0 { + return fmt.Errorf("cleanup errors: %v", errors) + } + + return nil +}