diff --git a/test/metric/metric_list_query.go b/test/metric/metric_list_query.go
index 3bc9099a4..8a3804efe 100644
--- a/test/metric/metric_list_query.go
+++ b/test/metric/metric_list_query.go
@@ -31,18 +31,54 @@ func (n *MetricListFetcher) Fetch(namespace, metricName string, dimensions []typ
     listMetricInput := cloudwatch.ListMetricsInput{
         Namespace:  aws.String(namespace),
-        MetricName: aws.String(metricName),
         Dimensions: dims,
     }
+    if len(metricName) > 0 {
+        listMetricInput.MetricName = aws.String(metricName)
+    }
 
     log.Printf("Metric data input: namespace %v, name %v", namespace, metricName)
 
+    var metrics []types.Metric
+    for {
+        // get a complete list of metrics with given dimensions
+        output, err := awsservice.CwmClient.ListMetrics(context.Background(), &listMetricInput)
+        if err != nil {
+            return nil, fmt.Errorf("Error getting metric data %v", err)
+        }
+        metrics = append(metrics, output.Metrics...)
+        // nil or empty nextToken means there is no more data to be fetched
+        nextToken := output.NextToken
+        if nextToken == nil || *nextToken == "" {
+            break
+        }
+        listMetricInput.NextToken = nextToken
+    }
+    log.Printf("total number of metrics fetched: %v", len(metrics))
+    return metrics, nil
+}
+
+func (n *MetricListFetcher) FetchByDimension(namespace string, dimensions []types.Dimension) ([]types.Metric, error) {
+    var dims []types.DimensionFilter
+    for _, dim := range dimensions {
+        dims = append(dims, types.DimensionFilter{
+            Name:  dim.Name,
+            Value: dim.Value,
+        })
+    }
+
+    listMetricInput := cloudwatch.ListMetricsInput{
+        Namespace:  aws.String(namespace),
+        Dimensions: dims,
+    }
+
+    log.Printf("Metric data input: namespace %v, dimensions %v", namespace, fmt.Sprint(&dims))
     output, err := awsservice.CwmClient.ListMetrics(context.Background(), &listMetricInput)
     if err != nil {
         return nil, fmt.Errorf("Error getting metric data %v", err)
     }
 
-    log.Printf("Metrics fetched : %s", fmt.Sprint(output))
+    log.Printf("Metrics fetched : %v", output.Metrics)
     return output.Metrics, nil
 }
diff --git a/test/metric/stat.go b/test/metric/stat.go
index 763028566..d633985d3 100644
--- a/test/metric/stat.go
+++ b/test/metric/stat.go
@@ -13,4 +13,6 @@ const (
     MAXUMUM                  Statistics = "Maxmimum"
     SUM                      Statistics = "Sum"
     HighResolutionStatPeriod = 10
+
+    MinuteStatPeriod = 60
 )
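Note: a minimal, illustrative sketch of how the paginated Fetch above could be driven from a test. The cluster name is a placeholder, not taken from this change; passing an empty metric name makes the new NextToken loop return every metric in the namespace that carries the given dimension.

package main

import (
    "log"

    "github.com/aws/aws-sdk-go-v2/aws"
    "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types"

    "github.com/aws/amazon-cloudwatch-agent-test/test/metric"
)

func main() {
    fetcher := metric.MetricListFetcher{}
    dims := []types.Dimension{
        // hypothetical cluster name, for illustration only
        {Name: aws.String("ClusterName"), Value: aws.String("example-cluster")},
    }
    // Empty metric name: list all ContainerInsights metrics for the cluster,
    // following NextToken until the result set is exhausted.
    metrics, err := fetcher.Fetch("ContainerInsights", "", dims)
    if err != nil {
        log.Fatalf("failed to list metrics: %v", err)
    }
    log.Printf("fetched %d metrics", len(metrics))
}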
diff --git a/test/metric_value_benchmark/eks_daemonset_test.go b/test/metric_value_benchmark/eks_daemonset_test.go
index 35e4298a7..ca01673c6 100644
--- a/test/metric_value_benchmark/eks_daemonset_test.go
+++ b/test/metric_value_benchmark/eks_daemonset_test.go
@@ -10,14 +10,16 @@ import (
     "errors"
     "fmt"
     "log"
+    "math/rand"
+    "sort"
+    "strings"
     "time"
 
+    "github.com/aws/aws-sdk-go-v2/aws"
     "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types"
-    "golang.org/x/exp/slices"
 
     "github.com/aws/amazon-cloudwatch-agent-test/environment"
     "github.com/aws/amazon-cloudwatch-agent-test/test/metric"
-    "github.com/aws/amazon-cloudwatch-agent-test/test/metric/dimension"
     "github.com/aws/amazon-cloudwatch-agent-test/test/metric_value_benchmark/eks_resources"
     "github.com/aws/amazon-cloudwatch-agent-test/test/status"
     "github.com/aws/amazon-cloudwatch-agent-test/test/test_runner"
@@ -25,22 +27,17 @@ import (
 )
 
 const containerInsightsNamespace = "ContainerInsights"
-
-// list of metrics with more dimensions e.g. PodName and Namespace
-var metricsWithMoreDimensions = []string{"pod_number_of_container_restarts"}
+const gpuMetricIndicator = "_gpu_"
 
 type EKSDaemonTestRunner struct {
     test_runner.BaseTestRunner
-    env *environment.MetaData
+    testName string
+    env      *environment.MetaData
 }
 
 func (e *EKSDaemonTestRunner) Validate() status.TestGroupResult {
-    metrics := e.GetMeasuredMetrics()
-    testResults := make([]status.TestResult, 0)
-    for _, name := range metrics {
-        testResults = append(testResults, e.validateInstanceMetrics(name))
-    }
-
+    var testResults []status.TestResult
+    testResults = append(testResults, validateMetrics(e.env, gpuMetricIndicator, eks_resources.ExpectedDimsToMetrics)...)
     testResults = append(testResults, e.validateLogs(e.env))
     return status.TestGroupResult{
         Name:        e.GetTestName(),
@@ -48,54 +45,143 @@ func (e *EKSDaemonTestRunner) Validate() status.TestGroupResult {
     }
 }
 
-func (e *EKSDaemonTestRunner) validateInstanceMetrics(name string) status.TestResult {
-    testResult := status.TestResult{
-        Name:   name,
-        Status: status.FAILED,
+const (
+    dimDelimiter               = "-"
+    ContainerInsightsNamespace = "ContainerInsights"
+)
+
+type dimToMetrics struct {
+    // dimension keys joined with dimDelimiter (-), e.g. ClusterName-Namespace
+    dimStr string
+    // metric names mapped to their dimension sets (with values); used for metric data validation
+    metrics map[string][][]types.Dimension
+}
+
+func validateMetrics(env *environment.MetaData, metricFilter string, expectedDimsToMetrics map[string][]string) []status.TestResult {
+    var results []status.TestResult
+    dimsToMetrics := getMetricsInClusterDimension(env, metricFilter)
+    // loop through each expected dimension set and check that it exists in the cluster
+    for dims, metrics := range expectedDimsToMetrics {
+        var actual map[string][][]types.Dimension
+        // scan the fetched dimension sets until we find the one matching the expected key
+        for _, dtm := range dimsToMetrics {
+            log.Printf("dtm: %s vs dims %s", dtm.dimStr, dims)
+            if dtm.dimStr == dims {
+                actual = dtm.metrics
+                break
+            }
+        }
+        // if there are no metrics for the dimension set, mark it as failed
+        if len(actual) < 1 {
+            results = append(results, status.TestResult{
+                Name:   dims,
+                Status: status.FAILED,
+            })
+            log.Printf("ValidateMetrics failed with missing dimension set: %s", dims)
+            // keep checking the remaining dimension sets rather than failing early
+            continue
+        }
+        // verify that the metrics reported for this dimension set match the expected list
+        results = append(results, validateMetricsAvailability(dims, metrics, actual))
+        for _, m := range metrics {
+            // pick a random dimension set to test metric data so we don't have to test every dimension set
+            randIdx := rand.Intn(len(actual[m]))
+            // verify the metric's values
+            results = append(results, validateMetricValue(m, actual[m][randIdx]))
+        }
     }
+    return results
+}
 
-    dims, failed := e.DimensionFactory.GetDimensions([]dimension.Instruction{
+// Fetches all metrics in the cluster and groups them by dimension set
+func getMetricsInClusterDimension(env *environment.MetaData, metricFilter string) []dimToMetrics {
+    listFetcher := metric.MetricListFetcher{}
+    log.Printf("Fetching by cluster dimension")
+    dims := []types.Dimension{
         {
-            Key:   "ClusterName",
-            Value: dimension.UnknownDimensionValue(),
+            Name:  aws.String("ClusterName"),
+            Value: aws.String(env.EKSClusterName),
         },
-    })
-    if len(failed) > 0 {
-        log.Println("failed to get dimensions")
-        return testResult
+    }
+    metrics, err := listFetcher.Fetch(ContainerInsightsNamespace, "", dims)
+    if err != nil {
+        log.Println("failed to fetch metric list", err)
+        return nil
+    }
+    if len(metrics) < 1 {
+        log.Println("cloudwatch metric list is empty")
+        return nil
     }
 
-    // get list of metrics that has more dimensions for container insights
-    // this is to avoid adding more dimension provider for non-trivial dimensions e.g. PodName
-    listFetcher := metric.MetricListFetcher{}
-    if slices.Contains(metricsWithMoreDimensions, name) {
-        metrics, err := listFetcher.Fetch(containerInsightsNamespace, name, dims)
-        if err != nil {
-            log.Println("failed to fetch metric list", err)
-            return testResult
+    var results []dimToMetrics
+    for _, m := range metrics {
+        // filter by metric name (e.g. skip GPU metrics during this validation)
+        if metricFilter != "" && strings.Contains(*m.MetricName, metricFilter) {
+            continue
         }
-
-        if len(metrics) < 1 {
-            log.Println("metric list is empty")
-            return testResult
+        var dims []string
+        for _, d := range m.Dimensions {
+            dims = append(dims, *d.Name)
         }
-
-        // just verify 1 of returned metrics for values
-        for _, dim := range metrics[0].Dimensions {
-            // skip since it's provided by dimension provider
-            if *dim.Name == "ClusterName" {
-                continue
+        // sort dimension names so the joined key is deterministic
+        sort.Sort(sort.StringSlice(dims))
+        dimsKey := strings.Join(dims, dimDelimiter)
+        log.Printf("processing dims: %s", dimsKey)
+
+        var dtm dimToMetrics
+        for _, ele := range results {
+            if ele.dimStr == dimsKey {
+                dtm = ele
+                break
             }
+        }
+        if dtm.dimStr == "" {
+            dtm = dimToMetrics{
+                dimStr:  dimsKey,
+                metrics: make(map[string][][]types.Dimension),
+            }
+            results = append(results, dtm)
+        }
+        dtm.metrics[*m.MetricName] = append(dtm.metrics[*m.MetricName], m.Dimensions)
+    }
+    return results
+}
 
-            dims = append(dims, types.Dimension{
-                Name:  dim.Name,
-                Value: dim.Value,
-            })
+// Check that the metrics reported for a dimension set match the hard-coded map
+func validateMetricsAvailability(dims string, expected []string, actual map[string][][]types.Dimension) status.TestResult {
+    testResult := status.TestResult{
+        Name:   dims,
+        Status: status.FAILED,
+    }
+    log.Printf("expected metrics: %d, actual metrics: %d", len(expected), len(actual))
+    if compareMetrics(expected, actual) {
+        testResult.Status = status.SUCCESSFUL
+    } else {
+        log.Printf("validateMetricsAvailability failed for %s", dims)
+    }
+    return testResult
+}
+
+func compareMetrics(expected []string, actual map[string][][]types.Dimension) bool {
+    if len(expected) != len(actual) {
+        return false
+    }
+
+    for _, key := range expected {
+        if _, ok := actual[key]; !ok {
+            return false
         }
     }
+    return true
+}
 
+func validateMetricValue(name string, dims []types.Dimension) status.TestResult {
+    log.Printf("validateMetricValue with metric: %s", name)
+    testResult := status.TestResult{
+        Name:   name,
+        Status: status.FAILED,
+    }
     valueFetcher := metric.MetricValueFetcher{}
-    values, err := valueFetcher.Fetch(containerInsightsNamespace, name, dims, metric.AVERAGE, metric.HighResolutionStatPeriod)
+    values, err := valueFetcher.Fetch(containerInsightsNamespace, name, dims, metric.SAMPLE_COUNT, metric.MinuteStatPeriod)
     if err != nil {
         log.Println("failed to fetch metrics", err)
         return testResult
@@ -133,6 +219,7 @@ func (e *EKSDaemonTestRunner) validateLogs(env *environment.MetaData) status.Tes
         nil,
         &now,
         awsservice.AssertLogsNotEmpty(),
+        awsservice.AssertNoDuplicateLogs(),
         awsservice.AssertPerLog(
             awsservice.AssertLogSchema(func(message string) (string, error) {
                 var eksClusterType awsservice.EKSClusterType
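Note: the dimension-set keys built above (and the keys of ExpectedDimsToMetrics in the next file) follow a single convention: dimension names sorted alphabetically and joined with "-". A small standalone sketch of that key construction, using only the standard library:

package main

import (
    "fmt"
    "sort"
    "strings"
)

// dimKey mirrors how getMetricsInClusterDimension builds its map key:
// sort the dimension names, then join them with "-".
func dimKey(dimNames []string) string {
    sort.Strings(dimNames)
    return strings.Join(dimNames, "-")
}

func main() {
    // A pod-level metric carrying these dimensions...
    names := []string{"Namespace", "ClusterName", "PodName"}
    // ...lands under the "ClusterName-Namespace-PodName" entry of ExpectedDimsToMetrics.
    fmt.Println(dimKey(names))
}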
diff --git a/test/metric_value_benchmark/eks_resources/util.go b/test/metric_value_benchmark/eks_resources/util.go
index d4e73912a..f1a480665 100644
--- a/test/metric_value_benchmark/eks_resources/util.go
+++ b/test/metric_value_benchmark/eks_resources/util.go
@@ -3,7 +3,9 @@
 
 package eks_resources
 
-import _ "embed"
+import (
+    _ "embed"
+)
 
 var (
     //go:embed test_schemas/cluster.json
@@ -52,3 +54,199 @@ var (
         "PodNet":    eksPodNetSchema,
     }
 )
+
+// Hard coded map which lists the expected metrics in each dimension set
+var ExpectedDimsToMetrics = map[string][]string{
+    "ClusterName": {
+        "pod_number_of_containers",
+        "node_status_allocatable_pods",
+        "pod_number_of_container_restarts",
+        "node_status_condition_unknown",
+        "node_number_of_running_pods",
+        "pod_container_status_running",
+        "node_status_condition_ready",
+        "pod_status_running",
+        "node_filesystem_utilization",
+        "pod_container_status_terminated",
+        "pod_status_pending",
+        "pod_cpu_utilization",
+        "node_filesystem_inodes",
+        "node_diskio_io_service_bytes_total",
+        "node_status_condition_memory_pressure",
+        "container_cpu_utilization",
+        "service_number_of_running_pods",
+        "pod_memory_utilization_over_pod_limit",
+        "node_memory_limit",
+        "pod_cpu_request",
+        "pod_interface_network_tx_dropped",
+        "pod_status_succeeded",
+        "namespace_number_of_running_pods",
+        "pod_memory_reserved_capacity",
+        "node_diskio_io_serviced_total",
+        "pod_network_rx_bytes",
+        "node_status_capacity_pods",
+        "pod_status_unknown",
+        "cluster_failed_node_count",
+        "container_memory_utilization",
+        "node_memory_utilization",
+        "node_filesystem_inodes_free",
+        "container_memory_request",
+        "container_cpu_limit",
+        "node_memory_reserved_capacity",
+        "node_interface_network_tx_dropped",
+        "pod_cpu_utilization_over_pod_limit",
+        "container_memory_failures_total",
+        "pod_status_ready",
+        "pod_number_of_running_containers",
+        "cluster_node_count",
+        "pod_memory_request",
+        "node_cpu_utilization",
+        "cluster_number_of_running_pods",
+        "node_memory_working_set",
+        "pod_status_failed",
+        "node_status_condition_pid_pressure",
+        "pod_status_scheduled",
+        "node_number_of_running_containers",
+        "node_cpu_limit",
+        "node_status_condition_disk_pressure",
+        "pod_cpu_limit",
+        "pod_memory_limit",
+        "node_cpu_usage_total",
+        "pod_cpu_reserved_capacity",
+        "pod_network_tx_bytes",
+        "container_memory_limit",
+        "pod_memory_utilization",
+        "node_interface_network_rx_dropped",
+        "node_network_total_bytes",
+        "container_cpu_utilization_over_container_limit",
+        "pod_interface_network_rx_dropped",
+        "pod_container_status_waiting",
+        "node_cpu_reserved_capacity",
+        "container_memory_utilization_over_container_limit",
+        "container_cpu_request",
+    },
+    "ClusterName-FullPodName-Namespace-PodName": {
+        "pod_network_tx_bytes",
+        "pod_interface_network_rx_dropped",
+        "pod_cpu_limit",
+        "pod_status_succeeded",
+        "pod_container_status_waiting",
+        "pod_number_of_running_containers",
+        "pod_number_of_container_restarts",
+        "pod_status_pending",
+        "pod_status_running",
+        "pod_container_status_running",
+        "pod_memory_limit",
+        "pod_status_unknown",
+        "pod_memory_utilization_over_pod_limit",
+        "pod_cpu_request",
+        "pod_status_scheduled",
+        "pod_memory_utilization",
+        "pod_status_failed",
+        "pod_network_rx_bytes",
+        "pod_number_of_containers",
+        "pod_cpu_utilization",
+        "pod_memory_reserved_capacity",
+        "pod_status_ready",
+        "pod_container_status_terminated",
+        "pod_interface_network_tx_dropped",
+        "pod_memory_request",
+        "pod_cpu_reserved_capacity",
+        "pod_cpu_utilization_over_pod_limit",
+    },
+    "ClusterName-Namespace-PodName": {
+        "pod_interface_network_rx_dropped",
+        "pod_status_succeeded",
+        "pod_container_status_running",
+        "pod_network_rx_bytes",
+        "pod_cpu_utilization",
+        "pod_memory_utilization",
+        "pod_interface_network_tx_dropped",
+        "pod_status_ready",
+        "pod_container_status_terminated",
+        "pod_cpu_reserved_capacity",
+        "pod_memory_request",
+        "pod_status_running",
+        "pod_status_pending",
+        "pod_number_of_containers",
+        "pod_memory_utilization_over_pod_limit",
+        "pod_status_unknown",
+        "pod_cpu_limit",
+        "pod_container_status_waiting",
+        "pod_memory_reserved_capacity",
+        "pod_network_tx_bytes",
+        "pod_status_failed",
+        "pod_number_of_running_containers",
+        "pod_number_of_container_restarts",
+        "pod_cpu_request",
+        "pod_cpu_utilization_over_pod_limit",
+        "pod_status_scheduled",
+        "pod_memory_limit",
+    },
+
"namespace_number_of_running_pods", + "pod_memory_reserved_capacity", + "node_diskio_io_serviced_total", + "pod_network_rx_bytes", + "node_status_capacity_pods", + "pod_status_unknown", + "cluster_failed_node_count", + "container_memory_utilization", + "node_memory_utilization", + "node_filesystem_inodes_free", + "container_memory_request", + "container_cpu_limit", + "node_memory_reserved_capacity", + "node_interface_network_tx_dropped", + "pod_cpu_utilization_over_pod_limit", + "container_memory_failures_total", + "pod_status_ready", + "pod_number_of_running_containers", + "cluster_node_count", + "pod_memory_request", + "node_cpu_utilization", + "cluster_number_of_running_pods", + "node_memory_working_set", + "pod_status_failed", + "node_status_condition_pid_pressure", + "pod_status_scheduled", + "node_number_of_running_containers", + "node_cpu_limit", + "node_status_condition_disk_pressure", + "pod_cpu_limit", + "pod_memory_limit", + "node_cpu_usage_total", + "pod_cpu_reserved_capacity", + "pod_network_tx_bytes", + "container_memory_limit", + "pod_memory_utilization", + "node_interface_network_rx_dropped", + "node_network_total_bytes", + "container_cpu_utilization_over_container_limit", + "pod_interface_network_rx_dropped", + "pod_container_status_waiting", + "node_cpu_reserved_capacity", + "container_memory_utilization_over_container_limit", + "container_cpu_request", + }, + "ClusterName-FullPodName-Namespace-PodName": { + "pod_network_tx_bytes", + "pod_interface_network_rx_dropped", + "pod_cpu_limit", + "pod_status_succeeded", + "pod_container_status_waiting", + "pod_number_of_running_containers", + "pod_number_of_container_restarts", + "pod_status_pending", + "pod_status_running", + "pod_container_status_running", + "pod_memory_limit", + "pod_status_unknown", + "pod_memory_utilization_over_pod_limit", + "pod_cpu_request", + "pod_status_scheduled", + "pod_memory_utilization", + "pod_status_failed", + "pod_network_rx_bytes", + "pod_number_of_containers", + "pod_cpu_utilization", + "pod_memory_reserved_capacity", + "pod_status_ready", + "pod_container_status_terminated", + "pod_interface_network_tx_dropped", + "pod_memory_request", + "pod_cpu_reserved_capacity", + "pod_cpu_utilization_over_pod_limit", + }, + "ClusterName-Namespace-PodName": { + "pod_interface_network_rx_dropped", + "pod_status_succeeded", + "pod_container_status_running", + "pod_network_rx_bytes", + "pod_cpu_utilization", + "pod_memory_utilization", + "pod_interface_network_tx_dropped", + "pod_status_ready", + "pod_container_status_terminated", + "pod_cpu_reserved_capacity", + "pod_memory_request", + "pod_status_running", + "pod_status_pending", + "pod_number_of_containers", + "pod_memory_utilization_over_pod_limit", + "pod_status_unknown", + "pod_cpu_limit", + "pod_container_status_waiting", + "pod_memory_reserved_capacity", + "pod_network_tx_bytes", + "pod_status_failed", + "pod_number_of_running_containers", + "pod_number_of_container_restarts", + "pod_cpu_request", + "pod_cpu_utilization_over_pod_limit", + "pod_status_scheduled", + "pod_memory_limit", + }, + + "ClusterName-InstanceId-NodeName": { + "node_status_allocatable_pods", + "node_network_total_bytes", + "node_status_condition_unknown", + "node_interface_network_rx_dropped", + "node_number_of_running_containers", + "node_interface_network_tx_dropped", + "node_memory_utilization", + "node_cpu_limit", + "node_status_condition_disk_pressure", + "node_memory_working_set", + "node_cpu_reserved_capacity", + "node_status_condition_ready", + 
"node_filesystem_utilization", + "node_status_condition_memory_pressure", + "node_memory_limit", + "node_memory_reserved_capacity", + "node_diskio_io_serviced_total", + "node_status_condition_pid_pressure", + "node_filesystem_inodes", + "node_cpu_usage_total", + "node_number_of_running_pods", + "node_diskio_io_service_bytes_total", + "node_status_capacity_pods", + "node_filesystem_inodes_free", + "node_cpu_utilization", + }, + + "ClusterName-Namespace-Service": { + "pod_status_unknown", + "pod_memory_limit", + "pod_container_status_terminated", + "pod_status_ready", + "pod_number_of_container_restarts", + "pod_status_pending", + "pod_status_succeeded", + "pod_network_rx_bytes", + "pod_status_failed", + "pod_number_of_containers", + "pod_cpu_request", + "service_number_of_running_pods", + "pod_memory_reserved_capacity", + "pod_network_tx_bytes", + "pod_container_status_waiting", + "pod_memory_request", + "pod_status_running", + "pod_container_status_running", + "pod_cpu_reserved_capacity", + "pod_memory_utilization_over_pod_limit", + "pod_cpu_utilization", + "pod_memory_utilization", + "pod_number_of_running_containers", + "pod_status_scheduled", + }, + "ClusterName-Namespace": { + "pod_interface_network_rx_dropped", + "pod_network_rx_bytes", + "pod_cpu_utilization_over_pod_limit", + "pod_memory_utilization_over_pod_limit", + "namespace_number_of_running_pods", + "pod_memory_utilization", + "pod_interface_network_tx_dropped", + "pod_cpu_utilization", + "pod_network_tx_bytes", + }, +} diff --git a/util/awsservice/cloudwatchmetrics.go b/util/awsservice/cloudwatchmetrics.go index 998dd7961..c2fab7cac 100644 --- a/util/awsservice/cloudwatchmetrics.go +++ b/util/awsservice/cloudwatchmetrics.go @@ -91,11 +91,12 @@ func ValidateSampleCount(metricName, namespace string, dimensions []types.Dimens } dataPoints := 0 + log.Printf("These are the data points: %v", data) + log.Printf("These are the data points: %v", data.Datapoints) for _, datapoint := range data.Datapoints { dataPoints = dataPoints + int(*datapoint.SampleCount) } - log.Printf("Number of datapoints for start time %v with endtime %v and period %d is %d is inclusive between %d and %d", startTime, endTime, periodInSeconds, dataPoints, lowerBoundInclusive, upperBoundInclusive) if lowerBoundInclusive <= dataPoints && dataPoints <= upperBoundInclusive { @@ -105,22 +106,6 @@ func ValidateSampleCount(metricName, namespace string, dimensions []types.Dimens return false } -// GetMetricData takes the metric name, metric dimension and metric namespace and return the query metrics -func GetMetricData(metricDataQueries []types.MetricDataQuery, startTime, endTime time.Time) (*cloudwatch.GetMetricDataOutput, error) { - getMetricDataInput := cloudwatch.GetMetricDataInput{ - StartTime: &startTime, - EndTime: &endTime, - MetricDataQueries: metricDataQueries, - } - - data, err := CwmClient.GetMetricData(ctx, &getMetricDataInput) - if err != nil { - return nil, err - } - - return data, nil -} - func GetMetricStatistics( metricName string, namespace string, @@ -149,6 +134,22 @@ func GetMetricStatistics( return CwmClient.GetMetricStatistics(ctx, &metricStatsInput) } +// GetMetricData takes the metric name, metric dimension and metric namespace and return the query metrics +func GetMetricData(metricDataQueries []types.MetricDataQuery, startTime, endTime time.Time) (*cloudwatch.GetMetricDataOutput, error) { + getMetricDataInput := cloudwatch.GetMetricDataInput{ + StartTime: &startTime, + EndTime: &endTime, + MetricDataQueries: metricDataQueries, + } + + data, err 
diff --git a/util/awsservice/cloudwatchmetrics.go b/util/awsservice/cloudwatchmetrics.go
index 998dd7961..c2fab7cac 100644
--- a/util/awsservice/cloudwatchmetrics.go
+++ b/util/awsservice/cloudwatchmetrics.go
@@ -91,11 +91,12 @@ func ValidateSampleCount(metricName, namespace string, dimensions []types.Dimens
     }
 
     dataPoints := 0
+    log.Printf("These are the data points: %v", data)
+    log.Printf("These are the data points: %v", data.Datapoints)
     for _, datapoint := range data.Datapoints {
         dataPoints = dataPoints + int(*datapoint.SampleCount)
     }
-
     log.Printf("Number of datapoints for start time %v with endtime %v and period %d is %d is inclusive between %d and %d",
         startTime, endTime, periodInSeconds, dataPoints, lowerBoundInclusive, upperBoundInclusive)
     if lowerBoundInclusive <= dataPoints && dataPoints <= upperBoundInclusive {
@@ -105,22 +106,6 @@ func ValidateSampleCount(metricName, namespace string, dimensions []types.Dimens
     return false
 }
 
-// GetMetricData takes the metric name, metric dimension and metric namespace and return the query metrics
-func GetMetricData(metricDataQueries []types.MetricDataQuery, startTime, endTime time.Time) (*cloudwatch.GetMetricDataOutput, error) {
-    getMetricDataInput := cloudwatch.GetMetricDataInput{
-        StartTime:         &startTime,
-        EndTime:           &endTime,
-        MetricDataQueries: metricDataQueries,
-    }
-
-    data, err := CwmClient.GetMetricData(ctx, &getMetricDataInput)
-    if err != nil {
-        return nil, err
-    }
-
-    return data, nil
-}
-
 func GetMetricStatistics(
     metricName string,
     namespace string,
@@ -149,6 +134,22 @@ func GetMetricStatistics(
     return CwmClient.GetMetricStatistics(ctx, &metricStatsInput)
 }
 
+// GetMetricData takes the metric name, metric dimension and metric namespace and return the query metrics
+func GetMetricData(metricDataQueries []types.MetricDataQuery, startTime, endTime time.Time) (*cloudwatch.GetMetricDataOutput, error) {
+    getMetricDataInput := cloudwatch.GetMetricDataInput{
+        StartTime:         &startTime,
+        EndTime:           &endTime,
+        MetricDataQueries: metricDataQueries,
+    }
+
+    data, err := CwmClient.GetMetricData(ctx, &getMetricDataInput)
+    if err != nil {
+        return nil, err
+    }
+
+    return data, nil
+}
+
 func BuildDimensionFilterList(appendDimension int) []types.DimensionFilter {
     // we append dimension from 0 to max number - 2
     // then we add dimension instance id
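Note: for completeness, a hedged sketch of calling the relocated GetMetricData helper. The namespace, metric name, cluster name, and query id are placeholders for illustration; the 60-second period mirrors the new MinuteStatPeriod constant.

package main

import (
    "log"
    "time"

    "github.com/aws/aws-sdk-go-v2/aws"
    "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types"

    "github.com/aws/amazon-cloudwatch-agent-test/util/awsservice"
)

func main() {
    endTime := time.Now()
    startTime := endTime.Add(-10 * time.Minute)
    queries := []types.MetricDataQuery{
        {
            Id: aws.String("q1"),
            MetricStat: &types.MetricStat{
                Metric: &types.Metric{
                    Namespace:  aws.String("ContainerInsights"),
                    MetricName: aws.String("node_cpu_utilization"),
                    Dimensions: []types.Dimension{
                        {Name: aws.String("ClusterName"), Value: aws.String("example-cluster")},
                    },
                },
                Period: aws.Int32(60), // one data point per minute
                Stat:   aws.String("SampleCount"),
            },
        },
    }
    data, err := awsservice.GetMetricData(queries, startTime, endTime)
    if err != nil {
        log.Fatalf("GetMetricData failed: %v", err)
    }
    log.Printf("returned %d metric data results", len(data.MetricDataResults))
}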