diff --git a/generator/resources/ec2_linux_test_matrix.json b/generator/resources/ec2_linux_test_matrix.json index a32c2c8d8..44aacdaaf 100644 --- a/generator/resources/ec2_linux_test_matrix.json +++ b/generator/resources/ec2_linux_test_matrix.json @@ -79,7 +79,7 @@ { "os": "al2023", "username": "ec2-user", - "instanceType":"m7g.medium", + "instanceType":"m6g.medium", "installAgentCommand": "go run ./install/install_agent.go rpm", "ami": "cloudwatch-agent-integration-test-aarch64-al2023*", "caCertPath": "/etc/ssl/certs/ca-bundle.crt", diff --git a/generator/test_case_generator.go b/generator/test_case_generator.go index 0431f2996..e17b63e0d 100644 --- a/generator/test_case_generator.go +++ b/generator/test_case_generator.go @@ -11,6 +11,7 @@ import ( "os" "github.com/mitchellh/mapstructure" + "golang.org/x/exp/slices" ) type matrixRow struct { @@ -108,6 +109,10 @@ var testTypeToTestConfig = map[string][]testConfig{ terraformDir: "terraform/ec2/creds", targets: map[string]map[string]struct{}{"os": {"al2": {}}}, }, + { + testDir: "./test/app_signals", + targets: map[string]map[string]struct{}{"os": {"al2": {}}, "arc": {"amd64": {}}}, + }, }, /* You can only place 1 mac instance on a dedicate host a single time. @@ -215,6 +220,29 @@ var testTypeToTestConfig = map[string][]testConfig{ }, } +type partition struct { + configName string + tests []string + ami []string +} + +var partitionTests = map[string]partition{ + "commercial": { + configName: "", + tests: []string{}, + ami: []string{}, + }, + "itar": { + configName: "_itar", + tests: []string{testTypeKeyEc2Linux}, + ami: []string{"cloudwatch-agent-integration-test-aarch64-al2023*"}, + }, + "china": {configName: "_china", + tests: []string{testTypeKeyEc2Linux}, + ami: []string{"cloudwatch-agent-integration-test-aarch64-al2023*"}, + }, +} + func copyAllEC2LinuxTestForOnpremTesting() { /* Some tests need to be fixed in order to run in both environment, so for now for PoC, run one that works. 
testTypeToTestConfig["ec2_linux_onprem"] = testTypeToTestConfig[testTypeKeyEc2Linux] @@ -231,12 +259,17 @@ func main() { copyAllEC2LinuxTestForOnpremTesting() for testType, testConfigs := range testTypeToTestConfig { - testMatrix := genMatrix(testType, testConfigs) - writeTestMatrixFile(testType, testMatrix) + for _, partition := range partitionTests { + if len(partition.tests) != 0 && !slices.Contains(partition.tests, testType) { + continue + } + testMatrix := genMatrix(testType, testConfigs, partition.ami) + writeTestMatrixFile(testType+partition.configName, testMatrix) + } } } -func genMatrix(testType string, testConfigs []testConfig) []matrixRow { +func genMatrix(testType string, testConfigs []testConfig, ami []string) []matrixRow { openTestMatrix, err := os.Open(fmt.Sprintf("generator/resources/%v_test_matrix.json", testType)) if err != nil { @@ -267,6 +300,10 @@ func genMatrix(testType string, testConfigs []testConfig) []matrixRow { log.Panicf("can't decode map test %v to metric line struct with error %v", testConfig, err) } + if len(ami) != 0 && !slices.Contains(ami, row.Ami) { + continue + } + if testConfig.targets == nil || shouldAddTest(&row, testConfig.targets) { testMatrixComplete = append(testMatrixComplete, row) } diff --git a/localstack/docker-compose.yml b/localstack/docker-compose.yml index dd61c9670..2634bda1d 100644 --- a/localstack/docker-compose.yml +++ b/localstack/docker-compose.yml @@ -4,8 +4,8 @@ services: localstack: container_name: "${LOCALSTACK_DOCKER_NAME-localstack_main}" # @TODO use latest when this is fixed https://github.com/localstack/localstack/issues/5502 - # Use 0.12.20 since this is last version that worked for now - image: localstack/localstack:0.12.20 + # Use 0.13.0 since this is last version that worked for now + image: localstack/localstack:0.13.0 network_mode: bridge ports: - "127.0.0.1:53:53" diff --git a/localstack/ls_tmp/snakeoil.conf b/localstack/ls_tmp/snakeoil.conf index e505585d1..79f406e5c 100644 --- a/localstack/ls_tmp/snakeoil.conf +++ b/localstack/ls_tmp/snakeoil.conf @@ -15,4 +15,6 @@ subjectAltName = @alt_names [alt_names] DNS.1 = localhost DNS.2 = localhost.localstack.cloud -DNS.3 = *.us-west-2.compute.amazonaws.com \ No newline at end of file +DNS.3 = *.us-west-2.compute.amazonaws.com +DNS.4 = *.us-gov-east-1.compute.amazonaws.com +DNS.5 = *.cn-north-1.compute.amazonaws.com \ No newline at end of file diff --git a/terraform/ec2/localstack/main.tf b/terraform/ec2/localstack/main.tf index 450c96ca9..409eaa8d6 100644 --- a/terraform/ec2/localstack/main.tf +++ b/terraform/ec2/localstack/main.tf @@ -52,7 +52,7 @@ resource "aws_instance" "integration-test" { inline = [ "cloud-init status --wait", "clone the agent and start the localstack", - "git clone ${var.github_test_repo}", + "git clone --branch ${var.github_test_repo_branch} ${var.github_test_repo}", "cd amazon-cloudwatch-agent-test", "git reset --hard ${var.cwa_test_github_sha}", "echo set up ssl pem for localstack, then start localstack", @@ -67,7 +67,7 @@ resource "aws_instance" "integration-test" { ] connection { type = "ssh" - user = "ubuntu" + user = "ec2-user" private_key = local.private_key_content host = self.public_dns } @@ -83,6 +83,6 @@ data "aws_ami" "latest" { filter { name = "name" - values = ["cloudwatch-agent-integration-test-ubuntu*"] + values = ["cloudwatch-agent-integration-test-aarch64-al2023*"] } } diff --git a/terraform/ec2/localstack/variables.tf b/terraform/ec2/localstack/variables.tf index a5f8fa401..741e2779d 100644 --- 
a/terraform/ec2/localstack/variables.tf +++ b/terraform/ec2/localstack/variables.tf @@ -3,7 +3,7 @@ variable "ec2_instance_type" { type = string - default = "t3a.medium" + default = "m6g.medium" } variable "ssh_key_name" { @@ -39,4 +39,8 @@ variable "github_test_repo" { variable "s3_bucket" { type = string default = "" +} + +variable "github_test_repo_branch" { + default = "main" } \ No newline at end of file diff --git a/terraform/ec2/mac/main.tf b/terraform/ec2/mac/main.tf index 1ecdc5b83..3791b008f 100644 --- a/terraform/ec2/mac/main.tf +++ b/terraform/ec2/mac/main.tf @@ -130,6 +130,7 @@ resource "null_resource" "integration_test" { "sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c file:${module.validator.instance_agent_config}", "./validator --validator-config=${module.validator.instance_validator_config} --preparation-mode=false", "cd ~/amazon-cloudwatch-agent-test", + "echo run sanity test && sudo go test ./test/sanity -p 1 -v", "sudo go test ./test/run_as_user -p 1 -timeout 1h -computeType=EC2 -bucket=${var.s3_bucket} -cwaCommitSha=${var.cwa_github_sha} -instanceId=${aws_instance.cwagent.id} -v", ] } diff --git a/terraform/ec2/win/main.tf b/terraform/ec2/win/main.tf index 5d2eb7801..a558676ce 100644 --- a/terraform/ec2/win/main.tf +++ b/terraform/ec2/win/main.tf @@ -230,6 +230,10 @@ resource "null_resource" "integration_test_run_validator" { "powershell.exe -Command \"Start-Sleep -s 60\"", "powershell.exe -Command \"Invoke-WebRequest -Uri http://localhost:9404 -UseBasicParsing\"", "set AWS_REGION=${var.region}", + "git clone --branch ${var.github_test_repo_branch} ${var.github_test_repo}", + "cd amazon-cloudwatch-agent-test", + "go test ./test/sanity -p 1 -v", + "cd ..", "validator.exe --validator-config=${module.validator.instance_validator_config} --preparation-mode=true", var.use_ssm ? 
"powershell \"& 'C:\\Program Files\\Amazon\\AmazonCloudWatchAgent\\amazon-cloudwatch-agent-ctl.ps1' -a fetch-config -m ec2 -s -c ssm:${local.ssm_parameter_name}\"" : "powershell \"& 'C:\\Program Files\\Amazon\\AmazonCloudWatchAgent\\amazon-cloudwatch-agent-ctl.ps1' -a fetch-config -m ec2 -s -c file:${module.validator.instance_agent_config}\"", "validator.exe --validator-config=${module.validator.instance_validator_config} --preparation-mode=false" diff --git a/terraform/eks/daemon/app_signals/main.tf b/terraform/eks/daemon/app_signals/main.tf index 56aba8a37..d9a2fce9f 100644 --- a/terraform/eks/daemon/app_signals/main.tf +++ b/terraform/eks/daemon/app_signals/main.tf @@ -383,7 +383,7 @@ resource "kubernetes_daemonset" "service" { # Template Files ########################################## locals { - cwagent_config = "../../../../${var.test_dir}/resources/config.json" + cwagent_config = "../../../../${var.test_dir}/agent_configs/config.json" server_consumer = "../../../../${var.test_dir}/resources/metrics/server_consumer.json" client_producer = "../../../../${var.test_dir}/resources/metrics/client_producer.json" traces = "../../../../${var.test_dir}/resources/traces/traces.json" diff --git a/test/app_signals/resources/config.json b/test/app_signals/agent_configs/config.json similarity index 100% rename from test/app_signals/resources/config.json rename to test/app_signals/agent_configs/config.json diff --git a/test/app_signals/app_signals_test.go b/test/app_signals/app_signals_test.go index 01e9976b8..91c824800 100644 --- a/test/app_signals/app_signals_test.go +++ b/test/app_signals/app_signals_test.go @@ -45,6 +45,7 @@ func init() { var ( eksTestRunners []*test_runner.EKSTestRunner + ec2TestRunners []*test_runner.TestRunner ) func getEksTestRunners(env *environment.MetaData) []*test_runner.EKSTestRunner { @@ -53,15 +54,15 @@ func getEksTestRunners(env *environment.MetaData) []*test_runner.EKSTestRunner { eksTestRunners = []*test_runner.EKSTestRunner{ { - Runner: &AppSignalsMetricsRunner{test_runner.BaseTestRunner{DimensionFactory: factory}, AppSignalsServerConsumerTestName, "HostedIn.EKS.Cluster"}, + Runner: &AppSignalsMetricsRunner{test_runner.BaseTestRunner{DimensionFactory: factory}, AppSignalsServerConsumerTestName, "HostedIn.EKS.Cluster", env.ComputeType}, Env: *env, }, { - Runner: &AppSignalsMetricsRunner{test_runner.BaseTestRunner{DimensionFactory: factory}, AppSignalsClientProducerTestName, "HostedIn.EKS.Cluster"}, + Runner: &AppSignalsMetricsRunner{test_runner.BaseTestRunner{DimensionFactory: factory}, AppSignalsClientProducerTestName, "HostedIn.EKS.Cluster", env.ComputeType}, Env: *env, }, { - Runner: &AppSignalsTracesRunner{test_runner.BaseTestRunner{DimensionFactory: factory}, AppSignalsTracesTestName, env.EKSClusterName}, + Runner: &AppSignalsTracesRunner{test_runner.BaseTestRunner{DimensionFactory: factory}, AppSignalsTracesTestName, env.EKSClusterName, env.ComputeType}, Env: *env, }, } @@ -69,6 +70,25 @@ func getEksTestRunners(env *environment.MetaData) []*test_runner.EKSTestRunner { return eksTestRunners } +func getEc2TestRunners(env *environment.MetaData) []*test_runner.TestRunner { + if ec2TestRunners == nil { + factory := dimension.GetDimensionFactory(*env) + + ec2TestRunners = []*test_runner.TestRunner{ + { + TestRunner: &AppSignalsMetricsRunner{test_runner.BaseTestRunner{DimensionFactory: factory}, AppSignalsServerConsumerTestName, "HostedIn.Environment", env.ComputeType}, + }, + { + TestRunner: &AppSignalsMetricsRunner{test_runner.BaseTestRunner{DimensionFactory: 
factory}, AppSignalsClientProducerTestName, "HostedIn.Environment", env.ComputeType}, + }, + { + TestRunner: &AppSignalsTracesRunner{test_runner.BaseTestRunner{DimensionFactory: factory}, AppSignalsTracesTestName, "Generic", env.ComputeType}, + }, + } + } + return ec2TestRunners +} + func (suite *AppSignalsTestSuite) TestAllInSuite() { env := environment.GetEnvironmentMetaData() switch env.ComputeType { @@ -77,6 +97,11 @@ func (suite *AppSignalsTestSuite) TestAllInSuite() { for _, testRunner := range getEksTestRunners(env) { testRunner.Run(suite, env) } + case computetype.EC2: + log.Println("Environment compute type is EC2") + for _, testRunner := range getEc2TestRunners(env) { + suite.AddToSuiteResult(testRunner.Run()) + } default: return } diff --git a/test/app_signals/metrics_test.go b/test/app_signals/metrics_test.go index 4771cba10..66d08ac58 100644 --- a/test/app_signals/metrics_test.go +++ b/test/app_signals/metrics_test.go @@ -8,10 +8,14 @@ package app_signals import ( "time" + "github.com/aws/aws-sdk-go-v2/aws" + + "github.com/aws/amazon-cloudwatch-agent-test/environment/computetype" "github.com/aws/amazon-cloudwatch-agent-test/test/metric" "github.com/aws/amazon-cloudwatch-agent-test/test/metric/dimension" "github.com/aws/amazon-cloudwatch-agent-test/test/status" "github.com/aws/amazon-cloudwatch-agent-test/test/test_runner" + "github.com/aws/amazon-cloudwatch-agent-test/util/common" ) const testRetryCount = 6 @@ -21,12 +25,13 @@ type AppSignalsMetricsRunner struct { test_runner.BaseTestRunner testName string dimensionKey string + computeType computetype.ComputeType } func (t *AppSignalsMetricsRunner) Validate() status.TestGroupResult { metricsToFetch := t.GetMeasuredMetrics() testResults := make([]status.TestResult, len(metricsToFetch)) - instructions := GetInstructionsFromTestName(t.testName) + instructions := GetInstructionsFromTestName(t.testName, t.computeType) for i, metricName := range metricsToFetch { var testResult status.TestResult @@ -59,18 +64,56 @@ func (t *AppSignalsMetricsRunner) GetMeasuredMetrics() []string { } func (e *AppSignalsMetricsRunner) GetAgentConfigFileName() string { - return "" + return "config.json" +} + +func (e *AppSignalsMetricsRunner) SetupAfterAgentRun() error { + // sends metrics data only for EC2 + if e.computeType == computetype.EC2 { + common.RunCommand("pwd") + cmd := `while true; export START_TIME=$(date +%s%N); do + cat ./resources/metrics/server_consumer.json | sed -e "s/START_TIME/$START_TIME/" > server_consumer.json; + curl -H 'Content-Type: application/json' -d @server_consumer.json -i http://127.0.0.1:4316/v1/metrics --verbose; + cat ./resources/metrics/client_producer.json | sed -e "s/START_TIME/$START_TIME/" > client_producer.json; + curl -H 'Content-Type: application/json' -d @client_producer.json -i http://127.0.0.1:4316/v1/metrics --verbose; + sleep 5; done` + return common.RunAsyncCommand(cmd) + } + + return nil } -func GetInstructionsFromTestName(testName string) []dimension.Instruction { +func GetInstructionsFromTestName(testName string, computeType computetype.ComputeType) []dimension.Instruction { + var instructions []dimension.Instruction switch testName { case AppSignalsClientProducerTestName: - return metric.ClientProducerInstructions + instructions = metric.ClientProducerInstructions case AppSignalsServerConsumerTestName: - return metric.ServerConsumerInstructions + instructions = metric.ServerConsumerInstructions default: return nil } + + if computeType == computetype.EKS { + instructions = append(instructions, 
[]dimension.Instruction{ + { + Key: "HostedIn.EKS.Cluster", + Value: dimension.UnknownDimensionValue(), + }, + { + Key: "HostedIn.K8s.Namespace", + Value: dimension.ExpectedDimensionValue{Value: aws.String("default")}, + }, + }...) + } else { + //EC2 + instructions = append(instructions, dimension.Instruction{ + Key: "HostedIn.Environment", + Value: dimension.ExpectedDimensionValue{Value: aws.String("Generic")}, + }) + } + + return instructions } var _ test_runner.ITestRunner = (*AppSignalsMetricsRunner)(nil) diff --git a/test/app_signals/traces_test.go b/test/app_signals/traces_test.go index 6f7471e05..8e6acc9e2 100644 --- a/test/app_signals/traces_test.go +++ b/test/app_signals/traces_test.go @@ -9,29 +9,32 @@ import ( "fmt" "time" + "github.com/aws/amazon-cloudwatch-agent-test/environment/computetype" "github.com/aws/amazon-cloudwatch-agent-test/test/status" "github.com/aws/amazon-cloudwatch-agent-test/test/test_runner" "github.com/aws/amazon-cloudwatch-agent-test/util/awsservice" + "github.com/aws/amazon-cloudwatch-agent-test/util/common" ) const ( lookbackDuration = time.Duration(-5) * time.Minute EKSClusterAnnotation = "HostedIn_EKS_Cluster" + EC2Annotation = "HostedIn_Environment" ) var annotations = map[string]interface{}{ - "aws_remote_target": "remote-target", - "aws_remote_operation": "remote-operation", - "aws_local_service": "service-name", - "aws_remote_service": "service-name-remote", - "HostedIn_K8s_Namespace": "default", - "aws_local_operation": "replaced-operation", + "aws_remote_target": "remote-target", + "aws_remote_operation": "remote-operation", + "aws_local_service": "service-name", + "aws_remote_service": "service-name-remote", + "aws_local_operation": "replaced-operation", } type AppSignalsTracesRunner struct { test_runner.BaseTestRunner testName string - clusterName string + hostedIn string + computeType computetype.ComputeType } func (t *AppSignalsTracesRunner) Validate() status.TestGroupResult { @@ -40,7 +43,15 @@ func (t *AppSignalsTracesRunner) Validate() status.TestGroupResult { Status: status.FAILED, } timeNow := time.Now() - annotations[EKSClusterAnnotation] = t.clusterName + + // "Generic" means EC2 + if t.hostedIn == "Generic" { + annotations[EC2Annotation] = t.hostedIn + } else { + annotations[EKSClusterAnnotation] = t.hostedIn + annotations["HostedIn_K8s_Namespace"] = "default" + } + xrayFilter := awsservice.FilterExpression(annotations) traceIds, err := awsservice.GetTraceIDs(timeNow.Add(lookbackDuration), timeNow, xrayFilter) if err != nil { @@ -71,7 +82,20 @@ func (t *AppSignalsTracesRunner) GetMeasuredMetrics() []string { } func (e *AppSignalsTracesRunner) GetAgentConfigFileName() string { - return "" + return "config.json" +} + +func (e *AppSignalsTracesRunner) SetupAfterAgentRun() error { + // sends metrics data only for EC2 + if e.computeType == computetype.EC2 { + cmd := `while true; chmod +x ./resources/traceid_generator.go; export START_TIME=$(date +%s%N); export TRACE_ID=$(go run ./resources/traceid_generator.go); do + cat ./resources/traces/traces.json | sed -e "s/START_TIME/$START_TIME/" | sed -e "s/TRACE_ID/$TRACE_ID/" > traces.json; + curl -H 'Content-Type: application/json' -d @traces.json -i http://127.0.0.1:4316/v1/traces --verbose; + sleep 5; done` + return common.RunAsyncCommand(cmd) + } + + return nil } var _ test_runner.ITestRunner = (*AppSignalsTracesRunner)(nil) diff --git a/test/metric/app_signals_util.go b/test/metric/app_signals_util.go index f4deafd3b..29c4b51d2 100644 --- a/test/metric/app_signals_util.go +++ 
b/test/metric/app_signals_util.go @@ -20,14 +20,6 @@ var ( } ServerConsumerInstructions = []dimension.Instruction{ - { - Key: "HostedIn.EKS.Cluster", - Value: dimension.UnknownDimensionValue(), - }, - { - Key: "HostedIn.K8s.Namespace", - Value: dimension.ExpectedDimensionValue{Value: aws.String("default")}, - }, { Key: "Service", Value: dimension.ExpectedDimensionValue{Value: aws.String("service-name")}, @@ -39,14 +31,6 @@ var ( } ClientProducerInstructions = []dimension.Instruction{ - { - Key: "HostedIn.EKS.Cluster", - Value: dimension.UnknownDimensionValue(), - }, - { - Key: "HostedIn.K8s.Namespace", - Value: dimension.ExpectedDimensionValue{Value: aws.String("default")}, - }, { Key: "Service", Value: dimension.ExpectedDimensionValue{Value: aws.String("service-name")}, diff --git a/test/metric/metric_list_query.go b/test/metric/metric_list_query.go index 3bc9099a4..8a3804efe 100644 --- a/test/metric/metric_list_query.go +++ b/test/metric/metric_list_query.go @@ -31,18 +31,54 @@ func (n *MetricListFetcher) Fetch(namespace, metricName string, dimensions []typ listMetricInput := cloudwatch.ListMetricsInput{ Namespace: aws.String(namespace), - MetricName: aws.String(metricName), Dimensions: dims, } + if len(metricName) > 0 { + listMetricInput.MetricName = aws.String(metricName) + } log.Printf("Metric data input: namespace %v, name %v", namespace, metricName) + var metrics []types.Metric + for { + // get a complete list of metrics with given dimensions + output, err := awsservice.CwmClient.ListMetrics(context.Background(), &listMetricInput) + if err != nil { + return nil, fmt.Errorf("Error getting metric data %v", err) + } + metrics = append(metrics, output.Metrics...) + // nil or empty nextToken means there is no more data to be fetched + nextToken := output.NextToken + if nextToken == nil || *nextToken == "" { + break + } + listMetricInput.NextToken = nextToken + } + log.Printf("total number of metrics fetched: %v", len(metrics)) + return metrics, nil +} + +func (n *MetricListFetcher) FetchByDimension(namespace string, dimensions []types.Dimension) ([]types.Metric, error) { + var dims []types.DimensionFilter + for _, dim := range dimensions { + dims = append(dims, types.DimensionFilter{ + Name: dim.Name, + Value: dim.Value, + }) + } + + listMetricInput := cloudwatch.ListMetricsInput{ + Namespace: aws.String(namespace), + Dimensions: dims, + } + + log.Printf("Metric data input: namespace %v, dimensions %v", namespace, fmt.Sprint(&dims)) output, err := awsservice.CwmClient.ListMetrics(context.Background(), &listMetricInput) if err != nil { return nil, fmt.Errorf("Error getting metric data %v", err) } - log.Printf("Metrics fetched : %s", fmt.Sprint(output)) + log.Printf("Metrics fetched : %v", output.Metrics) return output.Metrics, nil } diff --git a/test/metric/stat.go b/test/metric/stat.go index 763028566..d633985d3 100644 --- a/test/metric/stat.go +++ b/test/metric/stat.go @@ -13,4 +13,6 @@ const ( MAXUMUM Statistics = "Maxmimum" SUM Statistics = "Sum" HighResolutionStatPeriod = 10 + + MinuteStatPeriod = 60 ) diff --git a/test/metric_value_benchmark/eks_daemonset_test.go b/test/metric_value_benchmark/eks_daemonset_test.go index 35e4298a7..ca01673c6 100644 --- a/test/metric_value_benchmark/eks_daemonset_test.go +++ b/test/metric_value_benchmark/eks_daemonset_test.go @@ -10,14 +10,16 @@ import ( "errors" "fmt" "log" + "math/rand" + "sort" + "strings" "time" + "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types" - "golang.org/x/exp/slices" 
"github.com/aws/amazon-cloudwatch-agent-test/environment" "github.com/aws/amazon-cloudwatch-agent-test/test/metric" - "github.com/aws/amazon-cloudwatch-agent-test/test/metric/dimension" "github.com/aws/amazon-cloudwatch-agent-test/test/metric_value_benchmark/eks_resources" "github.com/aws/amazon-cloudwatch-agent-test/test/status" "github.com/aws/amazon-cloudwatch-agent-test/test/test_runner" @@ -25,22 +27,17 @@ import ( ) const containerInsightsNamespace = "ContainerInsights" - -// list of metrics with more dimensions e.g. PodName and Namespace -var metricsWithMoreDimensions = []string{"pod_number_of_container_restarts"} +const gpuMetricIndicator = "_gpu_" type EKSDaemonTestRunner struct { test_runner.BaseTestRunner - env *environment.MetaData + testName string + env *environment.MetaData } func (e *EKSDaemonTestRunner) Validate() status.TestGroupResult { - metrics := e.GetMeasuredMetrics() - testResults := make([]status.TestResult, 0) - for _, name := range metrics { - testResults = append(testResults, e.validateInstanceMetrics(name)) - } - + var testResults []status.TestResult + testResults = append(testResults, validateMetrics(e.env, gpuMetricIndicator, eks_resources.ExpectedDimsToMetrics)...) testResults = append(testResults, e.validateLogs(e.env)) return status.TestGroupResult{ Name: e.GetTestName(), @@ -48,54 +45,143 @@ func (e *EKSDaemonTestRunner) Validate() status.TestGroupResult { } } -func (e *EKSDaemonTestRunner) validateInstanceMetrics(name string) status.TestResult { - testResult := status.TestResult{ - Name: name, - Status: status.FAILED, +const ( + dimDelimiter = "-" + ContainerInsightsNamespace = "ContainerInsights" +) + +type dimToMetrics struct { + // dim keys as string with dimDelimiter(-) eg. ClusterName-Namespace + dimStr string + // metric names to their dimensions with values. Dimension sets will be used for metric data validations + metrics map[string][][]types.Dimension +} + +func validateMetrics(env *environment.MetaData, metricFilter string, expectedDimsToMetrics map[string][]string) []status.TestResult { + var results []status.TestResult + dimsToMetrics := getMetricsInClusterDimension(env, metricFilter) + //loops through each dimension set and checks if they exit in the cluster(fails if it doesn't) + for dims, metrics := range expectedDimsToMetrics { + var actual map[string][][]types.Dimension + //looping through dtms until we find the dimension string equal to the one in the hard coded map + for _, dtm := range dimsToMetrics { + log.Printf("dtm: %s vs dims %s", dtm.dimStr, dims) //testing purposes + if dtm.dimStr == dims { + actual = dtm.metrics + break + } + } + //if there are no metrics for the dimension set, we fail the test + if len(actual) < 1 { + results = append(results, status.TestResult{ + Name: dims, + Status: status.FAILED, + }) + log.Printf("ValidateMetrics failed with missing dimension set: %s", dims) + // keep testing other dims or fail early? 
+ continue + } + // verify that the expected metric names exist for this dimension set + results = append(results, validateMetricsAvailability(dims, metrics, actual)) + for _, m := range metrics { + // picking a random dimension set to test metric data so we don't have to test every dimension set + randIdx := rand.Intn(len(actual[m])) + // verify the metric values + results = append(results, validateMetricValue(m, actual[m][randIdx])) + } } + return results +} - dims, failed := e.DimensionFactory.GetDimensions([]dimension.Instruction{ +// Fetches all metrics in the cluster and groups them by dimension set +func getMetricsInClusterDimension(env *environment.MetaData, metricFilter string) []dimToMetrics { + listFetcher := metric.MetricListFetcher{} + log.Printf("Fetching by cluster dimension") + dims := []types.Dimension{ { - Key: "ClusterName", - Value: dimension.UnknownDimensionValue(), + Name: aws.String("ClusterName"), + Value: aws.String(env.EKSClusterName), }, - }) - if len(failed) > 0 { - log.Println("failed to get dimensions") - return testResult + } + metrics, err := listFetcher.Fetch(ContainerInsightsNamespace, "", dims) + if err != nil { + log.Println("failed to fetch metric list", err) + return nil + } + if len(metrics) < 1 { + log.Println("cloudwatch metric list is empty") + return nil } - // get list of metrics that has more dimensions for container insights - // this is to avoid adding more dimension provider for non-trivial dimensions e.g. PodName - listFetcher := metric.MetricListFetcher{} - if slices.Contains(metricsWithMoreDimensions, name) { - metrics, err := listFetcher.Fetch(containerInsightsNamespace, name, dims) - if err != nil { - log.Println("failed to fetch metric list", err) - return testResult + var results []dimToMetrics + for _, m := range metrics { + // skip metrics matching the name filter (e.g. GPU metrics) + if metricFilter != "" && strings.Contains(*m.MetricName, metricFilter) { + continue } - - if len(metrics) < 1 { - log.Println("metric list is empty") - return testResult + var dims []string + for _, d := range m.Dimensions { + dims = append(dims, *d.Name) } - - // just verify 1 of returned metrics for values - for _, dim := range metrics[0].Dimensions { - // skip since it's provided by dimension provider - if *dim.Name == "ClusterName" { - continue + sort.Sort(sort.StringSlice(dims)) // sort dimension names so the joined key is deterministic
+ dimsKey := strings.Join(dims, dimDelimiter) + log.Printf("processing dims: %s", dimsKey) + + var dtm dimToMetrics + for _, ele := range results { + if ele.dimStr == dimsKey { + dtm = ele + break } + } + if dtm.dimStr == "" { + dtm = dimToMetrics{ + dimStr: dimsKey, + metrics: make(map[string][][]types.Dimension), + } + results = append(results, dtm) + } + dtm.metrics[*m.MetricName] = append(dtm.metrics[*m.MetricName], m.Dimensions) + } + return results +} - dims = append(dims, types.Dimension{ - Name: dim.Name, - Value: dim.Value, - }) +// Check if all metrics from cluster matches hard coded map +func validateMetricsAvailability(dims string, expected []string, actual map[string][][]types.Dimension) status.TestResult { + testResult := status.TestResult{ + Name: dims, + Status: status.FAILED, + } + log.Printf("expected metrics: %d, actual metrics: %d", len(expected), len(actual)) + if compareMetrics(expected, actual) { + testResult.Status = status.SUCCESSFUL + } else { + log.Printf("validateMetricsAvailability failed for %s", dims) + } + return testResult +} + +func compareMetrics(expected []string, actual map[string][][]types.Dimension) bool { + if len(expected) != len(actual) { + return false + } + + for _, key := range expected { + if _, ok := actual[key]; !ok { + return false } } + return true +} +func validateMetricValue(name string, dims []types.Dimension) status.TestResult { + log.Printf("validateMetricValue with metric: %s", name) + testResult := status.TestResult{ + Name: name, + Status: status.FAILED, + } valueFetcher := metric.MetricValueFetcher{} - values, err := valueFetcher.Fetch(containerInsightsNamespace, name, dims, metric.AVERAGE, metric.HighResolutionStatPeriod) + values, err := valueFetcher.Fetch(containerInsightsNamespace, name, dims, metric.SAMPLE_COUNT, metric.MinuteStatPeriod) if err != nil { log.Println("failed to fetch metrics", err) return testResult @@ -133,6 +219,7 @@ func (e *EKSDaemonTestRunner) validateLogs(env *environment.MetaData) status.Tes nil, &now, awsservice.AssertLogsNotEmpty(), + awsservice.AssertNoDuplicateLogs(), awsservice.AssertPerLog( awsservice.AssertLogSchema(func(message string) (string, error) { var eksClusterType awsservice.EKSClusterType diff --git a/test/metric_value_benchmark/eks_resources/util.go b/test/metric_value_benchmark/eks_resources/util.go index d4e73912a..f1a480665 100644 --- a/test/metric_value_benchmark/eks_resources/util.go +++ b/test/metric_value_benchmark/eks_resources/util.go @@ -3,7 +3,9 @@ package eks_resources -import _ "embed" +import ( + _ "embed" +) var ( //go:embed test_schemas/cluster.json @@ -52,3 +54,199 @@ var ( "PodNet": eksPodNetSchema, } ) + +// Hard coded map which lists the expected metrics in each dimension set +var ExpectedDimsToMetrics = map[string][]string{ + "ClusterName": { + "pod_number_of_containers", + "node_status_allocatable_pods", + "pod_number_of_container_restarts", + "node_status_condition_unknown", + "node_number_of_running_pods", + "pod_container_status_running", + "node_status_condition_ready", + "pod_status_running", + "node_filesystem_utilization", + "pod_container_status_terminated", + "pod_status_pending", + "pod_cpu_utilization", + "node_filesystem_inodes", + "node_diskio_io_service_bytes_total", + "node_status_condition_memory_pressure", + "container_cpu_utilization", + "service_number_of_running_pods", + "pod_memory_utilization_over_pod_limit", + "node_memory_limit", + "pod_cpu_request", + "pod_interface_network_tx_dropped", + "pod_status_succeeded", + 
"namespace_number_of_running_pods", + "pod_memory_reserved_capacity", + "node_diskio_io_serviced_total", + "pod_network_rx_bytes", + "node_status_capacity_pods", + "pod_status_unknown", + "cluster_failed_node_count", + "container_memory_utilization", + "node_memory_utilization", + "node_filesystem_inodes_free", + "container_memory_request", + "container_cpu_limit", + "node_memory_reserved_capacity", + "node_interface_network_tx_dropped", + "pod_cpu_utilization_over_pod_limit", + "container_memory_failures_total", + "pod_status_ready", + "pod_number_of_running_containers", + "cluster_node_count", + "pod_memory_request", + "node_cpu_utilization", + "cluster_number_of_running_pods", + "node_memory_working_set", + "pod_status_failed", + "node_status_condition_pid_pressure", + "pod_status_scheduled", + "node_number_of_running_containers", + "node_cpu_limit", + "node_status_condition_disk_pressure", + "pod_cpu_limit", + "pod_memory_limit", + "node_cpu_usage_total", + "pod_cpu_reserved_capacity", + "pod_network_tx_bytes", + "container_memory_limit", + "pod_memory_utilization", + "node_interface_network_rx_dropped", + "node_network_total_bytes", + "container_cpu_utilization_over_container_limit", + "pod_interface_network_rx_dropped", + "pod_container_status_waiting", + "node_cpu_reserved_capacity", + "container_memory_utilization_over_container_limit", + "container_cpu_request", + }, + "ClusterName-FullPodName-Namespace-PodName": { + "pod_network_tx_bytes", + "pod_interface_network_rx_dropped", + "pod_cpu_limit", + "pod_status_succeeded", + "pod_container_status_waiting", + "pod_number_of_running_containers", + "pod_number_of_container_restarts", + "pod_status_pending", + "pod_status_running", + "pod_container_status_running", + "pod_memory_limit", + "pod_status_unknown", + "pod_memory_utilization_over_pod_limit", + "pod_cpu_request", + "pod_status_scheduled", + "pod_memory_utilization", + "pod_status_failed", + "pod_network_rx_bytes", + "pod_number_of_containers", + "pod_cpu_utilization", + "pod_memory_reserved_capacity", + "pod_status_ready", + "pod_container_status_terminated", + "pod_interface_network_tx_dropped", + "pod_memory_request", + "pod_cpu_reserved_capacity", + "pod_cpu_utilization_over_pod_limit", + }, + "ClusterName-Namespace-PodName": { + "pod_interface_network_rx_dropped", + "pod_status_succeeded", + "pod_container_status_running", + "pod_network_rx_bytes", + "pod_cpu_utilization", + "pod_memory_utilization", + "pod_interface_network_tx_dropped", + "pod_status_ready", + "pod_container_status_terminated", + "pod_cpu_reserved_capacity", + "pod_memory_request", + "pod_status_running", + "pod_status_pending", + "pod_number_of_containers", + "pod_memory_utilization_over_pod_limit", + "pod_status_unknown", + "pod_cpu_limit", + "pod_container_status_waiting", + "pod_memory_reserved_capacity", + "pod_network_tx_bytes", + "pod_status_failed", + "pod_number_of_running_containers", + "pod_number_of_container_restarts", + "pod_cpu_request", + "pod_cpu_utilization_over_pod_limit", + "pod_status_scheduled", + "pod_memory_limit", + }, + + "ClusterName-InstanceId-NodeName": { + "node_status_allocatable_pods", + "node_network_total_bytes", + "node_status_condition_unknown", + "node_interface_network_rx_dropped", + "node_number_of_running_containers", + "node_interface_network_tx_dropped", + "node_memory_utilization", + "node_cpu_limit", + "node_status_condition_disk_pressure", + "node_memory_working_set", + "node_cpu_reserved_capacity", + "node_status_condition_ready", + 
"node_filesystem_utilization", + "node_status_condition_memory_pressure", + "node_memory_limit", + "node_memory_reserved_capacity", + "node_diskio_io_serviced_total", + "node_status_condition_pid_pressure", + "node_filesystem_inodes", + "node_cpu_usage_total", + "node_number_of_running_pods", + "node_diskio_io_service_bytes_total", + "node_status_capacity_pods", + "node_filesystem_inodes_free", + "node_cpu_utilization", + }, + + "ClusterName-Namespace-Service": { + "pod_status_unknown", + "pod_memory_limit", + "pod_container_status_terminated", + "pod_status_ready", + "pod_number_of_container_restarts", + "pod_status_pending", + "pod_status_succeeded", + "pod_network_rx_bytes", + "pod_status_failed", + "pod_number_of_containers", + "pod_cpu_request", + "service_number_of_running_pods", + "pod_memory_reserved_capacity", + "pod_network_tx_bytes", + "pod_container_status_waiting", + "pod_memory_request", + "pod_status_running", + "pod_container_status_running", + "pod_cpu_reserved_capacity", + "pod_memory_utilization_over_pod_limit", + "pod_cpu_utilization", + "pod_memory_utilization", + "pod_number_of_running_containers", + "pod_status_scheduled", + }, + "ClusterName-Namespace": { + "pod_interface_network_rx_dropped", + "pod_network_rx_bytes", + "pod_cpu_utilization_over_pod_limit", + "pod_memory_utilization_over_pod_limit", + "namespace_number_of_running_pods", + "pod_memory_utilization", + "pod_interface_network_tx_dropped", + "pod_cpu_utilization", + "pod_network_tx_bytes", + }, +} diff --git a/util/awsservice/cloudwatchmetrics.go b/util/awsservice/cloudwatchmetrics.go index 998dd7961..c2fab7cac 100644 --- a/util/awsservice/cloudwatchmetrics.go +++ b/util/awsservice/cloudwatchmetrics.go @@ -91,11 +91,12 @@ func ValidateSampleCount(metricName, namespace string, dimensions []types.Dimens } dataPoints := 0 + log.Printf("These are the data points: %v", data) + log.Printf("These are the data points: %v", data.Datapoints) for _, datapoint := range data.Datapoints { dataPoints = dataPoints + int(*datapoint.SampleCount) } - log.Printf("Number of datapoints for start time %v with endtime %v and period %d is %d is inclusive between %d and %d", startTime, endTime, periodInSeconds, dataPoints, lowerBoundInclusive, upperBoundInclusive) if lowerBoundInclusive <= dataPoints && dataPoints <= upperBoundInclusive { @@ -105,22 +106,6 @@ func ValidateSampleCount(metricName, namespace string, dimensions []types.Dimens return false } -// GetMetricData takes the metric name, metric dimension and metric namespace and return the query metrics -func GetMetricData(metricDataQueries []types.MetricDataQuery, startTime, endTime time.Time) (*cloudwatch.GetMetricDataOutput, error) { - getMetricDataInput := cloudwatch.GetMetricDataInput{ - StartTime: &startTime, - EndTime: &endTime, - MetricDataQueries: metricDataQueries, - } - - data, err := CwmClient.GetMetricData(ctx, &getMetricDataInput) - if err != nil { - return nil, err - } - - return data, nil -} - func GetMetricStatistics( metricName string, namespace string, @@ -149,6 +134,22 @@ func GetMetricStatistics( return CwmClient.GetMetricStatistics(ctx, &metricStatsInput) } +// GetMetricData takes the metric name, metric dimension and metric namespace and return the query metrics +func GetMetricData(metricDataQueries []types.MetricDataQuery, startTime, endTime time.Time) (*cloudwatch.GetMetricDataOutput, error) { + getMetricDataInput := cloudwatch.GetMetricDataInput{ + StartTime: &startTime, + EndTime: &endTime, + MetricDataQueries: metricDataQueries, + } + + data, err 
:= CwmClient.GetMetricData(ctx, &getMetricDataInput) + if err != nil { + return nil, err + } + + return data, nil +} + func BuildDimensionFilterList(appendDimension int) []types.DimensionFilter { // we append dimension from 0 to max number - 2 // then we add dimension instance id