From 618d981fef0cd86212232bb5543cb55cfee16941 Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim <884273+movence@users.noreply.github.com> Date: Tue, 3 Dec 2024 12:10:09 -0500 Subject: [PATCH 1/3] Fix GPU E2E integ test (#437) --- .../resources/eks_addon_test_matrix.json | 9 +-- .../resources/eks_daemon_test_matrix.json | 4 +- .../resources/eks_deployment_test_matrix.json | 2 +- generator/test_case_generator.go | 2 +- terraform/eks/addon/gpu/main.tf | 17 +++-- terraform/eks/addon/gpu/variables.tf | 4 +- test/gpu/nvidia_test.go | 63 ++++--------------- test/metric/container_insights_util.go | 3 + 8 files changed, 33 insertions(+), 71 deletions(-) diff --git a/generator/resources/eks_addon_test_matrix.json b/generator/resources/eks_addon_test_matrix.json index 6a84253ea..d41508770 100644 --- a/generator/resources/eks_addon_test_matrix.json +++ b/generator/resources/eks_addon_test_matrix.json @@ -1,10 +1,11 @@ [ { - "k8s_version": "1.29", + "k8sVersion": "1.31", "addon_name":"amazon-cloudwatch-observability", - "addon_version":"v1.6.0-eksbuild.1", - "ami_type": "AL2_x86_64_GPU", + "addon_version":"v2.5.0-eksbuild.1", + "ami": "AL2_x86_64_GPU", "terraform_dir": "terraform/eks/addon/gpu", - "test_dir": "../../../../test/gpu" + "test_dir": "../../../../test/gpu", + "instanceType":"g4dn.xlarge" } ] \ No newline at end of file diff --git a/generator/resources/eks_daemon_test_matrix.json b/generator/resources/eks_daemon_test_matrix.json index 82bb2c6bb..245c75b16 100644 --- a/generator/resources/eks_daemon_test_matrix.json +++ b/generator/resources/eks_daemon_test_matrix.json @@ -1,12 +1,12 @@ [ { - "k8s_version": "1.24", + "k8sVersion": "1.31", "ami": "AL2_x86_64", "instanceType":"t3.medium", "arc": "amd64" }, { - "k8s_version": "1.24", + "k8sVersion": "1.31", "ami": "AL2_ARM_64", "instanceType":"m6g.large", "arc": "arm64" diff --git a/generator/resources/eks_deployment_test_matrix.json b/generator/resources/eks_deployment_test_matrix.json index 603357f44..1b250f5c7 100644 --- a/generator/resources/eks_deployment_test_matrix.json +++ b/generator/resources/eks_deployment_test_matrix.json @@ -1,5 +1,5 @@ [ { - "k8sVersion": "1.24" + "k8sVersion": "1.31" } ] \ No newline at end of file diff --git a/generator/test_case_generator.go b/generator/test_case_generator.go index a811e7c5c..11490e054 100644 --- a/generator/test_case_generator.go +++ b/generator/test_case_generator.go @@ -28,7 +28,7 @@ type matrixRow struct { AgentStartCommand string `json:"agentStartCommand"` CaCertPath string `json:"caCertPath"` ValuesPerMinute int `json:"values_per_minute"` // Number of metrics to be sent or number of log lines to write - K8sVersion string `json:"k8s_version"` + K8sVersion string `json:"k8sVersion"` TerraformDir string `json:"terraform_dir"` UseSSM bool `json:"useSSM"` ExcludedTests string `json:"excludedTests"` diff --git a/terraform/eks/addon/gpu/main.tf b/terraform/eks/addon/gpu/main.tf index e63d456b3..d56713d7b 100644 --- a/terraform/eks/addon/gpu/main.tf +++ b/terraform/eks/addon/gpu/main.tf @@ -21,7 +21,7 @@ locals { } resource "aws_eks_cluster" "this" { - name = "cwagent-operator-eks-integ-${module.common.testing_id}" + name = "cwagent-addon-eks-integ-${module.common.testing_id}" role_arn = local.role_arn version = var.k8s_version enabled_cluster_log_types = [ @@ -40,17 +40,17 @@ resource "aws_eks_cluster" "this" { # EKS Node Groups resource "aws_eks_node_group" "this" { cluster_name = aws_eks_cluster.this.name - node_group_name = "cwagent-operator-eks-integ-node" + node_group_name = "cwagent-addon-eks-integ-node" node_role_arn = aws_iam_role.node_role.arn subnet_ids = module.basic_components.public_subnet_ids scaling_config { - desired_size = 2 - max_size = 2 - min_size = 2 + desired_size = 1 + max_size = 1 + min_size = 1 } - ami_type = "AL2_x86_64_GPU" + ami_type = var.ami_type capacity_type = "ON_DEMAND" disk_size = 20 instance_types = [var.instance_type] @@ -65,7 +65,7 @@ resource "aws_eks_node_group" "this" { # EKS Node IAM Role resource "aws_iam_role" "node_role" { - name = "cwagent-operator-eks-Worker-Role-${module.common.testing_id}" + name = "cwagent-addon-eks-Worker-Role-${module.common.testing_id}" assume_role_policy = < Date: Wed, 4 Dec 2024 00:10:53 +0530 Subject: [PATCH 2/3] Updated thresholds for Stress Test (#439) --- .../validators/stress/stress_validator.go | 74 +++++++++---------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/validator/validators/stress/stress_validator.go b/validator/validators/stress/stress_validator.go index 440e821fb..bf3956929 100644 --- a/validator/validators/stress/stress_validator.go +++ b/validator/validators/stress/stress_validator.go @@ -31,9 +31,9 @@ var ( "1000": { "statsd": { "procstat_cpu_usage": float64(25), - "procstat_memory_rss": float64(82000000), + "procstat_memory_rss": float64(110000000), "procstat_memory_swap": float64(0), - "procstat_memory_vms": float64(818000000), + "procstat_memory_vms": float64(1200000000), "procstat_memory_data": float64(83000000), "procstat_num_fds": float64(11), "net_bytes_sent": float64(105000), @@ -41,9 +41,9 @@ var ( }, "collectd": { "procstat_cpu_usage": float64(20), - "procstat_memory_rss": float64(80000000), + "procstat_memory_rss": float64(110000000), "procstat_memory_swap": float64(0), - "procstat_memory_vms": float64(818000000), + "procstat_memory_vms": float64(1200000000), "procstat_memory_data": float64(82000000), "procstat_num_fds": float64(11), "net_bytes_sent": float64(102000), @@ -51,19 +51,19 @@ var ( }, "logs": { "procstat_cpu_usage": float64(250), - "procstat_memory_rss": float64(220000000), + "procstat_memory_rss": float64(300000000), "procstat_memory_swap": float64(0), - "procstat_memory_vms": float64(888000000), + "procstat_memory_vms": float64(1200000000), "procstat_memory_data": float64(260000000), "procstat_num_fds": float64(110), "net_bytes_sent": float64(1800000), "net_packets_sent": float64(5000), }, "system": { - "procstat_cpu_usage": float64(15), - "procstat_memory_rss": float64(80000000), + "procstat_cpu_usage": float64(16), + "procstat_memory_rss": float64(110000000), "procstat_memory_swap": float64(0), - "procstat_memory_vms": float64(818000000), + "procstat_memory_vms": float64(1200000000), "procstat_memory_data": float64(75000000), "procstat_num_fds": float64(12), "net_bytes_sent": float64(90000), @@ -71,9 +71,9 @@ var ( }, "emf": { "procstat_cpu_usage": float64(15), - "procstat_memory_rss": float64(80000000), + "procstat_memory_rss": float64(110000000), "procstat_memory_swap": float64(0), - "procstat_memory_vms": float64(818000000), + "procstat_memory_vms": float64(1200000000), "procstat_memory_data": float64(75000000), "procstat_num_fds": float64(12), "net_bytes_sent": float64(90000), @@ -83,9 +83,9 @@ var ( "5000": { "statsd": { "procstat_cpu_usage": float64(100), - "procstat_memory_rss": float64(130000000), + "procstat_memory_rss": float64(150000000), "procstat_memory_swap": float64(0), - "procstat_memory_vms": float64(888000000), + "procstat_memory_vms": float64(1200000000), "procstat_memory_data": float64(145000000), "procstat_num_fds": float64(15), "net_bytes_sent": float64(524000), @@ -95,7 +95,7 @@ var ( "procstat_cpu_usage": float64(90), "procstat_memory_rss": float64(120000000), "procstat_memory_swap": float64(0), - "procstat_memory_vms": float64(888000000), + "procstat_memory_vms": float64(1200000000), "procstat_memory_data": float64(135000000), "procstat_num_fds": float64(17), "net_bytes_sent": float64(490000), @@ -105,7 +105,7 @@ var ( "procstat_cpu_usage": float64(400), "procstat_memory_rss": float64(540000000), "procstat_memory_swap": float64(0), - "procstat_memory_vms": float64(1100000000), + "procstat_memory_vms": float64(1200000000), "procstat_memory_data": float64(540000000), "procstat_num_fds": float64(180), "net_bytes_sent": float64(6500000), @@ -113,9 +113,9 @@ var ( }, "system": { "procstat_cpu_usage": float64(15), - "procstat_memory_rss": float64(80000000), + "procstat_memory_rss": float64(110000000), "procstat_memory_swap": float64(0), - "procstat_memory_vms": float64(818000000), + "procstat_memory_vms": float64(1200000000), "procstat_memory_data": float64(75000000), "procstat_num_fds": float64(12), "net_bytes_sent": float64(90000), @@ -123,9 +123,9 @@ var ( }, "emf": { "procstat_cpu_usage": float64(25), - "procstat_memory_rss": float64(80000000), + "procstat_memory_rss": float64(110000000), "procstat_memory_swap": float64(0), - "procstat_memory_vms": float64(818000000), + "procstat_memory_vms": float64(1200000000), "procstat_memory_data": float64(79000000), "procstat_num_fds": float64(12), "net_bytes_sent": float64(90000), @@ -135,10 +135,10 @@ var ( "10000": { "statsd": { "procstat_cpu_usage": float64(150), - "procstat_memory_rss": float64(160000000), + "procstat_memory_rss": float64(180000000), "procstat_memory_swap": float64(0), - "procstat_memory_vms": float64(888000000), - "procstat_memory_data": float64(177000000), + "procstat_memory_vms": float64(1200000000), + "procstat_memory_data": float64(200000000), "procstat_num_fds": float64(17), "net_bytes_sent": float64(980000), "net_packets_sent": float64(860), @@ -147,7 +147,7 @@ var ( "procstat_cpu_usage": float64(120), "procstat_memory_rss": float64(130000000), "procstat_memory_swap": float64(0), - "procstat_memory_vms": float64(888000000), + "procstat_memory_vms": float64(1200000000), "procstat_memory_data": float64(150000000), "procstat_num_fds": float64(17), "net_bytes_sent": float64(760000), @@ -165,22 +165,22 @@ var ( }, "system": { "procstat_cpu_usage": float64(15), - "procstat_memory_rss": float64(80000000), + "procstat_memory_rss": float64(110000000), "procstat_memory_swap": float64(0), - "procstat_memory_vms": float64(818000000), + "procstat_memory_vms": float64(1200000000), "procstat_memory_data": float64(75000000), "procstat_num_fds": float64(12), "net_bytes_sent": float64(90000), - "net_packets_sent": float64(100), + "net_packets_sent": float64(120), }, "emf": { "procstat_cpu_usage": float64(45), - "procstat_memory_rss": float64(88000000), + "procstat_memory_rss": float64(110000000), "procstat_memory_swap": float64(0), - "procstat_memory_vms": float64(818000000), + "procstat_memory_vms": float64(1200000000), "procstat_memory_data": float64(88000000), "procstat_num_fds": float64(12), - "net_bytes_sent": float64(90000), + "net_bytes_sent": float64(110000), "net_packets_sent": float64(120), }, }, @@ -192,10 +192,10 @@ var ( "50000": { "statsd": { "procstat_cpu_usage": float64(250), - "procstat_memory_rss": float64(300000000), + "procstat_memory_rss": float64(400000000), "procstat_memory_swap": float64(0), - "procstat_memory_vms": float64(1000000000), - "procstat_memory_data": float64(440000000), + "procstat_memory_vms": float64(2000000000), + "procstat_memory_data": float64(600000000), "procstat_num_fds": float64(18), "net_bytes_sent": float64(1700000), "net_packets_sent": float64(1400), @@ -204,7 +204,7 @@ var ( "procstat_cpu_usage": float64(220), "procstat_memory_rss": float64(218000000), "procstat_memory_swap": float64(0), - "procstat_memory_vms": float64(980000000), + "procstat_memory_vms": float64(1300000000), "procstat_memory_data": float64(240000000), "procstat_num_fds": float64(18), "net_bytes_sent": float64(1250000), @@ -222,9 +222,9 @@ var ( }, "system": { "procstat_cpu_usage": float64(15), - "procstat_memory_rss": float64(80000000), + "procstat_memory_rss": float64(110000000), "procstat_memory_swap": float64(0), - "procstat_memory_vms": float64(818000000), + "procstat_memory_vms": float64(1200000000), "procstat_memory_data": float64(75000000), "procstat_num_fds": float64(12), "net_bytes_sent": float64(90000), @@ -232,9 +232,9 @@ var ( }, "emf": { "procstat_cpu_usage": float64(165), - "procstat_memory_rss": float64(120000000), + "procstat_memory_rss": float64(110000000), "procstat_memory_swap": float64(0), - "procstat_memory_vms": float64(818000000), + "procstat_memory_vms": float64(1200000000), "procstat_memory_data": float64(110000000), "procstat_num_fds": float64(12), "net_bytes_sent": float64(280000), From 070c37d45c0da59f60961762c5e263b0740b51f3 Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim <884273+movence@users.noreply.github.com> Date: Tue, 3 Dec 2024 13:43:50 -0500 Subject: [PATCH 3/3] update list of metrics used for metric benchmark validations (#440) --- test/metric_value_benchmark/eks_resources/util.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/metric_value_benchmark/eks_resources/util.go b/test/metric_value_benchmark/eks_resources/util.go index 3e996a60e..705393c78 100644 --- a/test/metric_value_benchmark/eks_resources/util.go +++ b/test/metric_value_benchmark/eks_resources/util.go @@ -171,6 +171,7 @@ func GetExpectedDimsToMetrics(env *environment.MetaData) map[string][]string { "container_cpu_request", "pod_cpu_usage_total", "pod_memory_working_set", + "pod_container_status_waiting_reason_crash_loop_back_off", }, "ClusterName-FullPodName-Namespace-PodName": { "pod_network_tx_bytes", @@ -202,6 +203,7 @@ func GetExpectedDimsToMetrics(env *environment.MetaData) map[string][]string { "pod_cpu_utilization_over_pod_limit", "pod_cpu_usage_total", "pod_memory_working_set", + "pod_container_status_waiting_reason_crash_loop_back_off", }, "ClusterName-Namespace-PodName": { "pod_interface_network_rx_dropped", @@ -233,6 +235,7 @@ func GetExpectedDimsToMetrics(env *environment.MetaData) map[string][]string { "pod_memory_limit", "pod_cpu_usage_total", "pod_memory_working_set", + "pod_container_status_waiting_reason_crash_loop_back_off", }, "ClusterName-InstanceId-NodeName": { @@ -301,8 +304,6 @@ func GetExpectedDimsToMetrics(env *environment.MetaData) map[string][]string { "pod_interface_network_tx_dropped", "pod_cpu_utilization", "pod_network_tx_bytes", - "pod_cpu_usage_total", - "pod_memory_working_set", }, }