Skip to content

Commit

Permalink
Adding EKS-Addon Gpu test (aws#411)
Browse files Browse the repository at this point in the history
  • Loading branch information
Paramadon authored Jun 13, 2024
1 parent 515b327 commit 2cd967b
Show file tree
Hide file tree
Showing 14 changed files with 330 additions and 35 deletions.
10 changes: 10 additions & 0 deletions generator/resources/eks_addon_test_matrix.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[
{
"k8s_version": "1.29",
"addon_name":"amazon-cloudwatch-observability",
"addon_version":"v1.6.0-eksbuild.1",
"ami_type": "AL2_x86_64_GPU",
"terraform_dir": "terraform/eks/addon/gpu",
"test_dir": "../../../../test/gpu"
}
]
14 changes: 10 additions & 4 deletions generator/test_case_generator.go
Original file line number Diff line number Diff line change
Expand Up @@ -179,10 +179,17 @@ var testTypeToTestConfig = map[string][]testConfig{
targets: map[string]map[string]struct{}{"metadataEnabled": {"enabled": {}}},
},
},
"eks_addon": {
{
testDir: "../../../../test/gpu",
terraformDir: "terraform/eks/addon/gpu",
},
},
"eks_daemon": {
{
testDir: "./test/metric_value_benchmark",
targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
testDir: "./test/metric_value_benchmark",
targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
instanceType: "g4dn.xlarge",
},
{
testDir: "./test/metric_value_benchmark",
Expand Down Expand Up @@ -210,8 +217,7 @@ var testTypeToTestConfig = map[string][]testConfig{
{testDir: "./test/fluent", terraformDir: "terraform/eks/daemon/fluent/windows/2022"},
{
testDir: "./test/gpu", terraformDir: "terraform/eks/daemon/gpu",
targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
instanceType: "g4dn.xlarge",
targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
},
},
"eks_deployment": {
Expand Down
29 changes: 29 additions & 0 deletions terraform/eks/addon/gpu/gpuBurner.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Single-replica Deployment that runs the gpu-burn image in the
# amazon-cloudwatch namespace and claims one NVIDIA GPU.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gpu-burn
  namespace: amazon-cloudwatch
  labels:
    app: gpu-burn
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gpu-burn
  template:
    metadata:
      labels:
        app: gpu-burn
    spec:
      containers:
        - name: main
          # No tag is given, so the registry's default tag is pulled.
          image: oguzpastirmaci/gpu-burn
          imagePullPolicy: IfNotPresent
          # Endless loop: invoke gpu_burn with argument 20, then pause 20 s.
          # NOTE(review): the argument is presumably a duration in seconds - confirm.
          command:
            - bash
            - '-c'
            - while true; do /app/gpu_burn 20; sleep 20; done
          resources:
            limits:
              # Requests exactly one GPU through the nvidia.com/gpu resource.
              nvidia.com/gpu: 1

134 changes: 134 additions & 0 deletions terraform/eks/addon/gpu/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: MIT

# Shared module; this configuration reads module.common.testing_id from it
# to build unique resource names.
module "common" {
  source = "../../../common"
}

# Shared module that exposes the role ARN, public subnet IDs and security
# group consumed by the cluster and node group below.
module "basic_components" {
  region = var.region
  source = "../../../basic_components"
}


# Authentication data for the test cluster; its token attribute is read by
# the kubernetes provider configuration.
data "aws_eks_cluster_auth" "this" {
  name = aws_eks_cluster.this.name
}

locals {
  # Cluster role ARN; "-eks-beta" is appended when var.beta is true.
  role_arn = "${module.basic_components.role_arn}${var.beta ? "-eks-beta" : ""}"

  # Prefix for AWS CLI EKS commands; adds the beta endpoint override when
  # var.beta is true.
  aws_eks = "aws eks --region ${var.region}${var.beta ? " --endpoint ${var.beta_endpoint}" : ""}"
}

# EKS control plane for the add-on GPU test. The name carries the testing_id
# from the common module.
resource "aws_eks_cluster" "this" {
  name     = "cwagent-operator-eks-integ-${module.common.testing_id}"
  version  = var.k8s_version
  role_arn = local.role_arn

  vpc_config {
    security_group_ids = [module.basic_components.security_group]
    subnet_ids         = module.basic_components.public_subnet_ids
  }

  # All five control-plane log types are enabled.
  enabled_cluster_log_types = [
    "api",
    "audit",
    "authenticator",
    "controllerManager",
    "scheduler"
  ]
}

# EKS Node Groups
# Managed node group that provides the worker nodes for the add-on GPU test
# cluster.
resource "aws_eks_node_group" "this" {
  cluster_name    = aws_eks_cluster.this.name
  node_group_name = "cwagent-operator-eks-integ-node"
  node_role_arn   = aws_iam_role.node_role.arn
  subnet_ids      = module.basic_components.public_subnet_ids

  # Fixed size: exactly two nodes, no scaling headroom.
  scaling_config {
    desired_size = 2
    max_size     = 2
    min_size     = 2
  }

  # Taken from var.ami_type (default "AL2_x86_64_GPU") rather than a literal,
  # so a caller-supplied AMI type is honoured instead of silently ignored.
  ami_type       = var.ami_type
  capacity_type  = "ON_DEMAND"
  disk_size      = 20
  instance_types = [var.instance_type]

  # The node role must have all of its policies attached before nodes are
  # created.
  depends_on = [
    aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly,
    aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy,
    aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy,
    aws_iam_role_policy_attachment.node_CloudWatchAgentServerPolicy
  ]
}

# EKS Node IAM Role
# Role assumed by the worker nodes; the name carries the testing_id from the
# common module.
resource "aws_iam_role" "node_role" {
name = "cwagent-operator-eks-Worker-Role-${module.common.testing_id}"

# Trust policy: allows the ec2.amazonaws.com service principal to call
# sts:AssumeRole on this role.
assume_role_policy = <<POLICY
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "ec2.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
POLICY
}

# AWS-managed policies attached to the node role. Each attachment keeps its
# own resource address because the node group's depends_on lists them by name.

# Worker-node policy.
resource "aws_iam_role_policy_attachment" "node_AmazonEKSWorkerNodePolicy" {
  role       = aws_iam_role.node_role.name
  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
}

# CNI policy.
resource "aws_iam_role_policy_attachment" "node_AmazonEKS_CNI_Policy" {
  role       = aws_iam_role.node_role.name
  policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"
}

# Read-only container registry policy.
resource "aws_iam_role_policy_attachment" "node_AmazonEC2ContainerRegistryReadOnly" {
  role       = aws_iam_role.node_role.name
  policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
}

# CloudWatch agent server policy.
resource "aws_iam_role_policy_attachment" "node_CloudWatchAgentServerPolicy" {
  role       = aws_iam_role.node_role.name
  policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy"
}


# Once the cluster and node group exist, updates the local kubeconfig for the
# cluster and prints the cluster list and description.
resource "null_resource" "kubectl" {
  provisioner "local-exec" {
    # local.aws_eks already carries the region and, when beta is enabled,
    # the endpoint override.
    command = <<-EOT
      ${local.aws_eks} update-kubeconfig --name ${aws_eks_cluster.this.name}
      ${local.aws_eks} list-clusters --output text
      ${local.aws_eks} describe-cluster --name ${aws_eks_cluster.this.name} --output text
    EOT
  }

  depends_on = [
    aws_eks_cluster.this,
    aws_eks_node_group.this
  ]
}

# Installs the add-on under test (name and version come from variables) on
# the cluster. It is ordered after null_resource.kubectl, which itself waits
# for the cluster and node group.
resource "aws_eks_addon" "this" {
  cluster_name  = aws_eks_cluster.this.name
  addon_name    = var.addon_name
  addon_version = var.addon_version

  depends_on = [
    null_resource.kubectl
  ]
}
# Name of the EKS cluster created by this configuration.
output "eks_cluster_name" {
  value = aws_eks_cluster.this.name
}



20 changes: 20 additions & 0 deletions terraform/eks/addon/gpu/providers.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: MIT

# AWS provider for the selected region. The EKS endpoint is overridden with
# var.beta_endpoint only when var.beta is true; null leaves it unset.
provider "aws" {
  region = var.region

  endpoints {
    eks = var.beta ? var.beta_endpoint : null
  }
}

# Kubernetes provider pointed at the test cluster's API endpoint.
provider "kubernetes" {
  host                   = aws_eks_cluster.this.endpoint
  cluster_ca_certificate = base64decode(aws_eks_cluster.this.certificate_authority[0].data)
  token                  = data.aws_eks_cluster_auth.this.token

  # Exec credential plugin configured alongside the static token above.
  # NOTE(review): both a token and an exec block are set - confirm which one
  # the provider actually uses.
  exec {
    api_version = "client.authentication.k8s.io/v1beta1"
    command     = "aws"
    args        = ["eks", "get-token", "--cluster-name", aws_eks_cluster.this.name]
  }
}
47 changes: 47 additions & 0 deletions terraform/eks/addon/gpu/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: MIT

# AWS region, read by the aws provider and by the CLI command prefix.
variable "region" {
  type    = string
  default = "us-west-2"
}

# Relative path of the Go test package for this scenario.
variable "test_dir" {
  type    = string
  default = "../../../../test/gpu"
}

# Name of the EKS add-on to install.
variable "addon_name" {
  type    = string
  default = "amazon-cloudwatch-observability"
}

# Version of the EKS add-on to install.
variable "addon_version" {
  type    = string
  default = "v1.6.0-eksbuild.1"
}

# Kubernetes version of the EKS cluster.
variable "k8s_version" {
  type    = string
  default = "1.29"
}

# AMI type for the worker nodes.
variable "ami_type" {
  type    = string
  default = "AL2_x86_64_GPU"
}

# EC2 instance type for the worker nodes.
variable "instance_type" {
  type    = string
  default = "g4dn.xlarge"
}

# When true, the beta EKS endpoint and the "-eks-beta" role suffix are used.
variable "beta" {
  type    = bool
  default = true
}

# EKS endpoint used when beta is true.
variable "beta_endpoint" {
  type    = string
  default = "https://api.beta.us-west-2.wesley.amazonaws.com"
}
13 changes: 9 additions & 4 deletions terraform/eks/daemon/gpu/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -701,19 +701,24 @@ resource "kubernetes_cluster_role_binding" "rolebinding" {
namespace = "amazon-cloudwatch"
}
}

# Runs the Go integration test against the cluster once the node group, the
# agent daemonset, its service account and the cluster role binding exist.
resource "null_resource" "validator" {
  depends_on = [
    aws_eks_node_group.this,
    kubernetes_daemonset.service,
    kubernetes_cluster_role_binding.rolebinding,
    kubernetes_service_account.cwagentservice,
  ]

  provisioner "local-exec" {
    # Retry loop: up to 10 attempts, 60 s apart. The first passing run exits 0;
    # if every attempt fails the provisioner exits 1. The test is invoked only
    # inside the loop, so no run has its result discarded.
    command = <<-EOT
      echo "Validating EKS metrics/logs for EMF"
      cd ../../../..
      i=0
      while [ $i -lt 10 ]; do
        i=$((i+1))
        go test ${var.test_dir} -eksClusterName=${aws_eks_cluster.this.name} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia && exit 0
        sleep 60
      done
      exit 1
    EOT
  }
}
Loading

0 comments on commit 2cd967b

Please sign in to comment.