diff --git a/examples/eks/eks_cluster_gitops/README.md b/examples/eks/eks_cluster_gitops/README.md
index 71a25368..7a695687 100644
--- a/examples/eks/eks_cluster_gitops/README.md
+++ b/examples/eks/eks_cluster_gitops/README.md
@@ -1,33 +1,153 @@
 ## EKS and CAST AI example for GitOps onboarding flow
-Following example shows how to onboard EKS cluster to CAST AI using GitOps flow.
-In GitOps flow CAST AI Node Configuration, Node Templates and Autoscaler policies are managed using Terraform, but all Castware components such as `castai-agent`, `castai-cluster-controller`, `castai-evictor`, `castai-spot-handler`, `castai-kvisor` are to be installed using other means (e.g ArgoCD, manual Helm releases, etc.)
+## GitOps flow
+
+Terraform managed ==> IAM roles, CAST AI Node Configuration, CAST AI Node Templates and CAST AI Autoscaler policies
+
+Helm managed ==> all Castware components, such as `castai-agent`, `castai-cluster-controller`, `castai-evictor`, `castai-spot-handler`, `castai-kvisor`, `castai-workload-autoscaler`, `castai-pod-pinner` and `castai-egressd`, are installed by other means (e.g. ArgoCD, manual Helm releases, etc.)
+
+```
++-----------------------+
+|         Start         |
++-----------------------+
+            |
+            | AWS CLI
+            v
++------------------------------------------------+
+| 1. Check the EKS cluster authentication mode   |
+|    (is it API or API_AND_CONFIG_MAP?)          |
++------------------------------------------------+
+         | YES                        | NO
+         v                            v
++------------------+    +---------------------------+
+| No action needed |    | 2. Add the CAST AI role   |
+| from the user    |    |    to aws-auth ConfigMap  |
++------------------+    +---------------------------+
+         |                            |
+         +-----+----------------------+
+               |
+               | TERRAFORM
+               v
++-----------------------------+
+| 3. Update tf.vars           |
+| 4. terraform init & apply   |
++-----------------------------+
+               |
+               | GITOPS
+               v
++--------------------------------------------------+
+| 5. Deploy the Helm charts: castai-agent,         |
+|    castai-cluster-controller, castai-evictor,    |
+|    castai-spot-handler, castai-kvisor,           |
+|    castai-workload-autoscaler, castai-pod-pinner |
++--------------------------------------------------+
+            |
+            v
++-----------------------+
+|          End          |
++-----------------------+
+```
-Steps to take to successfully onboard EKS cluster to CAST AI using GitOps flow:
 Prerequisites:
 - CAST AI account
 - Obtained CAST AI [API Access key](https://docs.cast.ai/docs/authentication#obtaining-api-access-key) with Full Access
-1. Configure `tf.vars.example` file with required values. If EKS cluster is already managed by Terraform you could instead directly reference those resources.
-2. Run `terraform init`
-3. Run `terraform apply` and make a note of `cluster_id` and `cluster_token` output values. At this stage you would see that your cluster is in `Connecting` state in CAST AI console
-4. Install CAST AI components using Helm. Use `cluster_id` and `cluster_token` values to configure Helm releases:
-- Set `castai.apiKey` property to `cluster_token` for following CAST AI components: `castai-cluster-controller`, `castai-kvisor`.
-- Set `additionalEnv.STATIC_CLUSTER_ID` property to `cluster_id` and `apiKey` property to `cluster_token` for `castai-agent`.
-- Set `castai.clusterID` property to for `castai-cluster-controller`, `castai-spot-handler`, `castai-kvisor`
-Example Helm install command:
-
-```bash
-helm install castai-cluster-controller cluster-controller --namespace=castai-agent --set castai.apiKey=,provider=eks,castai.clusterID=,createNamespace=false,apiURL="https://api.cast.ai"
+
+### Step 1: Get EKS cluster authentication mode
+```bash
+CLUSTER_NAME=""
+REGION=""
+current_auth_mode=$(aws eks describe-cluster --name $CLUSTER_NAME --region $REGION | grep authenticationMode | awk '{print $2}')
+echo "Authentication mode is $current_auth_mode"
 ```
-5. Update [aws-auth](https://docs.aws.amazon.com/eks/latest/userguide/add-user-role.html) configmap with instance profile used by CAST AI. This instance profile is used by CAST AI managed nodes to communicate with EKS control plane. Example of entry can be found [here](https://github.com/castai/terraform-provider-castai/blob/157babd57b0977f499eb162e9bee27bee51d292a/examples/eks/eks_cluster_assumerole/eks.tf#L28-L38).
+
+### Step 2: Add the CAST AI role to the aws-auth ConfigMap (skip this step if the authentication mode is API or API_AND_CONFIG_MAP)
+The ConfigMap may already contain other entries, so append the following role mapping to it:
+```yaml
+apiVersion: v1
+data:
+  mapRoles: |
+    - rolearn: arn:aws:iam::028075177508:role/castai-eks-
+      username: system:node:{{EC2PrivateDNSName}}
+      groups:
+      - system:bootstrappers
+      - system:nodes
+kind: ConfigMap
+metadata:
+  name: aws-auth
+  namespace: kube-system
+```
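+
+One way to apply this mapping is to edit the ConfigMap directly with `kubectl` (a minimal sketch; it assumes your kubeconfig already points at this cluster):
+```bash
+# Review the current aws-auth ConfigMap first.
+kubectl get configmap aws-auth -n kube-system -o yaml
+
+# Open it in an editor and append the role mapping shown above under mapRoles.
+kubectl edit configmap aws-auth -n kube-system
+```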
+
+
+### Step 3 & 4: Update tf.vars, then run terraform init, plan and apply
+After a successful apply, the cluster shows up in the `Connecting` state in the CAST AI console. \
+Note the generated cluster ID from the Terraform outputs; it is used as `CASTAI_CLUSTER_ID` in step 5.
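+
+A minimal sketch of the Terraform workflow for steps 3 and 4 (it assumes the variable values from `tf.vars.example` were saved as `tf.vars`):
+```bash
+terraform init
+terraform plan -var-file=tf.vars
+terraform apply -var-file=tf.vars
+
+# Read the generated cluster ID from the Terraform outputs.
+terraform output cluster_id
+```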
+
+
+### Step 5: Deploy the Helm charts of the CAST AI components
+Components: `castai-agent`, `castai-cluster-controller`, `castai-evictor`, `castai-spot-handler`, `castai-kvisor`, `castai-workload-autoscaler`, `castai-pod-pinner` \
+After all CAST AI components are installed in the cluster, its status in the CAST AI console changes from `Connecting` to `Connected`, which means that the cluster onboarding process completed successfully.
+
+```bash
+CASTAI_API_KEY=""
+CASTAI_CLUSTER_ID=""
+CAST_CONFIG_SOURCE="castai-cluster-controller"
+
+#### Add the CAST AI Helm chart repository, if it is not present yet
+helm repo add castai-helm https://castai.github.io/helm-charts
+helm repo update
+
+#### Mandatory component: castai-agent
+helm upgrade -i castai-agent castai-helm/castai-agent -n castai-agent \
+  --set apiKey=$CASTAI_API_KEY \
+  --set provider=eks \
+  --create-namespace
+
+#### Mandatory component: castai-cluster-controller
+helm upgrade -i cluster-controller castai-helm/castai-cluster-controller -n castai-agent \
+  --set castai.apiKey=$CASTAI_API_KEY \
+  --set castai.clusterID=$CASTAI_CLUSTER_ID \
+  --set autoscaling.enabled=true
+
+#### castai-spot-handler
+helm upgrade -i castai-spot-handler castai-helm/castai-spot-handler -n castai-agent \
+  --set castai.clusterID=$CASTAI_CLUSTER_ID \
+  --set castai.provider=aws
+
+#### castai-evictor
+helm upgrade -i castai-evictor castai-helm/castai-evictor -n castai-agent --set replicaCount=0
+
+#### castai-pod-pinner
+helm upgrade -i castai-pod-pinner castai-helm/castai-pod-pinner -n castai-agent \
+  --set castai.apiKey=$CASTAI_API_KEY \
+  --set castai.clusterID=$CASTAI_CLUSTER_ID \
+  --set replicaCount=0
+
+#### castai-workload-autoscaler
+helm upgrade -i castai-workload-autoscaler castai-helm/castai-workload-autoscaler -n castai-agent \
+  --set castai.apiKeySecretRef=$CAST_CONFIG_SOURCE \
+  --set castai.configMapRef=$CAST_CONFIG_SOURCE
+
+#### castai-kvisor
+helm upgrade -i castai-kvisor castai-helm/castai-kvisor -n castai-agent \
+  --set castai.apiKey=$CASTAI_API_KEY \
+  --set castai.clusterID=$CASTAI_CLUSTER_ID \
+  --set controller.extraArgs.kube-linter-enabled=true \
+  --set controller.extraArgs.image-scan-enabled=true \
+  --set controller.extraArgs.kube-bench-enabled=true \
+  --set controller.extraArgs.kube-bench-cloud-provider=eks
+```
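+
+Once the Helm releases are installed, a quick way to check that the components are running (assuming `kubectl` points at the cluster):
+```bash
+kubectl get pods -n castai-agent
+```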
+
+## Steps Overview
+
+1. If the EKS authentication mode is not API or API_AND_CONFIG_MAP, update the [aws-auth](https://docs.aws.amazon.com/eks/latest/userguide/add-user-role.html) ConfigMap with the instance profile role used by CAST AI. This instance profile is used by CAST AI managed nodes to communicate with the EKS control plane. An example entry can be found [here](https://github.com/castai/terraform-provider-castai/blob/157babd57b0977f499eb162e9bee27bee51d292a/examples/eks/eks_cluster_assumerole/eks.tf#L28-L38).
+2. Configure the `tf.vars.example` file with the required values. If the EKS cluster is already managed by Terraform, you could instead directly reference those resources.
+3. Run `terraform init`
+4. Run `terraform apply` and make a note of the `cluster_id` output value. At this stage the cluster is shown in the `Connecting` state in the CAST AI console.
+5. Install the CAST AI components using Helm. Use the `cluster_id` and `api_key` values to configure the Helm releases:
+- Set the `castai.apiKey` property to `api_key`
+- Set the `castai.clusterID` property to `cluster_id`
 6. After all CAST AI components are installed in the cluster its status in CAST AI console would change from `Connecting` to `Connected` which means that cluster onboarding process completed successfully.
 
 ## Importing already onboarded cluster to Terraform
 
-This example can also be used to import EKS cluster to Terraform which is already onboarded to CAST AI console trough [script](https://docs.cast.ai/docs/cluster-onboarding#how-it-works).
+This example can also be used to import an EKS cluster to Terraform which is already onboarded to the CAST AI console through the [onboarding script](https://docs.cast.ai/docs/cluster-onboarding#how-it-works).
 For importing existing cluster follow steps 1-3 above and change `castai_node_configuration.default` Node Configuration name.
-This would allow to manage already onboarded clusters' CAST AI Node Configurations and Node Templates through IaC.
+This allows managing already onboarded clusters' CAST AI Node Configurations and Node Templates through IaC.
\ No newline at end of file
diff --git a/examples/eks/eks_cluster_gitops/castai.tf b/examples/eks/eks_cluster_gitops/castai.tf
index ec96bbac..57d100f1 100644
--- a/examples/eks/eks_cluster_gitops/castai.tf
+++ b/examples/eks/eks_cluster_gitops/castai.tf
@@ -1,12 +1,69 @@
-resource "castai_eks_cluster" "my_castai_cluster" {
-  account_id = var.aws_account_id
-  region     = var.aws_cluster_region
-  name       = var.aws_cluster_name
+# Create IAM resources required for connecting the cluster to CAST AI.
+locals {
+  resource_name_postfix = var.aws_cluster_name
+  account_id            = data.aws_caller_identity.current.account_id
+  partition             = data.aws_partition.current.partition
+
+  instance_profile_role_name = "castai-eks-${local.resource_name_postfix}-node-role"
+  iam_role_name              = "castai-eks-${local.resource_name_postfix}-cluster-role"
+  iam_inline_policy_name     = "CastEKSRestrictedAccess"
+  role_name                  = "castai-eks-role"
+}
+
+data "aws_caller_identity" "current" {}
+
+data "aws_partition" "current" {}
+
+data "aws_eks_cluster" "existing_cluster" {
+  name = var.aws_cluster_name
+}
+
+# Resolve the CAST AI cluster ID for the EKS cluster.
+resource "castai_eks_clusterid" "cluster_id" {
+  account_id   = data.aws_caller_identity.current.account_id
+  region       = var.aws_cluster_region
+  cluster_name = var.aws_cluster_name
+}
+
+resource "castai_eks_user_arn" "castai_user_arn" {
+  cluster_id = castai_eks_clusterid.cluster_id.id
+}
+
+module "castai-eks-role-iam" {
+  source = "castai/eks-role-iam/castai"
+
+  aws_account_id     = data.aws_caller_identity.current.account_id
+  aws_cluster_region = var.aws_cluster_region
+  aws_cluster_name   = var.aws_cluster_name
+  aws_cluster_vpc_id = var.vpc_id
+
+  castai_user_arn = castai_eks_user_arn.castai_user_arn.arn
+
+  create_iam_resources_per_cluster = true
+}
+
+# Create an access entry if the EKS authentication mode is API or API_AND_CONFIG_MAP.
+locals {
+  access_entry = can(regex("API", data.aws_eks_cluster.existing_cluster.access_config[0].authentication_mode))
+}
+
+resource "aws_eks_access_entry" "access_entry" {
+  count         = local.access_entry ? 1 : 0
+  cluster_name  = local.resource_name_postfix
+  principal_arn = module.castai-eks-role-iam.instance_profile_role_arn
+  type          = "EC2_LINUX"
+}
+
+# Connect the EKS cluster to CAST AI.
+resource "castai_eks_cluster" "my_castai_cluster" {
+  account_id = var.aws_account_id
+  region     = var.aws_cluster_region
+  name       = local.resource_name_postfix
 
   delete_nodes_on_disconnect = var.delete_nodes_on_disconnect
-  assume_role_arn            = var.aws_assume_role_arn
+  assume_role_arn            = module.castai-eks-role-iam.role_arn
 }
 
+# Create the node configuration.
 resource "castai_node_configuration" "default" {
   cluster_id = castai_eks_cluster.my_castai_cluster.id
   name       = "default"
@@ -14,12 +71,121 @@ resource "castai_node_configuration" "default" {
   min_disk_size  = 100
   subnets        = var.subnets
   eks {
-    security_groups      = var.security_groups
-    instance_profile_arn = var.instance_profile
+    security_groups = [
+      var.cluster_security_group_id,
+      var.node_security_group_id
+    ]
+    instance_profile_arn = module.castai-eks-role-iam.instance_profile_arn
   }
 }
 
+# Promote the node configuration to be the default one.
 resource "castai_node_configuration_default" "this" {
   cluster_id       = castai_eks_cluster.my_castai_cluster.id
   configuration_id = castai_node_configuration.default.id
-}
\ No newline at end of file
+}
+
+resource "castai_node_template" "default_by_castai" {
+  cluster_id = castai_eks_cluster.my_castai_cluster.id
+
+  name             = "default-by-castai"
+  is_default       = true
+  is_enabled       = true
+  configuration_id = castai_node_configuration.default.id
+  should_taint     = false
+
+  constraints {
+    on_demand = true
+  }
+}
+
+resource "castai_node_template" "example_spot_template" {
+  cluster_id = castai_eks_cluster.my_castai_cluster.id
+
+  name             = "example_spot_template"
+  is_default       = false
+  is_enabled       = true
+  configuration_id = castai_node_configuration.default.id
+  should_taint     = true
+
+  custom_labels = {
+    type = "spot"
+  }
+
+  custom_taints {
+    key    = "dedicated"
+    value  = "spot"
+    effect = "NoSchedule"
+  }
+
+  constraints {
+    spot                                        = true
+    use_spot_fallbacks                          = true
+    fallback_restore_rate_seconds               = 1800
+    enable_spot_diversity                       = true
+    spot_diversity_price_increase_limit_percent = 20
+    spot_interruption_predictions_enabled       = true
+    spot_interruption_predictions_type          = "aws-rebalance-recommendations"
+    is_gpu_only                                 = false
+    min_cpu                                     = 2
+    max_cpu                                     = 16
+    min_memory                                  = 4096
+    max_memory                                  = 24576
+    architectures                               = ["amd64"]
+    azs                                         = ["eu-central-1a", "eu-central-1b"]
+    customer_specific                           = "disabled"
+
+    instance_families {
+      exclude = ["m5"]
+    }
+
+    custom_priority {
+      instance_families = ["c5"]
+      spot              = true
+    }
+  }
+}
+
+resource "castai_autoscaler" "castai_autoscaler_policy" {
+  cluster_id = castai_eks_cluster.my_castai_cluster.id
+
+  autoscaler_settings {
+    enabled                                 = true
+    is_scoped_mode                          = false
+    node_templates_partial_matching_enabled = false
+
+    unschedulable_pods {
+      enabled = true
+    }
+
+    cluster_limits {
+      enabled = false
+
+      cpu {
+        min_cores = 1
+        max_cores = 200
+      }
+    }
+
+    node_downscaler {
+      enabled = true
+
+      empty_nodes {
+        enabled = true
+      }
+
+      evictor {
+        enabled                   = true
+        aggressive_mode           = false
+        cycle_interval            = "60s"
+        dry_run                   = false
+        node_grace_period_minutes = 10
+        scoped_mode               = false
+      }
+    }
+  }
+}
diff --git a/examples/eks/eks_cluster_gitops/outputs.tf b/examples/eks/eks_cluster_gitops/outputs.tf
index 23bd0fff..b04e9533 100644
--- a/examples/eks/eks_cluster_gitops/outputs.tf
+++ b/examples/eks/eks_cluster_gitops/outputs.tf
@@ -1,10 +1,21 @@
 output "cluster_id" {
-  value       = castai_eks_cluster.my_castai_cluster.id
+  value       = castai_eks_clusterid.cluster_id.id
   description = "CAST AI cluster ID"
 }
 
-output "cluster_token" {
-  value       = castai_eks_cluster.my_castai_cluster.cluster_token
-  description = "CAST AI cluster token used by Castware to atuhenticate to Mothership"
-  sensitive   = true
-}
\ No newline at end of file
+output "instance_profile_role_arn" {
+  description = "ARN of the IAM role attached to the CAST AI node instance profile"
+  value       = module.castai-eks-role-iam.instance_profile_role_arn
+}
+
+output "instance_profile_arn" {
+  description = "ARN of the instance profile used by CAST AI provisioned nodes"
+  value       = module.castai-eks-role-iam.instance_profile_arn
+}
+
+output "cast_role_arn" {
+  description = "ARN of the IAM role assumed by CAST AI"
+  value       = module.castai-eks-role-iam.role_arn
+}
diff --git a/examples/eks/eks_cluster_gitops/providers.tf b/examples/eks/eks_cluster_gitops/providers.tf
index cc1f004d..8627def4 100644
--- a/examples/eks/eks_cluster_gitops/providers.tf
+++ b/examples/eks/eks_cluster_gitops/providers.tf
@@ -1,4 +1,8 @@
 provider "castai" {
   api_url   = var.castai_api_url
   api_token = var.castai_api_token
-}
\ No newline at end of file
+}
+
+provider "aws" {
+  region  = var.aws_cluster_region
+  profile = var.profile
+}
diff --git a/examples/eks/eks_cluster_gitops/tf.vars.example b/examples/eks/eks_cluster_gitops/tf.vars.example
index 778f311a..80db8c2c 100644
--- a/examples/eks/eks_cluster_gitops/tf.vars.example
+++ b/examples/eks/eks_cluster_gitops/tf.vars.example
@@ -2,7 +2,7 @@ castai_api_token   = "PLACEHOLDER"
 aws_account_id     = "PLACEHOLDER"
 aws_cluster_region = "PLACEHOLDER"
 aws_cluster_name   = "PLACEHOLDER"
-aws_assume_role_arn = "PLACEHOLDER"
 subnets            = ["PLACEHOLDER1", "PLACEHOLDER2"]
-security_groups    = ["PLACEHOLDER1", "PLACEHOLDER2"]
-instance_profile   = "PLACEHOLDER"
\ No newline at end of file
+vpc_id                    = "PLACEHOLDER"
+cluster_security_group_id = "PLACEHOLDER"
+node_security_group_id    = "PLACEHOLDER"
diff --git a/examples/eks/eks_cluster_gitops/variables.tf b/examples/eks/eks_cluster_gitops/variables.tf
index fbf7c58d..149a22ce 100644
--- a/examples/eks/eks_cluster_gitops/variables.tf
+++ b/examples/eks/eks_cluster_gitops/variables.tf
@@ -31,17 +31,26 @@ variable "subnets" {
   description = "Subnet IDs used by CAST AI to provision nodes"
 }
 
-variable "security_groups" {
-  type        = list(string)
-  description = "Security Groups IDs used by CAST AI nodes to connect to K8s control plane, other nodes and have outbound access to Internet"
+variable "cluster_security_group_id" {
+  type        = string
+  description = "EKS cluster security group ID"
 }
 
-variable "instance_profile" {
+variable "node_security_group_id" {
   type        = string
-  description = "Instance profile ARN used by CAST AI provisioned nodes to connect to K8s control plane"
+  description = "EKS cluster node security group ID"
 }
 
+variable "vpc_id" {
+  type        = string
+  description = "EKS cluster VPC ID"
+}
+
+variable "profile" {
+  type        = string
+  description = "AWS CLI profile used by the AWS provider"
+  default     = "default"
+}
 
 ## Optional variables.
 
@@ -55,4 +64,4 @@ variable "delete_nodes_on_disconnect" {
   type        = bool
   description = "Optionally delete Cast AI created nodes when the cluster is destroyed"
   default     = false
-}
\ No newline at end of file
+}