From a8a81d9bbb2e730e06fa605d322b33c3064f8dc7 Mon Sep 17 00:00:00 2001 From: anthonyvg9 <124307868+anthonyvg9@users.noreply.github.com> Date: Wed, 18 Oct 2023 13:23:48 -0700 Subject: [PATCH 1/4] init eks-cluster-existing --- examples/eks/eks_cluster_existing/README.MD | 29 +++ examples/eks/eks_cluster_existing/castai.tf | 172 ++++++++++++++++++ .../eks/eks_cluster_existing/providers.tf | 15 ++ .../eks/eks_cluster_existing/variables.tf | 55 ++++++ examples/eks/eks_cluster_existing/versions.tf | 17 ++ 5 files changed, 288 insertions(+) create mode 100644 examples/eks/eks_cluster_existing/README.MD create mode 100644 examples/eks/eks_cluster_existing/castai.tf create mode 100644 examples/eks/eks_cluster_existing/providers.tf create mode 100644 examples/eks/eks_cluster_existing/variables.tf create mode 100644 examples/eks/eks_cluster_existing/versions.tf diff --git a/examples/eks/eks_cluster_existing/README.MD b/examples/eks/eks_cluster_existing/README.MD new file mode 100644 index 00000000..75bb01f9 --- /dev/null +++ b/examples/eks/eks_cluster_existing/README.MD @@ -0,0 +1,29 @@ +## Existing EKS cluster and CAST AI example with CAST AI Autoscaler policies and additional Node Configurations + +The following example shows how to onboard an existing EKS cluster to CAST AI, configure [Autoscaler policies](https://docs.cast.ai/reference/policiesapi_upsertclusterpolicies) and additional [Node Configurations](https://docs.cast.ai/docs/node-configuration/). + +IAM policies required to connect the cluster to CAST AI in the example are created by [castai/eks-role-iam/castai module](https://github.com/castai/terraform-castai-eks-role-iam). + +Example configuration should be analysed in the following order: +1. Creates IAM and other CAST AI related resources to connect EKS cluster to CAST AI, configure Autoscaler and Node Configurations - `castai.tf` + +# Usage +1. Rename `tf.vars.example` to `tf.vars` +2. 
Update `tf.vars` file with your cluster name, cluster region, vpc_id, cluster_security_group_id, node_security_group_id, subnets and CAST AI API token. +3. Initialize Terraform. Under example root folder run: +``` +terraform init +``` +4. Run Terraform apply: +``` +terraform apply -var-file=tf.vars +``` +5. To destroy resources created by this example: +``` +terraform destroy -var-file=tf.vars +``` + +> **Note** +> +> If you are onboarding existing cluster to CAST AI you need to also update [aws-auth](https://docs.aws.amazon.com/eks/latest/userguide/add-user-role.html) configmap. In the configmap instance profile +> used by CAST AI has to be present. Example of entry can be found [here](https://github.com/castai/terraform-provider-castai/blob/157babd57b0977f499eb162e9bee27bee51d292a/examples/eks/eks_cluster_autoscaler_polices/eks.tf#L28-L38). diff --git a/examples/eks/eks_cluster_existing/castai.tf b/examples/eks/eks_cluster_existing/castai.tf new file mode 100644 index 00000000..39ae67a6 --- /dev/null +++ b/examples/eks/eks_cluster_existing/castai.tf @@ -0,0 +1,172 @@ +# Configure Data sources and providers required for CAST AI connection. +data "aws_caller_identity" "current" {} + +data "aws_eks_cluster" "existing_cluster" { + name = var.cluster_name # Replace with the actual name of your EKS cluster +} + +resource "castai_eks_user_arn" "castai_user_arn" { + cluster_id = castai_eks_clusterid.cluster_id.id +} + + +provider "castai" { + api_url = var.castai_api_url + api_token = var.castai_api_token +} + +provider "helm" { + kubernetes { + host = data.aws_eks_cluster.existing_cluster.endpoint + cluster_ca_certificate = base64decode(data.aws_eks_cluster.existing_cluster.certificate_authority.0.data) + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the awscli to be installed locally where Terraform is executed. 
+ args = ["eks", "get-token", "--cluster-name", var.cluster_name, "--region", var.cluster_region] + } + } +} + +# Create AWS IAM policies and a user to connect to CAST AI. +module "castai-eks-role-iam" { + source = "castai/eks-role-iam/castai" + + aws_account_id = data.aws_caller_identity.current.account_id + aws_cluster_region = var.cluster_region + aws_cluster_name = var.cluster_name + aws_cluster_vpc_id = var.vpc_id + + castai_user_arn = castai_eks_user_arn.castai_user_arn.arn + + create_iam_resources_per_cluster = true +} + +# Configure EKS cluster connection using CAST AI eks-cluster module. +resource "castai_eks_clusterid" "cluster_id" { + account_id = data.aws_caller_identity.current.account_id + region = var.cluster_region + cluster_name = var.cluster_name +} + +module "castai-eks-cluster" { + source = "castai/eks-cluster/castai" + + api_url = var.castai_api_url + castai_api_token = var.castai_api_token + wait_for_cluster_ready = true + + aws_account_id = data.aws_caller_identity.current.account_id + aws_cluster_region = var.cluster_region + aws_cluster_name = var.cluster_name + + aws_assume_role_arn = module.castai-eks-role-iam.role_arn + delete_nodes_on_disconnect = var.delete_nodes_on_disconnect + + default_node_configuration = module.castai-eks-cluster.castai_node_configurations["default"] + + node_configurations = { + default = { + subnets = var.subnets + tags = var.tags + security_groups = [ + var.cluster_security_group_id, + var.node_security_group_id + ] + instance_profile_arn = module.castai-eks-role-iam.instance_profile_arn + } + } + + + node_templates = { + default_by_castai = { + name = "default-by-castai" + configuration_id = module.castai-eks-cluster.castai_node_configurations["default"] + is_default = true + should_taint = false + + constraints = { + on_demand = true + spot = false + use_spot_fallbacks = false + + enable_spot_diversity = false + spot_diversity_price_increase_limit_percent = 20 + + spot_interruption_predictions_enabled = false 
+ spot_interruption_predictions_type = "aws-rebalance-recommendations" + } + } + } + + # Configure Autoscaler policies as per API specification https://api.cast.ai/v1/spec/#/PoliciesAPI/PoliciesAPIUpsertClusterPolicies. + # Here: + # - unschedulablePods - Unscheduled pods policy + # - nodeDownscaler - Node deletion policy + autoscaler_policies_json = <<-EOT + { + "enabled" : false, + "isScopedMode" : false, + "unschedulablePods" : { + "enabled" : false + }, + "clusterLimits" : { + "enabled" : false + }, + "nodeDownscaler" : { + "enabled" : false, + "emptyNodes" : { + "enabled" : false, + "delaySeconds" : 300 + }, + "evictor" : { + "enabled" : false, + "aggressiveMode" : false, + "nodeGracePeriodMinutes" : 5 + } + } +} + + EOT + + # depends_on helps Terraform with creating proper dependencies graph in case of resource creation and in this case destroy. + # module "castai-eks-cluster" has to be destroyed before module "castai-eks-role-iam". + depends_on = [module.castai-eks-role-iam] +} + +resource "castai_rebalancing_schedule" "test" { + name = "rebalance spots at every 30th minute" + schedule { + cron = "*/30 * * * *" + } + trigger_conditions { + savings_percentage = 20 + } + launch_configuration { + # only consider instances older than 5 minutes + node_ttl_seconds = 300 + num_targeted_nodes = 3 + rebalancing_min_nodes = 2 + keep_drain_timeout_nodes = false + selector = jsonencode({ + nodeSelectorTerms = [{ + matchExpressions = [ + { + key = "scheduling.cast.ai/spot" + operator = "Exists" + } + ] + }] + }) + execution_conditions { + enabled = true + achieved_savings_percentage = 10 + } + } +} + +resource "castai_rebalancing_job" "test" { + cluster_id = castai_eks_clusterid.cluster_id.id + rebalancing_schedule_id = castai_rebalancing_schedule.test.id + enabled = true +} diff --git a/examples/eks/eks_cluster_existing/providers.tf b/examples/eks/eks_cluster_existing/providers.tf new file mode 100644 index 00000000..7978c5ff --- /dev/null +++ 
b/examples/eks/eks_cluster_existing/providers.tf @@ -0,0 +1,15 @@ +# Following providers required by EKS and VPC modules. +provider "aws" { + region = var.cluster_region +} + +provider "kubernetes" { + host = data.aws_eks_cluster.existing_cluster.endpoint + cluster_ca_certificate = base64decode(data.aws_eks_cluster.existing_cluster.certificate_authority.0.data) + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the awscli to be installed locally where Terraform is executed + args = ["eks", "get-token", "--cluster-name", var.cluster_name, "--region", var.cluster_region] + } +} diff --git a/examples/eks/eks_cluster_existing/variables.tf b/examples/eks/eks_cluster_existing/variables.tf new file mode 100644 index 00000000..d4e24172 --- /dev/null +++ b/examples/eks/eks_cluster_existing/variables.tf @@ -0,0 +1,55 @@ +# EKS module variables. +variable "cluster_name" { + type = string + description = "EKS cluster name in AWS account." +} + +variable "cluster_region" { + type = string + description = "AWS Region in which EKS cluster and supporting resources will be created." +} + +variable "vpc_id" { + type = string + description = "EKS cluster VPC ID" +} + +variable "castai_api_url" { + type = string + description = "URL of alternative CAST AI API to be used during development or testing" + default = "https://api.cast.ai" +} + +# Variables required for connecting EKS cluster to CAST AI. +variable "castai_api_token" { + type = string + description = "CAST AI API token created in console.cast.ai API Access keys section" +} + +variable "delete_nodes_on_disconnect" { + type = bool + description = "Optional parameter, if set to true - CAST AI provisioned nodes will be deleted from cloud on cluster disconnection. For production use it is recommended to set it to false." + default = true +} + +variable "tags" { + type = map(any) + description = "Optional tags for new cluster nodes. 
This parameter applies only to new nodes - tags for old nodes are not reconciled." + default = {} +} + + +variable "cluster_security_group_id" { + type = string + description = "EKS cluster security group ID" +} + +variable "node_security_group_id" { + type = string + description = "EKS cluster node security group ID" +} + +variable "subnets" { + type = list(string) + description = "Subnet IDs used by CAST AI to provision nodes" +} diff --git a/examples/eks/eks_cluster_existing/versions.tf b/examples/eks/eks_cluster_existing/versions.tf new file mode 100644 index 00000000..70ee4499 --- /dev/null +++ b/examples/eks/eks_cluster_existing/versions.tf @@ -0,0 +1,17 @@ +terraform { + required_providers { + castai = { + source = "castai/castai" + } + kubernetes = { + source = "hashicorp/kubernetes" + } + helm = { + source = "hashicorp/helm" + } + aws = { + source = "hashicorp/aws" + } + } + required_version = ">= 0.13" +} From d1e8ee068bd2532658cf0fef9defd32f669874e0 Mon Sep 17 00:00:00 2001 From: anthonyvg9 <124307868+anthonyvg9@users.noreply.github.com> Date: Wed, 18 Oct 2023 13:27:24 -0700 Subject: [PATCH 2/4] scheduled rebalance updated --- examples/eks/eks_cluster_existing/castai.tf | 6 +++--- examples/eks/eks_cluster_existing/tf.vars.example | 7 +++++++ 2 files changed, 10 insertions(+), 3 deletions(-) create mode 100644 examples/eks/eks_cluster_existing/tf.vars.example diff --git a/examples/eks/eks_cluster_existing/castai.tf b/examples/eks/eks_cluster_existing/castai.tf index 39ae67a6..96f122e5 100644 --- a/examples/eks/eks_cluster_existing/castai.tf +++ b/examples/eks/eks_cluster_existing/castai.tf @@ -134,7 +134,7 @@ module "castai-eks-cluster" { depends_on = [module.castai-eks-role-iam] } -resource "castai_rebalancing_schedule" "test" { +resource "castai_rebalancing_schedule" "spots" { name = "rebalance spots at every 30th minute" schedule { cron = "*/30 * * * *" @@ -165,8 +165,8 @@ resource "castai_rebalancing_schedule" "test" { } } -resource 
"castai_rebalancing_job" "test" { +resource "castai_rebalancing_job" "spots" { cluster_id = castai_eks_clusterid.cluster_id.id - rebalancing_schedule_id = castai_rebalancing_schedule.test.id + rebalancing_schedule_id = castai_rebalancing_schedule.spots.id enabled = true } diff --git a/examples/eks/eks_cluster_existing/tf.vars.example b/examples/eks/eks_cluster_existing/tf.vars.example new file mode 100644 index 00000000..3f828d79 --- /dev/null +++ b/examples/eks/eks_cluster_existing/tf.vars.example @@ -0,0 +1,7 @@ +cluster_name = "" +cluster_region = "" +castai_api_token = "" +vpc_id = "" +cluster_security_group_id = "" +node_security_group_id = "" +subnets = ["", ""] \ No newline at end of file From 2c44d6eced29f14bc5b00fae062b6f774653f031 Mon Sep 17 00:00:00 2001 From: anthonyvg9 <124307868+anthonyvg9@users.noreply.github.com> Date: Wed, 18 Oct 2023 15:27:30 -0700 Subject: [PATCH 3/4] aks-existing-example --- examples/aks/aks_cluster_existing/README.MD | 21 +++ examples/aks/aks_cluster_existing/castai.tf | 133 +++++++++++++ .../aks/aks_cluster_existing/providers.tf | 8 + .../aks/aks_cluster_existing/tf.vars.example | 5 + .../aks/aks_cluster_existing/variables.tf | 44 +++++ examples/aks/aks_cluster_existing/versions.tf | 14 ++ examples/gke/gke_cluster_existing/castai.tf | 175 ++++++++++++++++++ .../gke/gke_cluster_existing/variables.tf | 49 +++++ examples/gke/gke_cluster_existing/version.tf | 17 ++ 9 files changed, 466 insertions(+) create mode 100644 examples/aks/aks_cluster_existing/README.MD create mode 100644 examples/aks/aks_cluster_existing/castai.tf create mode 100644 examples/aks/aks_cluster_existing/providers.tf create mode 100644 examples/aks/aks_cluster_existing/tf.vars.example create mode 100644 examples/aks/aks_cluster_existing/variables.tf create mode 100644 examples/aks/aks_cluster_existing/versions.tf create mode 100644 examples/gke/gke_cluster_existing/castai.tf create mode 100644 examples/gke/gke_cluster_existing/variables.tf create mode 100644 
examples/gke/gke_cluster_existing/version.tf diff --git a/examples/aks/aks_cluster_existing/README.MD b/examples/aks/aks_cluster_existing/README.MD new file mode 100644 index 00000000..f3566224 --- /dev/null +++ b/examples/aks/aks_cluster_existing/README.MD @@ -0,0 +1,21 @@ +# Existing AKS cluster and CAST AI example with CAST AI Autoscaler policies and additional Node Configurations +Following example shows how to onboard existing AKS cluster to CAST AI, configure [Autoscaler policies](https://docs.cast.ai/reference/policiesapi_upsertclusterpolicies) and additional [Node Configurations](https://docs.cast.ai/docs/node-configuration/). + +Example configuration should be analysed in the following order: +1. Create CAST AI related resources to connect AKS cluster to CAST AI, configure Autoscaler and Node Configurations - `castai.tf` + +# Usage +1. Rename `tf.vars.example` to `tf.vars` +2. Update `tf.vars` file with your cluster name, cluster_rg, cluster region, subnets and CAST AI API token. +3. Initialize Terraform. Under example root folder run: +``` +terraform init +``` +4. Run Terraform apply: +``` +terraform apply -var-file=tf.vars +``` +5. To destroy resources created by this example: +``` +terraform destroy -var-file=tf.vars +``` diff --git a/examples/aks/aks_cluster_existing/castai.tf b/examples/aks/aks_cluster_existing/castai.tf new file mode 100644 index 00000000..984c2827 --- /dev/null +++ b/examples/aks/aks_cluster_existing/castai.tf @@ -0,0 +1,133 @@ +# Configure Data sources and providers required for CAST AI connection. 
+data "azurerm_subscription" "current" {} + +data "azurerm_kubernetes_cluster" "example" { + name = var.cluster_name + resource_group_name = var.cluster_rg +} + +provider "castai" { + api_url = var.castai_api_url + api_token = var.castai_api_token +} + +provider "helm" { + kubernetes { + host = data.azurerm_kubernetes_cluster.example.kube_config.0.host + client_certificate = base64decode(data.azurerm_kubernetes_cluster.example.kube_config.0.client_certificate) + client_key = base64decode(data.azurerm_kubernetes_cluster.example.kube_config.0.client_key) + cluster_ca_certificate = base64decode(data.azurerm_kubernetes_cluster.example.kube_config.0.cluster_ca_certificate) + } +} + +# Configure AKS cluster connection to CAST AI using CAST AI aks-cluster module. +module "castai-aks-cluster" { + source = "castai/aks/castai" + + api_url = var.castai_api_url + castai_api_token = var.castai_api_token + wait_for_cluster_ready = true + + aks_cluster_name = var.cluster_name + aks_cluster_region = var.cluster_region + node_resource_group = data.azurerm_kubernetes_cluster.example.node_resource_group + resource_group = data.azurerm_kubernetes_cluster.example.resource_group_name + delete_nodes_on_disconnect = var.delete_nodes_on_disconnect + + subscription_id = data.azurerm_subscription.current.subscription_id + tenant_id = data.azurerm_subscription.current.tenant_id + + + default_node_configuration = module.castai-aks-cluster.castai_node_configurations["default"] + + node_configurations = { + default = { + disk_cpu_ratio = 0 + subnets = var.subnets + tags = var.tags + max_pods_per_node = 60 + } + } + + node_templates = { + default_by_castai = { + name = "default-by-castai" + configuration_id = module.castai-aks-cluster.castai_node_configurations["default"] + is_default = true + should_taint = false + + constraints = { + on_demand = true + min_cpu = 8 + max_cpu = 96 + max_memory = 786432 + instance_families = { + exclude = ["standard_FSv2", "standard_Dv4"] + } + } + } + } + + // 
Configure Autoscaler policies as per API specification https://api.cast.ai/v1/spec/#/PoliciesAPI/PoliciesAPIUpsertClusterPolicies. + // Here: + // - unschedulablePods - Unscheduled pods policy + // - nodeDownscaler - Node deletion policy + autoscaler_policies_json = <<-EOT + { + "enabled": true, + "unschedulablePods": { + "enabled": true + }, + "nodeDownscaler": { + "enabled": true, + "emptyNodes": { + "enabled": true + }, + "evictor": { + "aggressiveMode": false, + "cycleInterval": "5m10s", + "dryRun": false, + "enabled": true, + "nodeGracePeriodMinutes": 5, + "scopedMode": false + } + }, + "clusterLimits": { + "cpu": { + "maxCores": 100, + "minCores": 1 + }, + "enabled": false + } + } + EOT + +} + + +resource "castai_rebalancing_schedule" "default" { + name = "rebalance nodes at every 30th minute" + schedule { + cron = "CRON_TZ=America/Argentina/Buenos_Aires */30 * * * *" + } + trigger_conditions { + savings_percentage = 20 + } + launch_configuration { + # only consider instances older than 5 minutes + node_ttl_seconds = 300 + num_targeted_nodes = 3 + rebalancing_min_nodes = 2 + keep_drain_timeout_nodes = false + execution_conditions { + enabled = true + achieved_savings_percentage = 10 + } + } +} + +resource "castai_rebalancing_job" "default" { + cluster_id = module.castai-aks-cluster.cluster_id + rebalancing_schedule_id = castai_rebalancing_schedule.default.id + enabled = true +} diff --git a/examples/aks/aks_cluster_existing/providers.tf b/examples/aks/aks_cluster_existing/providers.tf new file mode 100644 index 00000000..bdab1922 --- /dev/null +++ b/examples/aks/aks_cluster_existing/providers.tf @@ -0,0 +1,8 @@ +# Following providers required by AKS and Vnet resources. 
+provider "azurerm" { + features {} +} + +provider "azuread" { + tenant_id = data.azurerm_subscription.current.tenant_id +} diff --git a/examples/aks/aks_cluster_existing/tf.vars.example b/examples/aks/aks_cluster_existing/tf.vars.example new file mode 100644 index 00000000..abedbc3a --- /dev/null +++ b/examples/aks/aks_cluster_existing/tf.vars.example @@ -0,0 +1,5 @@ +cluster_name = "" +cluster_rg = "" +cluster_region = "" +castai_api_token = "" +subnets = [""] \ No newline at end of file diff --git a/examples/aks/aks_cluster_existing/variables.tf b/examples/aks/aks_cluster_existing/variables.tf new file mode 100644 index 00000000..f0f68a8b --- /dev/null +++ b/examples/aks/aks_cluster_existing/variables.tf @@ -0,0 +1,44 @@ +# AKS cluster variables. +variable "cluster_name" { + type = string + description = "Name of the AKS cluster, resources will be created for." +} + +variable "cluster_rg" { + type = string + description = "Resource Group of the AKS cluster, resources will be created for." +} + +variable "cluster_region" { + type = string + description = "Region of the AKS cluster, resources will be created for." +} + +variable "castai_api_url" { + type = string + description = "URL of alternative CAST AI API to be used during development or testing" + default = "https://api.cast.ai" +} + +# Variables required for connecting AKS cluster to CAST AI +variable "castai_api_token" { + type = string + description = "CAST AI API token created in console.cast.ai API Access keys section" +} + +variable "delete_nodes_on_disconnect" { + type = bool + description = "Optional parameter, if set to true - CAST AI provisioned nodes will be deleted from cloud on cluster disconnection. For production use it is recommended to set it to false." + default = true +} + +variable "tags" { + type = map(any) + description = "Optional tags for new cluster nodes. This parameter applies only to new nodes - tags for old nodes are not reconciled." 
+ default = {} +} + +variable "subnets" { + type = list(string) + description = "Cluster subnets" +} \ No newline at end of file diff --git a/examples/aks/aks_cluster_existing/versions.tf b/examples/aks/aks_cluster_existing/versions.tf new file mode 100644 index 00000000..9c3f0a05 --- /dev/null +++ b/examples/aks/aks_cluster_existing/versions.tf @@ -0,0 +1,14 @@ +terraform { + required_providers { + azurerm = { + source = "hashicorp/azurerm" + } + azuread = { + source = "hashicorp/azuread" + } + castai = { + source = "castai/castai" + } + } + required_version = ">= 0.13" +} diff --git a/examples/gke/gke_cluster_existing/castai.tf b/examples/gke/gke_cluster_existing/castai.tf new file mode 100644 index 00000000..46ee0061 --- /dev/null +++ b/examples/gke/gke_cluster_existing/castai.tf @@ -0,0 +1,175 @@ +# 3. Connect GKE cluster to CAST AI in read-only mode. + +# Configure Data sources and providers required for CAST AI connection. + +data "google_client_config" "default" {} + +data "google_secret_manager_secret_version" "cast_ai_services_dev_token" { + secret = "rouseservice-key" + project = var.project_id +} + +provider "castai" { + api_url = var.castai_api_url + api_token = data.google_secret_manager_secret_version.cast_ai_services_dev_token.secret_data +} + +data "google_container_cluster" "my_cluster" { + name = var.cluster_name + location = var.cluster_region + project = var.project_id +} + + + +provider "helm" { + kubernetes { + host = "https://${data.google_container_cluster.my_cluster.endpoint}" + token = data.google_client_config.default.access_token + cluster_ca_certificate = base64decode(data.google_container_cluster.my_cluster.master_auth.0.cluster_ca_certificate) + } +} + +# Configure GKE cluster connection using CAST AI gke-cluster module. 
+module "castai-gke-iam" { + source = "castai/gke-iam/castai" + + project_id = var.project_id + gke_cluster_name = var.cluster_name +} + +module "castai-gke-cluster" { + source = "castai/gke-cluster/castai" + + api_url = var.castai_api_url + castai_api_token = data.google_secret_manager_secret_version.cast_ai_services_dev_token.secret_data + wait_for_cluster_ready = true + + project_id = var.project_id + gke_cluster_name = var.cluster_name + gke_cluster_location = var.cluster_region + + gke_credentials = data.google_container_cluster.my_cluster.master_auth[0].client_certificate + delete_nodes_on_disconnect = var.delete_nodes_on_disconnect + + default_node_configuration = module.castai-gke-cluster.castai_node_configurations["default"] + + node_configurations = { + default = { + disk_cpu_ratio = 25 + subnets = var.subnets + tags = var.tags + } + + # # Commented out for POC + # test_node_config = { + # disk_cpu_ratio = 10 + # subnets = [module.vpc.subnets_ids[0]] + # tags = var.tags + # max_pods_per_node = 40 + # disk_type = "pd-ssd", + # network_tags = ["dev"] + # } + + } + # # Commented out for POC + # node_templates = { + # default_by_castai = { + # name = "default-by-castai" + # configuration_id = module.castai-gke-cluster.castai_node_configurations["default"] + # is_default = true + # should_taint = false + + # constraints = { + # on_demand = true + # spot = true + # use_spot_fallbacks = true + + # enable_spot_diversity = false + # spot_diversity_price_increase_limit_percent = 20 + # } + # } + # spot_tmpl = { + # configuration_id = module.castai-gke-cluster.castai_node_configurations["default"] + # should_taint = true + + # custom_labels = { + # custom-label-key-1 = "custom-label-value-1" + # custom-label-key-2 = "custom-label-value-2" + # } + + # custom_taints = [ + # { + # key = "custom-taint-key-1" + # value = "custom-taint-value-1" + # effect = "NoSchedule" + # }, + # { + # key = "custom-taint-key-2" + # value = "custom-taint-value-2" + # effect = 
"NoSchedule" + # } + # ] + + # constraints = { + # fallback_restore_rate_seconds = 1800 + # spot = true + # use_spot_fallbacks = true + # min_cpu = 4 + # max_cpu = 100 + # instance_families = { + # exclude = ["e2"] + # } + # compute_optimized = false + # storage_optimized = false + # } + + # custom_instances_enabled = true + # } + # } + + // Configure Autoscaler policies as per API specification https://api.cast.ai/v1/spec/#/PoliciesAPI/PoliciesAPIUpsertClusterPolicies. + // Here: + // - unschedulablePods - Unscheduled pods policy + // - nodeDownscaler - Node deletion policy + + # # Commend oout for POC + autoscaler_policies_json = <<-EOT +{ + "enabled": false, + "unschedulablePods": { + "enabled": false + }, + "nodeDownscaler": { + "enabled": false, + "emptyNodes": { + "enabled": false + }, + "evictor": { + "aggressiveMode": false, + "cycleInterval": "5m10s", + "dryRun": false, + "enabled": false, + "nodeGracePeriodMinutes": 10, + "scopedMode": false + } + }, + "clusterLimits": { + "cpu": { + "maxCores": 20, + "minCores": 1 + }, + "enabled": false + }, + "maxReclaimRate": 0, + "spotBackups": { + "enabled": false, + "spotBackupRestoreRateSeconds": 1800 + } +} + EOT + + // depends_on helps terraform with creating proper dependencies graph in case of resource creation and in this case destroy + // module "castai-gke-cluster" has to be destroyed before module "castai-gke-iam" and "module.gke" + depends_on = [data.google_container_cluster.my_cluster, module.castai-gke-iam] +} diff --git a/examples/gke/gke_cluster_existing/variables.tf b/examples/gke/gke_cluster_existing/variables.tf new file mode 100644 index 00000000..0a754e19 --- /dev/null +++ b/examples/gke/gke_cluster_existing/variables.tf @@ -0,0 +1,49 @@ +variable "cluster_name" { + type = string + description = "GKE cluster name in GCP project." + default = "gke-907-av" +} + +variable "cluster_region" { + type = string + description = "The region to create the cluster." 
+ default = "us-central1" +} + +variable "cluster_zones" { + type = list(string) + description = "The zones to create the cluster." + default = ["us-central1-c"] +} + +variable "project_id" { + type = string + description = "GCP project ID in which GKE cluster would be created." + default = "demos-321800" +} + +variable "castai_api_url" { + type = string + description = "URL of alternative CAST AI API to be used during development or testing" + default = "https://api.cast.ai" +} + +# Variables required for connecting EKS cluster to CAST AI + +variable "delete_nodes_on_disconnect" { + type = bool + description = "Optional parameter, if set to true - CAST AI provisioned nodes will be deleted from cloud on cluster disconnection. For production use it is recommended to set it to false." + default = true +} + +variable "tags" { + type = map(any) + description = "Optional tags for new cluster nodes. This parameter applies only to new nodes - tags for old nodes are not reconciled." + default = {} +} + +variable "subnets" { + type = list(string) + description = "Cluster subnets" + default = ["projects/demos-321800/regions/us-central1/subnetworks/gke-907-av-ip-range-nodes"] +} \ No newline at end of file diff --git a/examples/gke/gke_cluster_existing/version.tf b/examples/gke/gke_cluster_existing/version.tf new file mode 100644 index 00000000..502f0c51 --- /dev/null +++ b/examples/gke/gke_cluster_existing/version.tf @@ -0,0 +1,17 @@ +terraform { + required_providers { + castai = { + source = "castai/castai" + } + kubernetes = { + source = "hashicorp/kubernetes" + } + google = { + source = "hashicorp/google" + } + google-beta = { + source = "hashicorp/google-beta" + } + } + required_version = ">= 0.13" +} From 70600d6ca1ef29d860eb95515f777f4808904304 Mon Sep 17 00:00:00 2001 From: anthonyvg9 <124307868+anthonyvg9@users.noreply.github.com> Date: Wed, 18 Oct 2023 17:44:23 -0700 Subject: [PATCH 4/4] gke-existing-example --- examples/gke/gke_cluster_existing/README.MD | 24 +++ 
examples/gke/gke_cluster_existing/castai.tf | 137 ++++++++---------- .../gke/gke_cluster_existing/tf.vars.example | 6 + .../gke/gke_cluster_existing/variables.tf | 10 +- 4 files changed, 99 insertions(+), 78 deletions(-) create mode 100644 examples/gke/gke_cluster_existing/README.MD create mode 100644 examples/gke/gke_cluster_existing/tf.vars.example diff --git a/examples/gke/gke_cluster_existing/README.MD b/examples/gke/gke_cluster_existing/README.MD new file mode 100644 index 00000000..30ece0eb --- /dev/null +++ b/examples/gke/gke_cluster_existing/README.MD @@ -0,0 +1,24 @@ +## Existing GKE cluster and CAST AI example with CAST AI Autoscaler policies and additional Node Configurations + +The following example shows how to onboard an existing GKE cluster to CAST AI, configure [Autoscaler policies](https://docs.cast.ai/reference/policiesapi_upsertclusterpolicies) and additional [Node Configurations](https://docs.cast.ai/docs/node-configuration/). + +IAM policies required to connect the cluster to CAST AI in the example are created by [castai/gke-iam/castai module](https://github.com/castai/terraform-castai-gke-iam). + +Example configuration should be analysed in the following order: +1. Create IAM and other CAST AI related resources to connect GKE cluster to CAST AI, configure Autoscaler and Node Configurations - `castai.tf` + +# Usage +1. Rename `tf.vars.example` to `tf.vars` +2. Update `tf.vars` file with your project name, cluster name, cluster region, cluster zones, project_id, subnets, and CAST AI API token. +3. Initialize Terraform. Under example root folder run: +``` +terraform init +``` +4. Run Terraform apply: +``` +terraform apply -var-file=tf.vars +``` +5. 
To destroy resources created by this example: +``` +terraform destroy -var-file=tf.vars +``` diff --git a/examples/gke/gke_cluster_existing/castai.tf b/examples/gke/gke_cluster_existing/castai.tf index 46ee0061..1c83653b 100644 --- a/examples/gke/gke_cluster_existing/castai.tf +++ b/examples/gke/gke_cluster_existing/castai.tf @@ -4,14 +4,9 @@ data "google_client_config" "default" {} -data "google_secret_manager_secret_version" "cast_ai_services_dev_token" { - secret = "rouseservice-key" - project = var.project_id -} - provider "castai" { api_url = var.castai_api_url - api_token = data.google_secret_manager_secret_version.cast_ai_services_dev_token.secret_data + api_token = var.castai_api_token } data "google_container_cluster" "my_cluster" { @@ -21,7 +16,6 @@ data "google_container_cluster" "my_cluster" { } - provider "helm" { kubernetes { host = "https://${data.google_container_cluster.my_cluster.endpoint}" @@ -42,14 +36,14 @@ module "castai-gke-cluster" { source = "castai/gke-cluster/castai" api_url = var.castai_api_url - castai_api_token = data.google_secret_manager_secret_version.cast_ai_services_dev_token.secret_data + castai_api_token = var.castai_api_token wait_for_cluster_ready = true project_id = var.project_id gke_cluster_name = var.cluster_name gke_cluster_location = var.cluster_region - gke_credentials = data.google_container_cluster.my_cluster.master_auth[0].client_certificate + gke_credentials = module.castai-gke-iam.private_key delete_nodes_on_disconnect = var.delete_nodes_on_disconnect default_node_configuration = module.castai-gke-cluster.castai_node_configurations["default"] @@ -61,72 +55,69 @@ module "castai-gke-cluster" { tags = var.tags } - # # Commented out for POC - # test_node_config = { - # disk_cpu_ratio = 10 - # subnets = [module.vpc.subnets_ids[0]] - # tags = var.tags - # max_pods_per_node = 40 - # disk_type = "pd-ssd", - # network_tags = ["dev"] - # } + test_node_config = { + disk_cpu_ratio = 10 + subnets = var.subnets + tags = var.tags 
+ max_pods_per_node = 40 + disk_type = "pd-ssd", + network_tags = ["dev"] + } } - # # Commented out for POC - # node_templates = { - # default_by_castai = { - # name = "default-by-castai" - # configuration_id = module.castai-gke-cluster.castai_node_configurations["default"] - # is_default = true - # should_taint = false - - # constraints = { - # on_demand = true - # spot = true - # use_spot_fallbacks = true - - # enable_spot_diversity = false - # spot_diversity_price_increase_limit_percent = 20 - # } - # } - # spot_tmpl = { - # configuration_id = module.castai-gke-cluster.castai_node_configurations["default"] - # should_taint = true - - # custom_labels = { - # custom-label-key-1 = "custom-label-value-1" - # custom-label-key-2 = "custom-label-value-2" - # } - - # custom_taints = [ - # { - # key = "custom-taint-key-1" - # value = "custom-taint-value-1" - # effect = "NoSchedule" - # }, - # { - # key = "custom-taint-key-2" - # value = "custom-taint-value-2" - # effect = "NoSchedule" - # } - # ] - - # constraints = { - # fallback_restore_rate_seconds = 1800 - # spot = true - # use_spot_fallbacks = true - # min_cpu = 4 - # max_cpu = 100 - # instance_families = { - # exclude = ["e2"] - # } - # compute_optimized = false - # storage_optimized = false - # } - - # custom_instances_enabled = true - # } - # } + node_templates = { + default_by_castai = { + name = "default-by-castai" + configuration_id = module.castai-gke-cluster.castai_node_configurations["default"] + is_default = true + should_taint = false + + constraints = { + on_demand = true + spot = true + use_spot_fallbacks = true + + enable_spot_diversity = false + spot_diversity_price_increase_limit_percent = 20 + } + } + spot_tmpl = { + configuration_id = module.castai-gke-cluster.castai_node_configurations["default"] + should_taint = true + + custom_labels = { + custom-label-key-1 = "custom-label-value-1" + custom-label-key-2 = "custom-label-value-2" + } + + custom_taints = [ + { + key = "custom-taint-key-1" + value = 
"custom-taint-value-1" + effect = "NoSchedule" + }, + { + key = "custom-taint-key-2" + value = "custom-taint-value-2" + effect = "NoSchedule" + } + ] + constraints = { + fallback_restore_rate_seconds = 1800 + spot = true + use_spot_fallbacks = true + min_cpu = 4 + max_cpu = 100 + instance_families = { + exclude = ["e2"] + } + compute_optimized = false + storage_optimized = false + } + + custom_instances_enabled = true + } + } // Configure Autoscaler policies as per API specification https://api.cast.ai/v1/spec/#/PoliciesAPI/PoliciesAPIUpsertClusterPolicies. // Here: diff --git a/examples/gke/gke_cluster_existing/tf.vars.example b/examples/gke/gke_cluster_existing/tf.vars.example new file mode 100644 index 00000000..cf36dd21 --- /dev/null +++ b/examples/gke/gke_cluster_existing/tf.vars.example @@ -0,0 +1,6 @@ +cluster_name = "" +cluster_region = "" +cluster_zones = [""] +castai_api_token = "" +project_id = "" +subnets = [""] diff --git a/examples/gke/gke_cluster_existing/variables.tf b/examples/gke/gke_cluster_existing/variables.tf index 0a754e19..67341c11 100644 --- a/examples/gke/gke_cluster_existing/variables.tf +++ b/examples/gke/gke_cluster_existing/variables.tf @@ -1,25 +1,21 @@ variable "cluster_name" { type = string description = "GKE cluster name in GCP project." - default = "gke-907-av" } variable "cluster_region" { type = string description = "The region to create the cluster." - default = "us-central1" } variable "cluster_zones" { type = list(string) description = "The zones to create the cluster." - default = ["us-central1-c"] } variable "project_id" { type = string description = "GCP project ID in which GKE cluster would be created." - default = "demos-321800" } variable "castai_api_url" { @@ -28,6 +24,11 @@ variable "castai_api_url" { default = "https://api.cast.ai" } +variable "castai_api_token" { + type = string + description = "CAST AI API token created in console.cast.ai API Access keys section." 
+} + # Variables required for connecting EKS cluster to CAST AI variable "delete_nodes_on_disconnect" { @@ -45,5 +46,4 @@ variable "tags" { variable "subnets" { type = list(string) description = "Cluster subnets" - default = ["projects/demos-321800/regions/us-central1/subnetworks/gke-907-av-ip-range-nodes"] } \ No newline at end of file