From 5d51d84a0f6052a18d1c387b1968af50278df414 Mon Sep 17 00:00:00 2001 From: Phil <91220442+CastAIPhil@users.noreply.github.com> Date: Fri, 20 Oct 2023 11:52:52 -0400 Subject: [PATCH] Aks arm template (#235) * Add R/O Onboarding example for AKS * remove unused variables * run make generate-sdk * update sdk * add ARM template for onboarding clusters in AKS --------- Co-authored-by: Phil Andrews Co-authored-by: Phil Andrews --- .../aks/aks_cluster_arm_template/README.MD | 23 ++ examples/aks/aks_cluster_arm_template/aks.tf | 25 +++ .../aks/aks_cluster_arm_template/castai.tf | 152 +++++++++++++ .../castai_arm_template.json | 210 ++++++++++++++++++ .../aks/aks_cluster_arm_template/providers.tf | 8 + .../aks_cluster_arm_template/tf.vars.example | 3 + .../aks/aks_cluster_arm_template/variables.tf | 40 ++++ .../aks/aks_cluster_arm_template/versions.tf | 14 ++ examples/aks/aks_cluster_arm_template/vnet.tf | 20 ++ 9 files changed, 495 insertions(+) create mode 100644 examples/aks/aks_cluster_arm_template/README.MD create mode 100644 examples/aks/aks_cluster_arm_template/aks.tf create mode 100644 examples/aks/aks_cluster_arm_template/castai.tf create mode 100644 examples/aks/aks_cluster_arm_template/castai_arm_template.json create mode 100644 examples/aks/aks_cluster_arm_template/providers.tf create mode 100644 examples/aks/aks_cluster_arm_template/tf.vars.example create mode 100644 examples/aks/aks_cluster_arm_template/variables.tf create mode 100644 examples/aks/aks_cluster_arm_template/versions.tf create mode 100644 examples/aks/aks_cluster_arm_template/vnet.tf diff --git a/examples/aks/aks_cluster_arm_template/README.MD b/examples/aks/aks_cluster_arm_template/README.MD new file mode 100644 index 00000000..b59044fb --- /dev/null +++ b/examples/aks/aks_cluster_arm_template/README.MD @@ -0,0 +1,23 @@ +# AKS and CAST AI example with CAST AI Autoscaler policies and additional Node Configurations +Following example shows how to onboard AKS cluster to CAST AI, configure 
[Autoscaler policies](https://docs.cast.ai/reference/policiesapi_upsertclusterpolicies) and additional [Node Configurations](https://docs.cast.ai/docs/node-configuration/). + +Example configuration should be analysed in the following order: +1. Create Virtual network - `vnet.tf` +2. Create AKS cluster - `aks.tf` +3. Create CAST AI related resources to connect AKS cluster to CAST AI, configure Autoscaler and Node Configurations - `castai.tf` + +# Usage +1. Rename `tf.vars.example` to `tf.vars` +2. Update `tf.vars` file with your cluster name, cluster region and CAST AI API token. +3. Initialize Terraform. Under example root folder run: +``` +terraform init +``` +4. Run Terraform apply: +``` +terraform apply -var-file=tf.vars +``` +5. To destroy resources created by this example: +``` +terraform destroy -var-file=tf.vars +``` diff --git a/examples/aks/aks_cluster_arm_template/aks.tf b/examples/aks/aks_cluster_arm_template/aks.tf new file mode 100644 index 00000000..254476e7 --- /dev/null +++ b/examples/aks/aks_cluster_arm_template/aks.tf @@ -0,0 +1,25 @@ +# 2. Create AKS cluster. + +resource "azurerm_kubernetes_cluster" "this" { + name = var.cluster_name + resource_group_name = azurerm_resource_group.this.name + location = azurerm_resource_group.this.location + dns_prefix = var.cluster_name + node_resource_group = "${var.cluster_name}-ng" + + default_node_pool { + name = "default" + # Node count has to be >= 2 to successfully deploy CAST AI controller. + node_count = 2 + vm_size = "Standard_D2_v2" + vnet_subnet_id = azurerm_subnet.internal.id + } + + identity { + type = "SystemAssigned" + } + + tags = { + Environment = "Test" + } +} diff --git a/examples/aks/aks_cluster_arm_template/castai.tf b/examples/aks/aks_cluster_arm_template/castai.tf new file mode 100644 index 00000000..ea485597 --- /dev/null +++ b/examples/aks/aks_cluster_arm_template/castai.tf @@ -0,0 +1,152 @@ +# 3. Connect AKS cluster to CAST AI in READ-ONLY mode. 
+ +# Configure Data sources and providers required for CAST AI connection. +data "azurerm_subscription" "current" {} + +provider "castai" { + api_url = var.castai_api_url + api_token = var.castai_api_token +} + +provider "helm" { + kubernetes { + host = azurerm_kubernetes_cluster.this.kube_config.0.host + client_certificate = base64decode(azurerm_kubernetes_cluster.this.kube_config.0.client_certificate) + client_key = base64decode(azurerm_kubernetes_cluster.this.kube_config.0.client_key) + cluster_ca_certificate = base64decode(azurerm_kubernetes_cluster.this.kube_config.0.cluster_ca_certificate) + } +} + +# Configure AKS cluster connection to CAST AI using CAST AI aks-cluster module. +module "castai-aks-cluster" { + source = "castai/aks/castai" + + api_url = var.castai_api_url + castai_api_token = var.castai_api_token + wait_for_cluster_ready = true + + aks_cluster_name = var.cluster_name + aks_cluster_region = var.cluster_region + node_resource_group = azurerm_kubernetes_cluster.this.node_resource_group + resource_group = azurerm_kubernetes_cluster.this.resource_group_name + + delete_nodes_on_disconnect = var.delete_nodes_on_disconnect + + subscription_id = data.azurerm_subscription.current.subscription_id + tenant_id = data.azurerm_subscription.current.tenant_id + + default_node_configuration = module.castai-aks-cluster.castai_node_configurations["default"] + + node_configurations = { + default = { + disk_cpu_ratio = 25 + subnets = [azurerm_subnet.internal.id] + tags = var.tags + } + + test_node_config = { + disk_cpu_ratio = 25 + subnets = [azurerm_subnet.internal.id] + tags = var.tags + max_pods_per_node = 40 + } + } + + node_templates = { + default_by_castai = { + name = "default-by-castai" + configuration_id = module.castai-aks-cluster.castai_node_configurations["default"] + is_default = true + should_taint = false + + constraints = { + on_demand = true + spot = true + use_spot_fallbacks = true + + enable_spot_diversity = false + 
spot_diversity_price_increase_limit_percent = 20 + } + } + spot_tmpl = { + configuration_id = module.castai-aks-cluster.castai_node_configurations["default"] + should_taint = true + + custom_labels = { + custom-label-key-1 = "custom-label-value-1" + custom-label-key-2 = "custom-label-value-2" + } + + custom_taints = [ + { + key = "custom-taint-key-1" + value = "custom-taint-value-1" + }, + { + key = "custom-taint-key-2" + value = "custom-taint-value-2" + } + ] + + constraints = { + fallback_restore_rate_seconds = 1800 + spot = true + use_spot_fallbacks = true + min_cpu = 4 + max_cpu = 100 + instance_families = { + exclude = ["standard_DPLSv5"] + } + compute_optimized = false + storage_optimized = false + } + } + } + + // Configure Autoscaler policies as per API specification https://api.cast.ai/v1/spec/#/PoliciesAPI/PoliciesAPIUpsertClusterPolicies. + // Here: + // - unschedulablePods - Unscheduled pods policy + // - nodeDownscaler - Node deletion policy + autoscaler_policies_json = <<-EOT + { + "enabled": true, + "unschedulablePods": { + "enabled": true + }, + "nodeDownscaler": { + "enabled": true, + "emptyNodes": { + "enabled": true + }, + "evictor": { + "aggressiveMode": false, + "cycleInterval": "5m10s", + "dryRun": false, + "enabled": true, + "nodeGracePeriodMinutes": 10, + "scopedMode": false + } + }, + "clusterLimits": { + "cpu": { + "maxCores": 20, + "minCores": 1 + }, + "enabled": true + } + } + EOT + +} + +resource "azurerm_resource_group_template_deployment" "castai_onboarding_arm" { + name = var.cluster_name + resource_group_name = azurerm_kubernetes_cluster.this.resource_group_name + deployment_mode = "Incremental" + template_content = file("castai_arm_template.json",) + parameters_content = jsonencode({ + "apiKey" = {value = var.castai_api_token } + "clusterResourceName" = {value = var.cluster_name} + "location" = {value = var.cluster_region} + }) +} diff --git a/examples/aks/aks_cluster_arm_template/castai_arm_template.json 
b/examples/aks/aks_cluster_arm_template/castai_arm_template.json new file mode 100644 index 00000000..2eecbee9 --- /dev/null +++ b/examples/aks/aks_cluster_arm_template/castai_arm_template.json @@ -0,0 +1,210 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "outputs": {}, + "parameters": { + "apiKey": { + "type": "String" + }, + "clusterResourceName": { + "metadata": { + "description": "The name of the Managed Cluster resource." + }, + "type": "String" + }, + "createNewCluster": { + "defaultValue": false, + "metadata": { + "description": "When set to 'true', creates new AKS cluster. Otherwise, an existing cluster is used." + }, + "type": "Bool" + }, + "dnsPrefix": { + "defaultValue": "[concat(parameters('clusterResourceName'),'-dns')]", + "metadata": { + "description": "Optional DNS prefix to use with hosted Kubernetes API server FQDN." + }, + "type": "String" + }, + "enableAzurePolicy": { + "defaultValue": false, + "metadata": { + "description": "Boolean flag to turn on and off Azure Policy addon." + }, + "type": "Bool" + }, + "enableHttpApplicationRouting": { + "defaultValue": true, + "metadata": { + "description": "Boolean flag to turn on and off http application routing." + }, + "type": "Bool" + }, + "enablePrivateCluster": { + "defaultValue": false, + "metadata": { + "description": "Enable private network access to the Kubernetes cluster." + }, + "type": "Bool" + }, + "enableRBAC": { + "defaultValue": true, + "metadata": { + "description": "Boolean flag to turn on and off of RBAC." + }, + "type": "Bool" + }, + "enableSecretStoreCSIDriver": { + "defaultValue": false, + "metadata": { + "description": "Boolean flag to turn on and off secret store CSI driver." + }, + "type": "Bool" + }, + "kubernetesVersion": { + "defaultValue": "1.25.6", + "metadata": { + "description": "The version of Kubernetes." 
+ }, + "type": "String" + }, + "location": { + "metadata": { + "description": "The location of AKS resource." + }, + "type": "String" + }, + "networkPlugin": { + "allowedValues": [ + "azure", + "kubenet" + ], + "defaultValue": "kubenet", + "metadata": { + "description": "Network plugin used for building Kubernetes network." + }, + "type": "String" + }, + "osDiskSizeGB": { + "defaultValue": 0, + "maxValue": 1023, + "metadata": { + "description": "Disk size (in GiB) to provision for each of the agent pool nodes. This value ranges from 0 to 1023. Specifying 0 will apply the default disk size for that agentVMSize." + }, + "minValue": 0, + "type": "Int" + }, + "vmCount": { + "defaultValue": 3, + "metadata": { + "description": "VM count" + }, + "type": "Int" + }, + "vmEnableAutoScale": { + "defaultValue": true, + "metadata": { + "description": "enable auto scaling" + }, + "type": "Bool" + }, + "vmSize": { + "defaultValue": "Standard_DS2_v2", + "metadata": { + "description": "VM size" + }, + "type": "String" + } + }, + "resources": [ + { + "apiVersion": "2022-11-01", + "condition": "[parameters('createNewCluster')]", + "dependsOn": [], + "identity": { + "type": "SystemAssigned" + }, + "location": "[parameters('location')]", + "name": "[parameters('clusterResourceName')]", + "properties": { + "addonProfiles": { + "azureKeyvaultSecretsProvider": { + "enabled": "[parameters('enableSecretStoreCSIDriver')]" + }, + "azurepolicy": { + "enabled": "[parameters('enableAzurePolicy')]" + }, + "httpApplicationRouting": { + "enabled": "[parameters('enableHttpApplicationRouting')]" + } + }, + "agentPoolProfiles": [ + { + "count": "[parameters('vmCount')]", + "enableAutoScaling": "[parameters('vmEnableAutoScale')]", + "enableNodePublicIP": false, + "maxCount": 10, + "maxPods": 110, + "minCount": 1, + "mode": "System", + "name": "agentpool", + "osDiskSizeGB": "[parameters('osDiskSizeGB')]", + "osType": "Linux", + "storageProfile": "ManagedDisks", + "tags": {}, + "type": 
"VirtualMachineScaleSets", + "vmSize": "[parameters('vmSize')]" + } + ], + "apiServerAccessProfile": { + "enablePrivateCluster": "[parameters('enablePrivateCluster')]" + }, + "dnsPrefix": "[parameters('dnsPrefix')]", + "enableRBAC": "[parameters('enableRBAC')]", + "kubernetesVersion": "[parameters('kubernetesVersion')]", + "networkProfile": { + "loadBalancerSku": "standard", + "networkPlugin": "[parameters('networkPlugin')]" + } + }, + "sku": { + "name": "Basic", + "tier": "Free" + }, + "tags": {}, + "type": "Microsoft.ContainerService/managedClusters" + }, + { + "apiVersion": "2022-11-01", + "dependsOn": [ + "[resourceId('Microsoft.ContainerService/managedClusters/', parameters('clusterResourceName'))]" + ], + "name": "[variables('extensionResourceName')]", + "plan": { + "name": "[variables('plan-name')]", + "product": "[variables('plan-offerID')]", + "publisher": "[variables('plan-publisher')]" + }, + "properties": { + "autoUpgradeMinorVersion": true, + "configurationSettings": { + "apiKey": "[parameters('apiKey')]", + "provider": "[variables('provider')]" + }, + "extensionType": "[variables('clusterExtensionTypeName')]", + "releaseTrain": "[variables('releaseTrain')]" + }, + "scope": "[concat('Microsoft.ContainerService/managedClusters/', parameters('clusterResourceName'))]", + "type": "Microsoft.KubernetesConfiguration/extensions" + } + ], + "variables": { + "clusterExtensionTypeName": "castai.agent.standard", + "extensionResourceName": "castai-agent", + "plan-name": "standard", + "plan-offerID": "castai-agent", + "plan-publisher": "castaigroupinc1683643265413", + "provider": "aks", + "releaseTrain": "stable" + } +} \ No newline at end of file diff --git a/examples/aks/aks_cluster_arm_template/providers.tf b/examples/aks/aks_cluster_arm_template/providers.tf new file mode 100644 index 00000000..bdab1922 --- /dev/null +++ b/examples/aks/aks_cluster_arm_template/providers.tf @@ -0,0 +1,8 @@ +# Following providers required by AKS and Vnet resources. 
+provider "azurerm" { + features {} +} + +provider "azuread" { + tenant_id = data.azurerm_subscription.current.tenant_id +} diff --git a/examples/aks/aks_cluster_arm_template/tf.vars.example b/examples/aks/aks_cluster_arm_template/tf.vars.example new file mode 100644 index 00000000..ad015b01 --- /dev/null +++ b/examples/aks/aks_cluster_arm_template/tf.vars.example @@ -0,0 +1,3 @@ +cluster_name = "" +cluster_region = "" +castai_api_token = "" diff --git a/examples/aks/aks_cluster_arm_template/variables.tf b/examples/aks/aks_cluster_arm_template/variables.tf new file mode 100644 index 00000000..6700d57b --- /dev/null +++ b/examples/aks/aks_cluster_arm_template/variables.tf @@ -0,0 +1,40 @@ +# AKS cluster variables. +variable "cluster_name" { + type = string + description = "Name of the AKS cluster, resources will be created for." +} + +variable "cluster_region" { + type = string + description = "Region of the AKS cluster, resources will be created for." +} + +variable "cluster_version" { + type = string + description = "AKS cluster version." + default = "1.23" +} + +variable "castai_api_url" { + type = string + description = "URL of alternative CAST AI API to be used during development or testing" + default = "https://api.cast.ai" +} + +# Variables required for connecting AKS cluster to CAST AI +variable "castai_api_token" { + type = string + description = "CAST AI API token created in console.cast.ai API Access keys section" +} + +variable "delete_nodes_on_disconnect" { + type = bool + description = "Optional parameter, if set to true - CAST AI provisioned nodes will be deleted from cloud on cluster disconnection. For production use it is recommended to set it to false." + default = true +} + +variable "tags" { + type = map(any) + description = "Optional tags for new cluster nodes. This parameter applies only to new nodes - tags for old nodes are not reconciled." 
+ default = {} +} diff --git a/examples/aks/aks_cluster_arm_template/versions.tf b/examples/aks/aks_cluster_arm_template/versions.tf new file mode 100644 index 00000000..5106c2c1 --- /dev/null +++ b/examples/aks/aks_cluster_arm_template/versions.tf @@ -0,0 +1,14 @@ +terraform { + required_providers { + azurerm = { + source = "hashicorp/azurerm" + } + azuread = { + source = "hashicorp/azuread" + } + castai = { + source = "castai/castai" + } + } + required_version = ">= 0.13" +} diff --git a/examples/aks/aks_cluster_arm_template/vnet.tf b/examples/aks/aks_cluster_arm_template/vnet.tf new file mode 100644 index 00000000..77677a71 --- /dev/null +++ b/examples/aks/aks_cluster_arm_template/vnet.tf @@ -0,0 +1,20 @@ +# 1. Create virtual network and resource group for the cluster. + +resource "azurerm_resource_group" "this" { + name = var.cluster_name + location = var.cluster_region +} + +resource "azurerm_virtual_network" "this" { + name = "${var.cluster_name}-network" + location = azurerm_resource_group.this.location + resource_group_name = azurerm_resource_group.this.name + address_space = ["10.1.0.0/16"] +} + +resource "azurerm_subnet" "internal" { + name = "internal" + virtual_network_name = azurerm_virtual_network.this.name + resource_group_name = azurerm_resource_group.this.name + address_prefixes = ["10.1.0.0/22"] +}