Skip to content

Commit

Permalink
Aks arm template (#235)
Browse files Browse the repository at this point in the history
* Add R/O Onboarding example for AKS

* remove unused variables

* run make generate-sdk

* update sdk

* add ARM template for onboarding clusters in AKS

---------

Co-authored-by: Phil Andrews <[email protected]>
Co-authored-by: Phil Andrews <[email protected]>
  • Loading branch information
3 people authored Oct 20, 2023
1 parent 4eb102d commit 5d51d84
Show file tree
Hide file tree
Showing 9 changed files with 495 additions and 0 deletions.
23 changes: 23 additions & 0 deletions examples/aks/aks_cluster_arm_template/README.MD
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# AKS and CAST AI example with CAST AI Autoscaler policies and additional Node Configurations
The following example shows how to onboard an AKS cluster to CAST AI and configure [Autoscaler policies](https://docs.cast.ai/reference/policiesapi_upsertclusterpolicies) and additional [Node Configurations](https://docs.cast.ai/docs/node-configuration/).

Example configuration should be analysed in the following order:
1. Create Virtual network - `vnet.tf`
2. Create AKS cluster - `aks.tf`
3. Create CAST AI related resources to connect the AKS cluster to CAST AI, configure Autoscaler and Node Configurations, and deploy the onboarding ARM template (`castai_arm_template.json`) - `castai.tf`

# Usage
1. Rename `tf.vars.example` to `tf.vars`
2. Update `tf.vars` file with your cluster name, cluster region and CAST AI API token.
3. Initialize Terraform. From the example root folder, run:
```
terraform init
```
4. Run Terraform apply:
```
terraform apply -var-file=tf.vars
```
5. To destroy resources created by this example:
```
terraform destroy -var-file=tf.vars
```
25 changes: 25 additions & 0 deletions examples/aks/aks_cluster_arm_template/aks.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# 2. Create AKS cluster.
#
# The cluster lives in the resource group declared elsewhere in this example
# (azurerm_resource_group.this) and attaches its default node pool to the
# "internal" subnet (azurerm_subnet.internal).

resource "azurerm_kubernetes_cluster" "this" {
  name                = var.cluster_name
  dns_prefix          = var.cluster_name
  location            = azurerm_resource_group.this.location
  resource_group_name = azurerm_resource_group.this.name

  # Separate resource group for the cluster's node resources, named after the cluster.
  node_resource_group = "${var.cluster_name}-ng"

  # The cluster uses a system-assigned managed identity.
  identity {
    type = "SystemAssigned"
  }

  default_node_pool {
    name           = "default"
    vm_size        = "Standard_D2_v2"
    vnet_subnet_id = azurerm_subnet.internal.id

    # NOTE(review): the CAST AI controller needs more than one node to deploy
    # successfully. The original note said "> 2" while the value used is 2 -
    # confirm the exact minimum before lowering or relying on this number.
    node_count = 2
  }

  tags = {
    Environment = "Test"
  }
}
152 changes: 152 additions & 0 deletions examples/aks/aks_cluster_arm_template/castai.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# 3. Connect AKS cluster to CAST AI, configure Autoscaler policies and Node Configurations, and deploy the onboarding ARM template.

# Providers and data sources required for the CAST AI connection.

# CAST AI provider; the API endpoint and token are supplied through variables.
provider "castai" {
  api_token = var.castai_api_token
  api_url   = var.castai_api_url
}

# Current Azure subscription. Its subscription_id and tenant_id are passed to
# the CAST AI module further down in this file.
data "azurerm_subscription" "current" {}

# Helm provider authenticated against the AKS cluster defined in this example.
# The certificate/key fields of kube_config are base64-decoded before being
# handed to the provider; the host is used as-is.
provider "helm" {
  kubernetes {
    host                   = azurerm_kubernetes_cluster.this.kube_config[0].host
    cluster_ca_certificate = base64decode(azurerm_kubernetes_cluster.this.kube_config[0].cluster_ca_certificate)
    client_certificate     = base64decode(azurerm_kubernetes_cluster.this.kube_config[0].client_certificate)
    client_key             = base64decode(azurerm_kubernetes_cluster.this.kube_config[0].client_key)
  }
}

# Configure AKS cluster connection to CAST AI using the CAST AI aks-cluster module
# (registry source "castai/aks/castai"). The module receives the cluster identity,
# the node configurations, the node templates and the Autoscaler policies declared below.
module "castai-aks-cluster" {
source = "castai/aks/castai"

# CAST AI API endpoint and token (the same variables the castai provider uses).
api_url = var.castai_api_url
castai_api_token = var.castai_api_token
# NOTE(review): presumably makes the module wait until the cluster is reported ready - confirm against the module documentation.
wait_for_cluster_ready = true

# Cluster identity: name and region come from variables, both resource groups
# are read from the AKS cluster resource so they always match what was created.
aks_cluster_name = var.cluster_name
aks_cluster_region = var.cluster_region
node_resource_group = azurerm_kubernetes_cluster.this.node_resource_group
resource_group = azurerm_kubernetes_cluster.this.resource_group_name

delete_nodes_on_disconnect = var.delete_nodes_on_disconnect

# Subscription and tenant are taken from the current azurerm subscription data source.
subscription_id = data.azurerm_subscription.current.subscription_id
tenant_id = data.azurerm_subscription.current.tenant_id

# References this module's own castai_node_configurations output; the "default"
# key corresponds to the "default" entry of node_configurations below.
default_node_configuration = module.castai-aks-cluster.castai_node_configurations["default"]

# Node configurations. Both entries use the internal subnet and var.tags;
# test_node_config additionally caps pods per node at 40.
node_configurations = {
default = {
disk_cpu_ratio = 25
subnets = [azurerm_subnet.internal.id]
tags = var.tags
}

test_node_config = {
disk_cpu_ratio = 25
subnets = [azurerm_subnet.internal.id]
tags = var.tags
max_pods_per_node = 40
}
}

# Node templates. Both templates are bound to the "default" node configuration.
node_templates = {
# Default template: untainted, allows both on-demand and spot instances
# with spot fallbacks; spot diversity is disabled.
default_by_castai = {
name = "default-by-castai"
configuration_id = module.castai-aks-cluster.castai_node_configurations["default"]
is_default = true
should_taint = false

constraints = {
on_demand = true
spot = true
use_spot_fallbacks = true

enable_spot_diversity = false
spot_diversity_price_increase_limit_percent = 20
}
}
# Spot template: tainted, carries two custom labels and two custom taints,
# and constrains instances to 4-100 CPUs excluding the standard_DPLSv5 family.
spot_tmpl = {
configuration_id = module.castai-aks-cluster.castai_node_configurations["default"]
should_taint = true

custom_labels = {
custom-label-key-1 = "custom-label-value-1"
custom-label-key-2 = "custom-label-value-2"
}

custom_taints = [
{
key = "custom-taint-key-1"
value = "custom-taint-value-1"
},
{
key = "custom-taint-key-2"
value = "custom-taint-value-2"
}
]

constraints = {
fallback_restore_rate_seconds = 1800
spot = true
use_spot_fallbacks = true
min_cpu = 4
max_cpu = 100
instance_families = {
exclude = ["standard_DPLSv5"]
}
compute_optimized = false
storage_optimized = false
}
}
}

// Configure Autoscaler policies as per API specification https://api.cast.ai/v1/spec/#/PoliciesAPI/PoliciesAPIUpsertClusterPolicies.
// Here:
// - unschedulablePods - Unscheduled pods policy
// - nodeDownscaler - Node deletion policy (empty-node removal and evictor settings)
// - clusterLimits - cluster-wide CPU limits (minCores / maxCores)
// The heredoc body below is passed to the module verbatim as a JSON string.
autoscaler_policies_json = <<-EOT
{
"enabled": true,
"unschedulablePods": {
"enabled": true
},
"nodeDownscaler": {
"enabled": true,
"emptyNodes": {
"enabled": true
},
"evictor": {
"aggressiveMode": false,
"cycleInterval": "5m10s",
"dryRun": false,
"enabled": true,
"nodeGracePeriodMinutes": 10,
"scopedMode": false
}
},
"clusterLimits": {
"cpu": {
"maxCores": 20,
"minCores": 1
},
"enabled": true
}
}
EOT

}

# Deploy the CAST AI onboarding ARM template (castai_arm_template.json) into the
# resource group of the AKS cluster. The deployment is named after the cluster
# and runs in "Incremental" mode.
resource "azurerm_resource_group_template_deployment" "castai_onboarding_arm" {
  name                = var.cluster_name
  resource_group_name = azurerm_kubernetes_cluster.this.resource_group_name
  deployment_mode     = "Incremental"

  # Resolve the template relative to this module's directory instead of the
  # process working directory, so the example also works when Terraform is
  # invoked from another folder (e.g. `terraform -chdir=...` or as a child module).
  template_content = file("${path.module}/castai_arm_template.json")

  # Template parameters: each ARM parameter is passed as an object with a `value` key.
  parameters_content = jsonencode({
    "apiKey"              = { value = var.castai_api_token }
    "clusterResourceName" = { value = var.cluster_name }
    "location"            = { value = var.cluster_region }
  })
}
Loading

0 comments on commit 5d51d84

Please sign in to comment.