-
Notifications
You must be signed in to change notification settings - Fork 35
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add ARM template for onboarding clusters in AKS
- Loading branch information
Phil Andrews
authored and
Phil Andrews
committed
Oct 19, 2023
1 parent
637d66f
commit 9ec7660
Showing
9 changed files
with
495 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# AKS and CAST AI example with CAST AI Autoscaler policies and additional Node Configurations | ||
The following example shows how to onboard an AKS cluster to CAST AI and configure [Autoscaler policies](https://docs.cast.ai/reference/policiesapi_upsertclusterpolicies) and additional [Node Configurations](https://docs.cast.ai/docs/node-configuration/). | ||
|
||
Example configuration should be analysed in the following order: | ||
1. Create Virtual network - `vnet.tf` | ||
2. Create AKS cluster - `aks.tf` | ||
3. Create CAST AI related resources to connect AKS cluster to CAST AI, configure Autoscaler and Node Configurations - `castai.tf` | ||
|
||
# Usage | ||
1. Rename `tf.vars.example` to `tf.vars` | ||
2. Update `tf.vars` file with your cluster name, cluster region and CAST AI API token. | ||
3. Initialize Terraform. Under example root folder run: | ||
``` | ||
terraform init | ||
``` | ||
4. Run Terraform apply: | ||
``` | ||
terraform apply -var-file=tf.vars | ||
``` | ||
5. To destroy resources created by this example: | ||
``` | ||
terraform destroy -var-file=tf.vars | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# 2. Create AKS cluster. | ||
|
||
# AKS cluster that castai.tf connects to CAST AI. | ||
# It is created in azurerm_resource_group.this and its default node pool is attached to | ||
# azurerm_subnet.internal; both resources are declared outside this file. | ||
resource "azurerm_kubernetes_cluster" "this" { | ||
name = var.cluster_name | ||
resource_group_name = azurerm_resource_group.this.name | ||
location = azurerm_resource_group.this.location | ||
dns_prefix = var.cluster_name | ||
# Resource group that holds the cluster's node resources; named "<cluster_name>-ng". | ||
node_resource_group = "${var.cluster_name}-ng" | ||
|
||
default_node_pool { | ||
name = "default" | ||
# At least 2 nodes are needed to successfully deploy the CAST AI controller. | ||
# NOTE(review): this comment previously said "> 2", which contradicts the value below; | ||
# presumably ">= 2" was meant - confirm against CAST AI requirements. | ||
node_count = 2 | ||
vm_size = "Standard_D2_v2" | ||
vnet_subnet_id = azurerm_subnet.internal.id | ||
} | ||
|
||
identity { | ||
type = "SystemAssigned" | ||
} | ||
|
||
tags = { | ||
Environment = "Test" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
# 3. Connect AKS cluster to CAST AI and configure Autoscaler policies and Node Configurations. | ||
|
||
# Configure Data sources and providers required for CAST AI connection. | ||
# The current Azure subscription supplies the subscription_id and tenant_id that are | ||
# passed to the castai-aks-cluster module further down in this file. | ||
data "azurerm_subscription" "current" {} | ||
|
||
# CAST AI provider: token and endpoint both come from input variables. | ||
provider "castai" { | ||
  api_token = var.castai_api_token | ||
  api_url   = var.castai_api_url | ||
} | ||
|
||
# Helm provider talks to the AKS cluster created in aks.tf, using the first (index 0) | ||
# kube_config entry exported by azurerm_kubernetes_cluster.this. | ||
provider "helm" { | ||
  kubernetes { | ||
    host = azurerm_kubernetes_cluster.this.kube_config[0].host | ||
    # Certificate material is exported base64-encoded, so it is decoded before use. | ||
    cluster_ca_certificate = base64decode(azurerm_kubernetes_cluster.this.kube_config[0].cluster_ca_certificate) | ||
    client_certificate     = base64decode(azurerm_kubernetes_cluster.this.kube_config[0].client_certificate) | ||
    client_key             = base64decode(azurerm_kubernetes_cluster.this.kube_config[0].client_key) | ||
  } | ||
} | ||
|
||
# Configure AKS cluster connection to CAST AI using CAST AI aks-cluster module. | ||
# Onboards the AKS cluster from aks.tf to CAST AI and declares node configurations, | ||
# node templates and autoscaler policies for it. | ||
module "castai-aks-cluster" { | ||
# NOTE(review): no `version` constraint is set for this registry module - consider pinning one. | ||
source = "castai/aks/castai" | ||
|
||
||
api_url = var.castai_api_url | ||
castai_api_token = var.castai_api_token | ||
wait_for_cluster_ready = true | ||
|
||
||
# Cluster identity: name/region come from variables, resource groups from the AKS resource. | ||
aks_cluster_name = var.cluster_name | ||
aks_cluster_region = var.cluster_region | ||
node_resource_group = azurerm_kubernetes_cluster.this.node_resource_group | ||
resource_group = azurerm_kubernetes_cluster.this.resource_group_name | ||
|
||
||
delete_nodes_on_disconnect = var.delete_nodes_on_disconnect | ||
|
||
||
subscription_id = data.azurerm_subscription.current.subscription_id | ||
tenant_id = data.azurerm_subscription.current.tenant_id | ||
|
||
||
# Self-reference: the module's own `castai_node_configurations` output is fed back in. | ||
# Presumably this resolves to the ID of the "default" entry of node_configurations below - confirm. | ||
default_node_configuration = module.castai-aks-cluster.castai_node_configurations["default"] | ||
|
||
||
# Two node configurations on the same subnet and tags; they differ only in that | ||
# test_node_config additionally sets max_pods_per_node. | ||
node_configurations = { | ||
default = { | ||
disk_cpu_ratio = 25 | ||
subnets = [azurerm_subnet.internal.id] | ||
tags = var.tags | ||
} | ||
|
||
||
test_node_config = { | ||
disk_cpu_ratio = 25 | ||
subnets = [azurerm_subnet.internal.id] | ||
tags = var.tags | ||
max_pods_per_node = 40 | ||
} | ||
} | ||
|
||
||
# Both node templates use the "default" node configuration. | ||
node_templates = { | ||
# Default template: untainted, allows both on-demand and spot nodes. | ||
default_by_castai = { | ||
name = "default-by-castai" | ||
configuration_id = module.castai-aks-cluster.castai_node_configurations["default"] | ||
is_default = true | ||
should_taint = false | ||
|
||
||
constraints = { | ||
on_demand = true | ||
spot = true | ||
use_spot_fallbacks = true | ||
|
||
||
enable_spot_diversity = false | ||
spot_diversity_price_increase_limit_percent = 20 | ||
} | ||
} | ||
# Spot template: tainted, with custom labels and taints and CPU / instance-family constraints. | ||
spot_tmpl = { | ||
configuration_id = module.castai-aks-cluster.castai_node_configurations["default"] | ||
should_taint = true | ||
|
||
||
custom_labels = { | ||
custom-label-key-1 = "custom-label-value-1" | ||
custom-label-key-2 = "custom-label-value-2" | ||
} | ||
|
||
||
custom_taints = [ | ||
{ | ||
key = "custom-taint-key-1" | ||
value = "custom-taint-value-1" | ||
}, | ||
{ | ||
key = "custom-taint-key-2" | ||
value = "custom-taint-value-2" | ||
} | ||
] | ||
|
||
||
constraints = { | ||
fallback_restore_rate_seconds = 1800 | ||
spot = true | ||
use_spot_fallbacks = true | ||
min_cpu = 4 | ||
max_cpu = 100 | ||
instance_families = { | ||
exclude = ["standard_DPLSv5"] | ||
} | ||
compute_optimized = false | ||
storage_optimized = false | ||
} | ||
} | ||
} | ||
|
||
||
// Configure Autoscaler policies as per API specification https://api.cast.ai/v1/spec/#/PoliciesAPI/PoliciesAPIUpsertClusterPolicies. | ||
// Here: | ||
// - unschedulablePods - Unscheduled pods policy | ||
// - nodeDownscaler - Node deletion policy | ||
// - clusterLimits - cluster-wide CPU limits (minCores / maxCores) | ||
// The heredoc body is passed to the module as a JSON string, so it must stay valid JSON (no comments inside). | ||
autoscaler_policies_json = <<-EOT | ||
{ | ||
"enabled": true, | ||
"unschedulablePods": { | ||
"enabled": true | ||
}, | ||
"nodeDownscaler": { | ||
"enabled": true, | ||
"emptyNodes": { | ||
"enabled": true | ||
}, | ||
"evictor": { | ||
"aggressiveMode": false, | ||
"cycleInterval": "5m10s", | ||
"dryRun": false, | ||
"enabled": true, | ||
"nodeGracePeriodMinutes": 10, | ||
"scopedMode": false | ||
} | ||
}, | ||
"clusterLimits": { | ||
"cpu": { | ||
"maxCores": 20, | ||
"minCores": 1 | ||
}, | ||
"enabled": true | ||
} | ||
} | ||
EOT | ||
|
||
||
} | ||
|
||
# Deploys the CAST AI onboarding ARM template (castai_arm_template.json, stored next to | ||
# this file) into the resource group of the AKS cluster. | ||
resource "azurerm_resource_group_template_deployment" "castai_onboarding_arm" { | ||
  name                = var.cluster_name | ||
  resource_group_name = azurerm_kubernetes_cluster.this.resource_group_name | ||
  deployment_mode     = "Incremental" | ||
|
||
  # Resolve the template relative to this module rather than the process working | ||
  # directory, so the example also works when invoked from another directory or | ||
  # used as a child module. | ||
  template_content = file("${path.module}/castai_arm_template.json") | ||
|
||
  # ARM expects each parameter wrapped in a { value = ... } object. | ||
  # NOTE(review): the parameter names must match those declared in | ||
  # castai_arm_template.json, which is not visible here - confirm. | ||
  parameters_content = jsonencode({ | ||
    "apiKey"              = { value = var.castai_api_token } | ||
    "clusterResourceName" = { value = var.cluster_name } | ||
    "location"            = { value = var.cluster_region } | ||
  }) | ||
} |
Oops, something went wrong.