feat: add autoscaler_policy_overrides support to module (#66)

castai · Jun 11, 2024 · cfda530 · cfda530
1 parent 16effbd
commit cfda530
Show file tree

Hide file tree

Showing 4 changed files with 288 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -83,6 +83,53 @@ module "castai-aks-cluster" {
       }
     }
   }
+
+  autoscaler_settings = {
+    enabled                                 = true
+    node_templates_partial_matching_enabled = false
+
+    unschedulable_pods = {
+      enabled = true
+
+      headroom = {
+        enabled           = true
+        cpu_percentage    = 10
+        memory_percentage = 10
+      }
+
+      headroom_spot = {
+        enabled           = true
+        cpu_percentage    = 10
+        memory_percentage = 10
+      }
+    }
+
+    node_downscaler = {
+      enabled = true
+
+      empty_nodes = {
+        enabled = true
+      }
+
+      evictor = {
+        aggressive_mode           = false
+        cycle_interval            = "5m10s"
+        dry_run                   = false
+        enabled                   = true
+        node_grace_period_minutes = 10
+        scoped_mode               = false
+      }
+    }
+
+    cluster_limits = {
+      enabled = true
+
+      cpu = {
+        max_cores = 20
+        min_cores = 1
+      }
+    }
+  }
 }
 ```
 
@@ -152,6 +199,88 @@ module "castai-aks-cluster" {
 }
 ```
 
+Migrating from 5.0.x to 5.2.x
+---------------------------
+
+Version 5.2.x changed:
+* Deprecated `autoscaler_policies_json` attribute. Use `autoscaler_settings` instead.
+
+Old configuration:
+```hcl
+module "castai-aks-cluster" {
+  autoscaler_policies_json = <<-EOT
+    {
+        "enabled": true,
+        "unschedulablePods": {
+            "enabled": true
+        },
+        "nodeDownscaler": {
+            "enabled": true,
+            "emptyNodes": {
+                "enabled": true
+            },
+            "evictor": {
+                "aggressiveMode": false,
+                "cycleInterval": "5m10s",
+                "dryRun": false,
+                "enabled": true,
+                "nodeGracePeriodMinutes": 10,
+                "scopedMode": false
+            }
+        },
+        "nodeTemplatesPartialMatchingEnabled": false,
+        "clusterLimits": {
+            "cpu": {
+                "maxCores": 20,
+                "minCores": 1
+            },
+            "enabled": true
+        }
+    }
+  EOT
+}
+```
+
+New configuration:
+```hcl
+module "castai-aks-cluster" {
+  autoscaler_settings = {
+    enabled                                 = true
+    node_templates_partial_matching_enabled = false
+
+    unschedulable_pods = {
+      enabled = true
+    }
+
+    node_downscaler = {
+      enabled = true
+
+      empty_nodes = {
+        enabled = true
+      }
+
+      evictor = {
+        aggressive_mode           = false
+        cycle_interval            = "5m10s"
+        dry_run                   = false
+        enabled                   = true
+        node_grace_period_minutes = 10
+        scoped_mode               = false
+      }
+    }
+
+    cluster_limits = {
+      enabled = true
+
+      cpu = {
+        max_cores = 20
+        min_cores = 1
+      }
+    }
+  }
+}
+```
+
 # Examples 
 
 Usage examples are located in [terraform provider repo](https://github.com/castai/terraform-provider-castai/tree/master/examples/aks)
@@ -164,18 +293,18 @@ Usage examples are located in [terraform provider repo](https://github.com/casta
 | <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 0.13   |
 | <a name="requirement_azuread"></a> [azuread](#requirement\_azuread) | >= 2.22.0 |
 | <a name="requirement_azurerm"></a> [azurerm](#requirement\_azurerm) | >= 3.7.0  |
-| <a name="requirement_castai"></a> [castai](#requirement\_castai) | ~> 7.0.0 |
+| <a name="requirement_castai"></a> [castai](#requirement\_castai) | ~> 7.4.0  |
 | <a name="requirement_helm"></a> [helm](#requirement\_helm) | >= 2.0.0  |
 
 ## Providers
 
-| Name | Version |
-|------|---------|
+| Name | Version   |
+|------|-----------|
 | <a name="provider_azuread"></a> [azuread](#provider\_azuread) | >= 2.22.0 |
-| <a name="provider_azurerm"></a> [azurerm](#provider\_azurerm) | >= 3.7.0 |
-| <a name="provider_castai"></a> [castai](#provider\_castai) | ~> 7.0.0 |
-| <a name="provider_helm"></a> [helm](#provider\_helm) | >= 2.0.0 |
-| <a name="provider_null"></a> [null](#provider\_null) | n/a |
+| <a name="provider_azurerm"></a> [azurerm](#provider\_azurerm) | >= 3.7.0  |
+| <a name="provider_castai"></a> [castai](#provider\_castai) | ~> 7.4.0  |
+| <a name="provider_helm"></a> [helm](#provider\_helm) | >= 2.0.0  |
+| <a name="provider_null"></a> [null](#provider\_null) | n/a       |
 
 ## Modules
 
@@ -199,9 +328,14 @@ No modules.
 | [castai_node_template.this](https://registry.terraform.io/providers/castai/castai/latest/docs/resources/node_template) | resource |
 | [helm_release.castai_agent](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
 | [helm_release.castai_cluster_controller](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
+| [helm_release.castai_cluster_controller_self_managed](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
 | [helm_release.castai_evictor](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
+| [helm_release.castai_evictor_ext](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
+| [helm_release.castai_evictor_self_managed](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
 | [helm_release.castai_kvisor](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
+| [helm_release.castai_kvisor_self_managed](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
 | [helm_release.castai_pod_pinner](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
+| [helm_release.castai_pod_pinner_self_managed](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
 | [helm_release.castai_spot_handler](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
 | [null_resource.wait_for_cluster](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource |
 | [azuread_client_config.current](https://registry.terraform.io/providers/hashicorp/azuread/latest/docs/data-sources/client_config) | data source |
@@ -217,14 +351,17 @@ No modules.
 | <a name="input_aks_cluster_region"></a> [aks\_cluster\_region](#input\_aks\_cluster\_region) | Region of the AKS cluster | `string` | n/a | yes |
 | <a name="input_api_grpc_addr"></a> [api\_grpc\_addr](#input\_api\_grpc\_addr) | CAST AI GRPC API address | `string` | `"api-grpc.cast.ai:443"` | no |
 | <a name="input_api_url"></a> [api\_url](#input\_api\_url) | URL of alternative CAST AI API to be used during development or testing | `string` | `"https://api.cast.ai"` | no |
-| <a name="input_autoscaler_policies_json"></a> [autoscaler\_policies\_json](#input\_autoscaler\_policies\_json) | Optional json object to override CAST AI cluster autoscaler policies | `string` | `null` | no |
+| <a name="input_autoscaler_policies_json"></a> [autoscaler\_policies\_json](#input\_autoscaler\_policies\_json) | Optional json object to override CAST AI cluster autoscaler policies. Deprecated, use `autoscaler_settings` instead. | `string` | `null` | no |
+| <a name="input_autoscaler_settings"></a> [autoscaler\_settings](#input\_autoscaler\_settings) | Optional Autoscaler policy definitions to override current autoscaler settings | `any` | `null` | no |
 | <a name="input_castai_api_token"></a> [castai\_api\_token](#input\_castai\_api\_token) | Optional CAST AI API token created in console.cast.ai API Access keys section. Used only when `wait_for_cluster_ready` is set to true | `string` | `""` | no |
 | <a name="input_castai_components_labels"></a> [castai\_components\_labels](#input\_castai\_components\_labels) | Optional additional Kubernetes labels for CAST AI pods | `map(any)` | `{}` | no |
 | <a name="input_castai_components_sets"></a> [castai\_components\_sets](#input\_castai\_components\_sets) | Optional additional 'set' configurations for helm resources. | `map(string)` | `{}` | no |
 | <a name="input_cluster_controller_values"></a> [cluster\_controller\_values](#input\_cluster\_controller\_values) | List of YAML formatted string values for cluster-controller helm chart | `list(string)` | `[]` | no |
 | <a name="input_cluster_controller_version"></a> [cluster\_controller\_version](#input\_cluster\_controller\_version) | Version of castai-cluster-controller helm chart. If not provided, latest version will be used. | `string` | `null` | no |
 | <a name="input_default_node_configuration"></a> [default\_node\_configuration](#input\_default\_node\_configuration) | ID of the default node configuration | `string` | n/a | yes |
 | <a name="input_delete_nodes_on_disconnect"></a> [delete\_nodes\_on\_disconnect](#input\_delete\_nodes\_on\_disconnect) | Optionally delete Cast AI created nodes when the cluster is destroyed | `bool` | `false` | no |
+| <a name="input_evictor_ext_values"></a> [evictor\_ext\_values](#input\_evictor\_ext\_values) | List of YAML formatted string with evictor-ext values | `list(string)` | `[]` | no |
+| <a name="input_evictor_ext_version"></a> [evictor\_ext\_version](#input\_evictor\_ext\_version) | Version of castai-evictor-ext chart. Default latest | `string` | `null` | no |
 | <a name="input_evictor_values"></a> [evictor\_values](#input\_evictor\_values) | List of YAML formatted string values for evictor helm chart | `list(string)` | `[]` | no |
 | <a name="input_evictor_version"></a> [evictor\_version](#input\_evictor\_version) | Version of castai-evictor chart. If not provided, latest version will be used. | `string` | `null` | no |
 | <a name="input_grpc_url"></a> [grpc\_url](#input\_grpc\_url) | gRPC endpoint used by pod-pinner | `string` | `"grpc.cast.ai:443"` | no |
@@ -234,7 +371,9 @@ No modules.
 | <a name="input_node_configurations"></a> [node\_configurations](#input\_node\_configurations) | Map of AKS node configurations to create | `any` | `{}` | no |
 | <a name="input_node_resource_group"></a> [node\_resource\_group](#input\_node\_resource\_group) | n/a | `string` | n/a | yes |
 | <a name="input_node_templates"></a> [node\_templates](#input\_node\_templates) | Map of node templates to create | `any` | `{}` | no |
+| <a name="input_pod_pinner_version"></a> [pod\_pinner\_version](#input\_pod\_pinner\_version) | Version of pod-pinner helm chart. Default latest | `string` | `null` | no |
 | <a name="input_resource_group"></a> [resource\_group](#input\_resource\_group) | n/a | `string` | n/a | yes |
+| <a name="input_self_managed"></a> [self\_managed](#input\_self\_managed) | Whether upgrades of CAST AI components are managed by the customer; by default upgrades are managed by the CAST AI central system. | `bool` | `false` | no |
 | <a name="input_spot_handler_values"></a> [spot\_handler\_values](#input\_spot\_handler\_values) | List of YAML formatted string values for spot-handler helm chart | `list(string)` | `[]` | no |
 | <a name="input_spot_handler_version"></a> [spot\_handler\_version](#input\_spot\_handler\_version) | Version of castai-spot-handler helm chart. If not provided, latest version will be used. | `string` | `null` | no |
 | <a name="input_subscription_id"></a> [subscription\_id](#input\_subscription\_id) | Azure subscription ID | `string` | n/a | yes |

diff --git a/main.tf b/main.tf
@@ -730,8 +730,140 @@ resource "helm_release" "castai_kvisor_self_managed" {
 }
 
 resource "castai_autoscaler" "castai_autoscaler_policies" {
-  autoscaler_policies_json = var.autoscaler_policies_json
   cluster_id               = castai_aks_cluster.castai_cluster.id
 
+  // Deprecated -- kept for backward compatibility; use autoscaler_settings instead
+  autoscaler_policies_json = var.autoscaler_policies_json
+
+  dynamic "autoscaler_settings" {
+    for_each = var.autoscaler_settings != null ? [var.autoscaler_settings] : []
+
+    content {
+      enabled                                 = try(autoscaler_settings.value.enabled, null)
+      is_scoped_mode                          = try(autoscaler_settings.value.is_scoped_mode, null)
+      node_templates_partial_matching_enabled = try(autoscaler_settings.value.node_templates_partial_matching_enabled, null)
+
+      dynamic "unschedulable_pods" {
+        for_each = try([autoscaler_settings.value.unschedulable_pods], [])
+
+        content {
+          enabled                  = try(unschedulable_pods.value.enabled, null)
+          custom_instances_enabled = try(unschedulable_pods.value.custom_instances_enabled, null)
+
+          dynamic "headroom" {
+            for_each = try([unschedulable_pods.value.headroom], [])
+
+            content {
+              enabled           = try(headroom.value.enabled, null)
+              cpu_percentage    = try(headroom.value.cpu_percentage, null)
+              memory_percentage = try(headroom.value.memory_percentage, null)
+            }
+          }
+
+          dynamic "headroom_spot" {
+            for_each = try([unschedulable_pods.value.headroom_spot], [])
+
+            content {
+              enabled           = try(headroom_spot.value.enabled, null)
+              cpu_percentage    = try(headroom_spot.value.cpu_percentage, null)
+              memory_percentage = try(headroom_spot.value.memory_percentage, null)
+            }
+          }
+
+          dynamic "node_constraints" {
+            for_each = try([unschedulable_pods.value.node_constraints], [])
+
+            content {
+              enabled       = try(node_constraints.value.enabled, null)
+              min_cpu_cores = try(node_constraints.value.min_cpu_cores, null)
+              max_cpu_cores = try(node_constraints.value.max_cpu_cores, null)
+              min_ram_mib   = try(node_constraints.value.min_ram_mib, null)
+              max_ram_mib   = try(node_constraints.value.max_ram_mib, null)
+            }
+          }
+        }
+      }
+
+      dynamic "cluster_limits" {
+        for_each = try([autoscaler_settings.value.cluster_limits], [])
+
+        content {
+          enabled = try(cluster_limits.value.enabled, null)
+
+
+          dynamic "cpu" {
+            for_each = try([cluster_limits.value.cpu], [])
+
+            content {
+              min_cores = try(cpu.value.min_cores, null)
+              max_cores = try(cpu.value.max_cores, null)
+            }
+          }
+        }
+      }
+
+      dynamic "spot_instances" {
+        for_each = try([autoscaler_settings.value.spot_instances], [])
+
+        content {
+          enabled                             = try(spot_instances.value.enabled, null)
+          max_reclaim_rate                    = try(spot_instances.value.max_reclaim_rate, null)
+          spot_diversity_enabled              = try(spot_instances.value.spot_diversity_enabled, null)
+          spot_diversity_price_increase_limit = try(spot_instances.value.spot_diversity_price_increase_limit, null)
+
+          dynamic "spot_backups" {
+            for_each = try([spot_instances.value.spot_backups], [])
+
+            content {
+              enabled                          = try(spot_backups.value.enabled, null)
+              spot_backup_restore_rate_seconds = try(spot_backups.value.spot_backup_restore_rate_seconds, null)
+            }
+          }
+
+          dynamic "spot_interruption_predictions" {
+            for_each = try([spot_instances.value.spot_interruption_predictions], [])
+
+            content {
+              enabled                            = try(spot_interruption_predictions.value.enabled, null)
+              spot_interruption_predictions_type = try(spot_interruption_predictions.value.spot_interruption_predictions_type, null)
+            }
+          }
+        }
+      }
+
+      dynamic "node_downscaler" {
+        for_each = try([autoscaler_settings.value.node_downscaler], [])
+
+        content {
+          enabled = try(node_downscaler.value.enabled, null)
+
+          dynamic "empty_nodes" {
+            for_each = try([node_downscaler.value.empty_nodes], [])
+
+            content {
+              enabled       = try(empty_nodes.value.enabled, null)
+              delay_seconds = try(empty_nodes.value.delay_seconds, null)
+            }
+          }
+
+          dynamic "evictor" {
+            for_each = try([node_downscaler.value.evictor], [])
+
+            content {
+              enabled                                = try(evictor.value.enabled, null)
+              dry_run                                = try(evictor.value.dry_run, null)
+              aggressive_mode                        = try(evictor.value.aggressive_mode, null)
+              scoped_mode                            = try(evictor.value.scoped_mode, null)
+              cycle_interval                         = try(evictor.value.cycle_interval, null)
+              node_grace_period_minutes              = try(evictor.value.node_grace_period_minutes, null)
+              pod_eviction_failure_back_off_interval = try(evictor.value.pod_eviction_failure_back_off_interval, null)
+              ignore_pod_disruption_budgets          = try(evictor.value.ignore_pod_disruption_budgets, null)
+            }
+          }
+        }
+      }
+    }
+  }
+
   depends_on = [helm_release.castai_agent, helm_release.castai_evictor]
 }
diff --git a/variables.tf b/variables.tf
@@ -40,7 +40,13 @@ variable "subscription_id" {
 
 variable "autoscaler_policies_json" {
   type        = string
-  description = "Optional json object to override CAST AI cluster autoscaler policies"
+  description = "Optional json object to override CAST AI cluster autoscaler policies. Deprecated, use `autoscaler_settings` instead."
+  default     = null
+}
+
+variable "autoscaler_settings" {
+  type        = any
+  description = "Optional Autoscaler policy definitions to override current autoscaler settings"
   default     = null
 }
 

diff --git a/versions.tf b/versions.tf
@@ -12,7 +12,7 @@ terraform {
     }
     castai = {
       source  = "castai/castai"
-      version = "~> 7.0.0"
+      version = "~> 7.4.0"
     }
     helm = {
       source  = "hashicorp/helm"