Skip to content

Commit

Permalink
feat: add autoscaler_policy_overrides support to module (#66)
Browse files Browse the repository at this point in the history
  • Loading branch information
mikenorgate authored Jun 11, 2024
1 parent 16effbd commit cfda530
Show file tree
Hide file tree
Showing 4 changed files with 288 additions and 11 deletions.
155 changes: 147 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,53 @@ module "castai-aks-cluster" {
}
}
}
autoscaler_settings = {
enabled = true
node_templates_partial_matching_enabled = false
unschedulable_pods = {
enabled = true
headroom = {
enabled = true
cpu_percentage = 10
memory_percentage = 10
}
headroom_spot = {
enabled = true
cpu_percentage = 10
memory_percentage = 10
}
}
node_downscaler = {
enabled = true
empty_nodes = {
enabled = true
}
evictor = {
aggressive_mode = false
cycle_interval = "5m10s"
dry_run = false
enabled = true
node_grace_period_minutes = 10
scoped_mode = false
}
}
cluster_limits = {
enabled = true
cpu = {
max_cores = 20
min_cores = 1
}
}
}
}
```

Expand Down Expand Up @@ -152,6 +199,88 @@ module "castai-aks-cluster" {
}
```

Migrating from 5.0.x to 5.2.x
---------------------------

Version 5.2.x changed:
* Deprecated `autoscaler_policies_json` attribute. Use `autoscaler_settings` instead.

Old configuration:
```hcl
module "castai-aks-cluster" {
autoscaler_policies_json = <<-EOT
{
"enabled": true,
"unschedulablePods": {
"enabled": true
},
"nodeDownscaler": {
"enabled": true,
"emptyNodes": {
"enabled": true
},
"evictor": {
"aggressiveMode": false,
"cycleInterval": "5m10s",
"dryRun": false,
"enabled": true,
"nodeGracePeriodMinutes": 10,
"scopedMode": false
}
},
"nodeTemplatesPartialMatchingEnabled": false,
"clusterLimits": {
"cpu": {
"maxCores": 20,
"minCores": 1
},
"enabled": true
}
}
EOT
}
```

New configuration:
```hcl
module "castai-aks-cluster" {
autoscaler_settings = {
enabled = true
node_templates_partial_matching_enabled = false
unschedulable_pods = {
enabled = true
}
node_downscaler = {
enabled = true
empty_nodes = {
enabled = true
}
evictor = {
aggressive_mode = false
cycle_interval = "5m10s"
dry_run = false
enabled = true
node_grace_period_minutes = 10
scoped_mode = false
}
}
cluster_limits = {
enabled = true
cpu = {
max_cores = 20
min_cores = 1
}
}
}
}
```

# Examples

Usage examples are located in [terraform provider repo](https://github.com/castai/terraform-provider-castai/tree/master/examples/aks)
Expand All @@ -164,18 +293,18 @@ Usage examples are located in [terraform provider repo](https://github.com/casta
| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 0.13 |
| <a name="requirement_azuread"></a> [azuread](#requirement\_azuread) | >= 2.22.0 |
| <a name="requirement_azurerm"></a> [azurerm](#requirement\_azurerm) | >= 3.7.0 |
| <a name="requirement_castai"></a> [castai](#requirement\_castai) | ~> 7.0.0 |
| <a name="requirement_castai"></a> [castai](#requirement\_castai) | ~> 7.4.0 |
| <a name="requirement_helm"></a> [helm](#requirement\_helm) | >= 2.0.0 |

## Providers

| Name | Version |
|------|---------|
| Name | Version |
|------|-----------|
| <a name="provider_azuread"></a> [azuread](#provider\_azuread) | >= 2.22.0 |
| <a name="provider_azurerm"></a> [azurerm](#provider\_azurerm) | >= 3.7.0 |
| <a name="provider_castai"></a> [castai](#provider\_castai) | ~> 7.0.0 |
| <a name="provider_helm"></a> [helm](#provider\_helm) | >= 2.0.0 |
| <a name="provider_null"></a> [null](#provider\_null) | n/a |
| <a name="provider_azurerm"></a> [azurerm](#provider\_azurerm) | >= 3.7.0 |
| <a name="provider_castai"></a> [castai](#provider\_castai) | ~> 7.4.0 |
| <a name="provider_helm"></a> [helm](#provider\_helm) | >= 2.0.0 |
| <a name="provider_null"></a> [null](#provider\_null) | n/a |

## Modules

Expand All @@ -199,9 +328,14 @@ No modules.
| [castai_node_template.this](https://registry.terraform.io/providers/castai/castai/latest/docs/resources/node_template) | resource |
| [helm_release.castai_agent](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
| [helm_release.castai_cluster_controller](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
| [helm_release.castai_cluster_controller_self_managed](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
| [helm_release.castai_evictor](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
| [helm_release.castai_evictor_ext](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
| [helm_release.castai_evictor_self_managed](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
| [helm_release.castai_kvisor](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
| [helm_release.castai_kvisor_self_managed](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
| [helm_release.castai_pod_pinner](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
| [helm_release.castai_pod_pinner_self_managed](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
| [helm_release.castai_spot_handler](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
| [null_resource.wait_for_cluster](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource |
| [azuread_client_config.current](https://registry.terraform.io/providers/hashicorp/azuread/latest/docs/data-sources/client_config) | data source |
Expand All @@ -217,14 +351,17 @@ No modules.
| <a name="input_aks_cluster_region"></a> [aks\_cluster\_region](#input\_aks\_cluster\_region) | Region of the AKS cluster | `string` | n/a | yes |
| <a name="input_api_grpc_addr"></a> [api\_grpc\_addr](#input\_api\_grpc\_addr) | CAST AI GRPC API address | `string` | `"api-grpc.cast.ai:443"` | no |
| <a name="input_api_url"></a> [api\_url](#input\_api\_url) | URL of alternative CAST AI API to be used during development or testing | `string` | `"https://api.cast.ai"` | no |
| <a name="input_autoscaler_policies_json"></a> [autoscaler\_policies\_json](#input\_autoscaler\_policies\_json) | Optional json object to override CAST AI cluster autoscaler policies | `string` | `null` | no |
| <a name="input_autoscaler_policies_json"></a> [autoscaler\_policies\_json](#input\_autoscaler\_policies\_json) | Optional json object to override CAST AI cluster autoscaler policies. Deprecated, use `autoscaler_settings` instead. | `string` | `null` | no |
| <a name="input_autoscaler_settings"></a> [autoscaler\_settings](#input\_autoscaler\_settings) | Optional Autoscaler policy definitions to override current autoscaler settings | `any` | `null` | no |
| <a name="input_castai_api_token"></a> [castai\_api\_token](#input\_castai\_api\_token) | Optional CAST AI API token created in console.cast.ai API Access keys section. Used only when `wait_for_cluster_ready` is set to true | `string` | `""` | no |
| <a name="input_castai_components_labels"></a> [castai\_components\_labels](#input\_castai\_components\_labels) | Optional additional Kubernetes labels for CAST AI pods | `map(any)` | `{}` | no |
| <a name="input_castai_components_sets"></a> [castai\_components\_sets](#input\_castai\_components\_sets) | Optional additional 'set' configurations for helm resources. | `map(string)` | `{}` | no |
| <a name="input_cluster_controller_values"></a> [cluster\_controller\_values](#input\_cluster\_controller\_values) | List of YAML formatted string values for cluster-controller helm chart | `list(string)` | `[]` | no |
| <a name="input_cluster_controller_version"></a> [cluster\_controller\_version](#input\_cluster\_controller\_version) | Version of castai-cluster-controller helm chart. If not provided, latest version will be used. | `string` | `null` | no |
| <a name="input_default_node_configuration"></a> [default\_node\_configuration](#input\_default\_node\_configuration) | ID of the default node configuration | `string` | n/a | yes |
| <a name="input_delete_nodes_on_disconnect"></a> [delete\_nodes\_on\_disconnect](#input\_delete\_nodes\_on\_disconnect) | Optionally delete Cast AI created nodes when the cluster is destroyed | `bool` | `false` | no |
| <a name="input_evictor_ext_values"></a> [evictor\_ext\_values](#input\_evictor\_ext\_values) | List of YAML formatted string with evictor-ext values | `list(string)` | `[]` | no |
| <a name="input_evictor_ext_version"></a> [evictor\_ext\_version](#input\_evictor\_ext\_version) | Version of castai-evictor-ext chart. Default latest | `string` | `null` | no |
| <a name="input_evictor_values"></a> [evictor\_values](#input\_evictor\_values) | List of YAML formatted string values for evictor helm chart | `list(string)` | `[]` | no |
| <a name="input_evictor_version"></a> [evictor\_version](#input\_evictor\_version) | Version of castai-evictor chart. If not provided, latest version will be used. | `string` | `null` | no |
| <a name="input_grpc_url"></a> [grpc\_url](#input\_grpc\_url) | gRPC endpoint used by pod-pinner | `string` | `"grpc.cast.ai:443"` | no |
Expand All @@ -234,7 +371,9 @@ No modules.
| <a name="input_node_configurations"></a> [node\_configurations](#input\_node\_configurations) | Map of AKS node configurations to create | `any` | `{}` | no |
| <a name="input_node_resource_group"></a> [node\_resource\_group](#input\_node\_resource\_group) | n/a | `string` | n/a | yes |
| <a name="input_node_templates"></a> [node\_templates](#input\_node\_templates) | Map of node templates to create | `any` | `{}` | no |
| <a name="input_pod_pinner_version"></a> [pod\_pinner\_version](#input\_pod\_pinner\_version) | Version of pod-pinner helm chart. Default latest | `string` | `null` | no |
| <a name="input_resource_group"></a> [resource\_group](#input\_resource\_group) | n/a | `string` | n/a | yes |
| <a name="input_self_managed"></a> [self\_managed](#input\_self\_managed) | Whether CAST AI components' upgrades are managed by a customer; by default upgrades are managed by CAST AI central system. | `bool` | `false` | no |
| <a name="input_spot_handler_values"></a> [spot\_handler\_values](#input\_spot\_handler\_values) | List of YAML formatted string values for spot-handler helm chart | `list(string)` | `[]` | no |
| <a name="input_spot_handler_version"></a> [spot\_handler\_version](#input\_spot\_handler\_version) | Version of castai-spot-handler helm chart. If not provided, latest version will be used. | `string` | `null` | no |
| <a name="input_subscription_id"></a> [subscription\_id](#input\_subscription\_id) | Azure subscription ID | `string` | n/a | yes |
Expand Down
134 changes: 133 additions & 1 deletion main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -730,8 +730,140 @@ resource "helm_release" "castai_kvisor_self_managed" {
}

// Applies CAST AI autoscaler policies to the cluster identified by
// castai_aks_cluster.castai_cluster.id. Policies can be supplied either as a
// raw JSON string (deprecated) or as the structured `autoscaler_settings`
// object, which is expanded into nested blocks below.
resource "castai_autoscaler" "castai_autoscaler_policies" {
// NOTE(review): in this rendered diff the next line is the removed (pre-change)
// copy of the assignment that reappears below under "Deprecated" -- confirm
// that only one copy exists in the actual main.tf.
autoscaler_policies_json = var.autoscaler_policies_json
cluster_id = castai_aks_cluster.castai_cluster.id

// Deprecated -- kept for backward compatibility
autoscaler_policies_json = var.autoscaler_policies_json

// Emit a single autoscaler_settings block only when var.autoscaler_settings is
// set; a null variable produces no block at all.
dynamic "autoscaler_settings" {
for_each = var.autoscaler_settings != null ? [var.autoscaler_settings] : []

content {
// Every attribute is read through try(..., null), so a key that is absent
// from the input object evaluates to null instead of raising an error.
enabled = try(autoscaler_settings.value.enabled, null)
is_scoped_mode = try(autoscaler_settings.value.is_scoped_mode, null)
node_templates_partial_matching_enabled = try(autoscaler_settings.value.node_templates_partial_matching_enabled, null)

// Optional nested blocks use try([<value>], []): when the key is missing the
// lookup fails and the empty list is used, so no block is generated.
// NOTE(review): a key explicitly set to null would presumably still yield
// [null] and emit a block whose attributes are all null -- confirm this is
// acceptable to the provider.
dynamic "unschedulable_pods" {
for_each = try([autoscaler_settings.value.unschedulable_pods], [])

content {
enabled = try(unschedulable_pods.value.enabled, null)
custom_instances_enabled = try(unschedulable_pods.value.custom_instances_enabled, null)

dynamic "headroom" {
for_each = try([unschedulable_pods.value.headroom], [])

content {
enabled = try(headroom.value.enabled, null)
cpu_percentage = try(headroom.value.cpu_percentage, null)
memory_percentage = try(headroom.value.memory_percentage, null)
}
}

dynamic "headroom_spot" {
for_each = try([unschedulable_pods.value.headroom_spot], [])

content {
enabled = try(headroom_spot.value.enabled, null)
cpu_percentage = try(headroom_spot.value.cpu_percentage, null)
memory_percentage = try(headroom_spot.value.memory_percentage, null)
}
}

dynamic "node_constraints" {
for_each = try([unschedulable_pods.value.node_constraints], [])

content {
enabled = try(node_constraints.value.enabled, null)
min_cpu_cores = try(node_constraints.value.min_cpu_cores, null)
max_cpu_cores = try(node_constraints.value.max_cpu_cores, null)
min_ram_mib = try(node_constraints.value.min_ram_mib, null)
max_ram_mib = try(node_constraints.value.max_ram_mib, null)
}
}
}
}

// Cluster-wide limits; only a nested `cpu` block (min/max cores) is mapped here.
dynamic "cluster_limits" {
for_each = try([autoscaler_settings.value.cluster_limits], [])

content {
enabled = try(cluster_limits.value.enabled, null)


dynamic "cpu" {
for_each = try([cluster_limits.value.cpu], [])

content {
min_cores = try(cpu.value.min_cores, null)
max_cores = try(cpu.value.max_cores, null)
}
}
}
}

// Spot instance policy, with optional spot_backups and
// spot_interruption_predictions sub-blocks.
dynamic "spot_instances" {
for_each = try([autoscaler_settings.value.spot_instances], [])

content {
enabled = try(spot_instances.value.enabled, null)
max_reclaim_rate = try(spot_instances.value.max_reclaim_rate, null)
spot_diversity_enabled = try(spot_instances.value.spot_diversity_enabled, null)
spot_diversity_price_increase_limit = try(spot_instances.value.spot_diversity_price_increase_limit, null)

dynamic "spot_backups" {
for_each = try([spot_instances.value.spot_backups], [])

content {
enabled = try(spot_backups.value.enabled, null)
spot_backup_restore_rate_seconds = try(spot_backups.value.spot_backup_restore_rate_seconds, null)
}
}

dynamic "spot_interruption_predictions" {
for_each = try([spot_instances.value.spot_interruption_predictions], [])

content {
enabled = try(spot_interruption_predictions.value.enabled, null)
spot_interruption_predictions_type = try(spot_interruption_predictions.value.spot_interruption_predictions_type, null)
}
}
}
}

// Node downscaler policy, with optional empty_nodes and evictor sub-blocks.
dynamic "node_downscaler" {
for_each = try([autoscaler_settings.value.node_downscaler], [])

content {
enabled = try(node_downscaler.value.enabled, null)

dynamic "empty_nodes" {
for_each = try([node_downscaler.value.empty_nodes], [])

content {
enabled = try(empty_nodes.value.enabled, null)
delay_seconds = try(empty_nodes.value.delay_seconds, null)
}
}

dynamic "evictor" {
for_each = try([node_downscaler.value.evictor], [])

content {
enabled = try(evictor.value.enabled, null)
dry_run = try(evictor.value.dry_run, null)
aggressive_mode = try(evictor.value.aggressive_mode, null)
scoped_mode = try(evictor.value.scoped_mode, null)
cycle_interval = try(evictor.value.cycle_interval, null)
node_grace_period_minutes = try(evictor.value.node_grace_period_minutes, null)
pod_eviction_failure_back_off_interval = try(evictor.value.pod_eviction_failure_back_off_interval, null)
ignore_pod_disruption_budgets = try(evictor.value.ignore_pod_disruption_budgets, null)
}
}
}
}
}
}

// Apply policies only after the agent and evictor Helm releases exist.
depends_on = [helm_release.castai_agent, helm_release.castai_evictor]
}
8 changes: 7 additions & 1 deletion variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,13 @@ variable "subscription_id" {

// JSON-string form of the autoscaler policies. It is passed unchanged to
// castai_autoscaler.castai_autoscaler_policies in main.tf and is deprecated in
// favour of the structured `autoscaler_settings` variable.
variable "autoscaler_policies_json" {
type = string
// NOTE(review): the two description lines below are the old and new sides of
// the rendered diff -- confirm that only the second one is present in the
// actual variables.tf.
description = "Optional json object to override CAST AI cluster autoscaler policies"
description = "Optional json object to override CAST AI cluster autoscaler policies. Deprecated, use `autoscaler_settings` instead."
default = null
}

// Structured autoscaler policies consumed by the dynamic "autoscaler_settings"
// block in main.tf. Typed `any` so callers can pass an object containing only
// the keys they need; when left null, main.tf generates no autoscaler_settings
// block.
variable "autoscaler_settings" {
type = any
description = "Optional Autoscaler policy definitions to override current autoscaler settings"
default = null
}

Expand Down
2 changes: 1 addition & 1 deletion versions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ terraform {
}
castai = {
source = "castai/castai"
version = "~> 7.0.0"
version = "~> 7.4.0"
}
helm = {
source = "hashicorp/helm"
Expand Down

0 comments on commit cfda530

Please sign in to comment.