diff --git a/terraform/azure/main.tf b/terraform/azure/main.tf
index ce605763ac..36e0b567d7 100644
--- a/terraform/azure/main.tf
+++ b/terraform/azure/main.tf
@@ -37,6 +37,7 @@ terraform {
 provider "azuread" {
   tenant_id = var.tenant_id
 }
+
 provider "azurerm" {
   subscription_id = var.subscription_id
   features {}
@@ -93,24 +94,6 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
     }
   }
 
-  # Core node-pool
-  default_node_pool {
-    # Unfortunately, changing anything about VM type / size recreates *whole cluster
-    name                = "core"
-    vm_size             = var.core_node_vm_size
-    os_disk_size_gb     = 40
-    enable_auto_scaling = true
-    min_count           = 1
-    max_count           = 10
-    vnet_subnet_id      = azurerm_subnet.node_subnet.id
-    node_labels = {
-      "hub.jupyter.org/node-purpose" = "core",
-      "k8s.dask.org/node-purpose"    = "core"
-    }
-
-    orchestrator_version = var.kubernetes_version
-  }
-
   auto_scaler_profile {
     skip_nodes_with_local_storage = true
   }
@@ -120,7 +103,8 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
   }
 
   network_profile {
-    # I don't trust Azure CNI
+    # Azure CNI is the default, but we don't trust it to be reliable, so we've
+    # opted to use kubenet instead
     network_plugin = "kubenet"
     network_policy = "calico"
   }
@@ -133,69 +117,92 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
       client_secret = azuread_service_principal_password.service_principal_password[0].value
     }
   }
-}
 
+  # default_node_pool must be set, and it must be a node pool of system type
+  # that can't scale to zero. Due to that we are forced to use it, and have
+  # decided to use it as our core node pool.
+  #
+  # Most changes to this node pool force a replace operation on the entire
+  # cluster. This can be avoided with v3.47.0+ of this provider by declaring
+  # temporary_name_for_rotation = "coreb".
+  #
+  # ref: https://github.com/hashicorp/terraform-provider-azurerm/pull/20628
+  # ref: https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/kubernetes_cluster#temporary_name_for_rotation.
+  #
+  default_node_pool {
+    name                = var.node_pools["core"][0].name
+    vm_size             = var.node_pools["core"][0].vm_size
+    os_disk_size_gb     = var.node_pools["core"][0].os_disk_size_gb
+    kubelet_disk_type   = var.node_pools["core"][0].kubelet_disk_type
+    enable_auto_scaling = true
+    min_count           = var.node_pools["core"][0].min
+    max_count           = var.node_pools["core"][0].max
+    node_labels = merge({
+      "hub.jupyter.org/node-purpose" = "core",
+      "k8s.dask.org/node-purpose"    = "core"
+    }, var.node_pools["core"][0].labels)
+    node_taints = concat([], var.node_pools["core"][0].taints)
 
-resource "azurerm_kubernetes_cluster_node_pool" "user_pool" {
-  for_each = var.notebook_nodes
+    orchestrator_version = coalesce(var.node_pools["core"][0].kubernetes_version, var.kubernetes_version)
 
-  name                  = "nb${each.key}"
-  kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
-  enable_auto_scaling   = true
-  os_disk_size_gb       = 200
-  vnet_subnet_id        = azurerm_subnet.node_subnet.id
+    vnet_subnet_id = azurerm_subnet.node_subnet.id
+  }
+}
 
-  orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version
-  vm_size              = each.value.vm_size
+resource "azurerm_kubernetes_cluster_node_pool" "user_pool" {
+  for_each = { for i, v in var.node_pools["user"] : v.name => v }
+
+  name                = each.value.name
+  vm_size             = each.value.vm_size
+  os_disk_size_gb     = each.value.os_disk_size_gb
+  kubelet_disk_type   = each.value.kubelet_disk_type
+  enable_auto_scaling = true
+  min_count           = each.value.min
+  max_count           = each.value.max
+
   node_labels = merge({
     "hub.jupyter.org/node-purpose" = "user",
     "k8s.dask.org/node-purpose"    = "scheduler"
-    "hub.jupyter.org/node-size"    = each.value.vm_size
   }, each.value.labels)
-
   node_taints = concat([
     "hub.jupyter.org_dedicated=user:NoSchedule"
   ], each.value.taints)
+
+  orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version
 
-  min_count = each.value.min
-  max_count = each.value.max
+  kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
+  vnet_subnet_id        = azurerm_subnet.node_subnet.id
 }
 
-resource "azurerm_kubernetes_cluster_node_pool" "dask_pool" {
-  # If dask_nodes is set, we use that. If it isn't, we use notebook_nodes.
-  # This lets us set dask_nodes to an empty array to get no dask nodes
-  for_each = var.dask_nodes
-  name     = "dask${each.key}"
-  kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
-  enable_auto_scaling   = true
-  os_disk_size_gb       = 200
-  vnet_subnet_id        = azurerm_subnet.node_subnet.id
+resource "azurerm_kubernetes_cluster_node_pool" "dask_pool" {
 
-  orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version
+  for_each = { for i, v in var.node_pools["dask"] : v.name => v }
 
+  name                = each.value.name
+  vm_size             = each.value.vm_size
+  os_disk_size_gb     = each.value.os_disk_size_gb
+  kubelet_disk_type   = each.value.kubelet_disk_type
+  enable_auto_scaling = true
+  min_count           = each.value.min
+  max_count           = each.value.max
 
-  vm_size = each.value.vm_size
   node_labels = merge({
     "k8s.dask.org/node-purpose" = "worker",
-    "hub.jupyter.org/node-size" = each.value.vm_size
   }, each.value.labels)
-
   node_taints = concat([
     "k8s.dask.org_dedicated=worker:NoSchedule"
   ], each.value.taints)
+
+  orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version
 
-  min_count = each.value.min
-  max_count = each.value.max
+  kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
+  vnet_subnet_id        = azurerm_subnet.node_subnet.id
 }
 
-# AZure container registry
 resource "azurerm_container_registry" "container_registry" {
-  # meh, only alphanumberic chars. No separators. BE CONSISTENT, AZURE
   name                = var.global_container_registry_name
   resource_group_name = azurerm_resource_group.jupyterhub.name
   location            = azurerm_resource_group.jupyterhub.location
@@ -203,6 +210,7 @@ resource "azurerm_container_registry" "container_registry" {
   admin_enabled = true
 }
 
+
 locals {
   registry_creds = {
     "imagePullSecret" = {
diff --git a/terraform/azure/projects/utoronto.tfvars b/terraform/azure/projects/utoronto.tfvars
index eaf3d01c87..6a552efa24 100644
--- a/terraform/azure/projects/utoronto.tfvars
+++ b/terraform/azure/projects/utoronto.tfvars
@@ -1,3 +1,11 @@
+# IMPORTANT: Due to a restrictive network rule from storage.tf, we can't perform
+# "terraform plan" or "terraform apply" without a workaround.
+#
+# One known workaround is to allow your public IP temporarily as
+# discussed in https://github.com/2i2c-org/infrastructure/issues/890#issuecomment-1879072422.
+# This workaround is problematic as it may temporarily allow access
+# to storage by other actors with the same IP.
+#
 tenant_id          = "78aac226-2f03-4b4d-9037-b46d56c55210"
 subscription_id    = "ead3521a-d994-4a44-a68d-b16e35642d5b"
 resourcegroup_name = "2i2c-utoronto-cluster"
@@ -8,21 +16,44 @@ location = "canadacentral"
 storage_size = 8192
 ssh_pub_key  = "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDQJ4h39UYNi1wybxAH+jCFkNK2aqRcuhDkQSMx0Hak5xkbt3KnT3cOwAgUP1Vt/SjhltSTuxpOHxiAKCRnjwRk60SxKhUNzPHih2nkfYTmBBjmLfdepDPSke/E0VWvTDIEXz/L8vW8aI0QGPXnXyqzEDO9+U1buheBlxB0diFAD3vEp2SqBOw+z7UgrGxXPdP+2b3AV+X6sOtd6uSzpV8Qvdh+QAkd4r7h9JrkFvkrUzNFAGMjlTb0Lz7qAlo4ynjEwzVN2I1i7cVDKgsGz9ZG/8yZfXXx+INr9jYtYogNZ63ajKR/dfjNPovydhuz5zQvQyxpokJNsTqt1CiWEUNj georgiana@georgiana"
 
-# FIXME: upgrade to 1.27.7, and then 1.28.3, based on the latest versions
-# available via: az aks get-versions --location westus2 -o table
-#
-kubernetes_version = "1.26.3"
-
-# FIXME: upgrade core_node_vm_size to Standard_E4s_v5
-core_node_vm_size = "Standard_E4s_v3"
-
-notebook_nodes = {
-  "default" : {
-    # NOTE: min-max below was set to 0-86 retroactively to align with
-    # observed state without understanding on why 0-86 was picked.
-    min : 0,
-    max : 86,
-    # FIXME: upgrade user nodes vm_size to Standard_E8s_v5
-    vm_size : "Standard_E8s_v3",
-  }
+# List available versions via: az aks get-versions --location westus2 -o table
+kubernetes_version = "1.28.3"
+
+node_pools = {
+  core : [
+    {
+      name : "core",
+
+      # FIXME: Transition to "Standard_E2s_v5" nodes as they are large enough
+      # for the biggest workload (prometheus-server) and can handle high
+      # availability requirements better.
+      #
+      # We are currently forced to handle three calico-typha pods that
+      # can't schedule on the same node, see https://github.com/2i2c-org/infrastructure/issues/3592#issuecomment-1883269632.
+      #
+      vm_size : "Standard_E4s_v3",
+
+      # core nodes don't need much disk space
+      os_disk_size_gb : 40,
+
+      # FIXME: Stop using persistent disks for the nodes, use the variable default
+      # "Temporary" instead by removing this line.
+      kubelet_disk_type : "OS",
+
+      min : 1,
+      max : 10,
+    },
+  ],
+
+  user : [
+    {
+      name : "usere8sv5",
+      vm_size : "Standard_E8s_v5",
+      os_disk_size_gb : 200,
+      min : 0,
+      max : 100,
+    },
+  ],
+
+  dask : [],
 }
diff --git a/terraform/azure/variables.tf b/terraform/azure/variables.tf
index b9cd943680..85856ea2e3 100644
--- a/terraform/azure/variables.tf
+++ b/terraform/azure/variables.tf
@@ -48,20 +48,6 @@ variable "kubernetes_version" {
 }
 
-variable "core_node_vm_size" {
-  type        = string
-  description = <<-EOT
-    VM Size to use for core nodes
-
-    Core nodes will always be on, and count as 'base cost'
-    for a cluster. We should try to run with as few of them
-    as possible.
-
-    WARNING: CHANGING THIS WILL DESTROY AND RECREATE THE CLUSTER!
-  EOT
-}
-
-
 variable "global_container_registry_name" {
   type        = string
   description = <<-EOT
@@ -92,30 +78,44 @@ variable "ssh_pub_key" {
   EOT
 }
 
-variable "notebook_nodes" {
-  type = map(object({
-    min : number,
-    max : number,
-    vm_size : string,
-    labels : optional(map(string), {}),
-    taints : optional(list(string), []),
-    kubernetes_version : optional(string, "")
-  }))
-  description = "Notebook node pools to create"
-  default     = {}
-}
+variable "node_pools" {
+  type = map(
+    list(
+      object({
+        name : string,
+        vm_size : string,
+        os_disk_size_gb : optional(number, 100),
+        kubelet_disk_type : optional(string, "Temporary"),
+        min : number,
+        max : number,
+        labels : optional(map(string), {}),
+        taints : optional(list(string), []),
+        kubernetes_version : optional(string, ""),
+      })
+    )
+  )
+  description = <<-EOT
+    Node pools to create, listed under the keys 'core', 'user', and 'dask'.
+
+    There should be exactly one core node pool. The core node pool is given
+    special treatment by being listed directly in the cluster resource's
+    'default_node_pool' field.
+  EOT
+
+  validation {
+    condition     = length(var.node_pools["core"]) == 1
+    error_message = "The core node pool is mapped to the cluster resource's `default_node_pool`; due to this we require exactly one core node pool to be specified."
+  }
+
+  validation {
+    condition     = length(setsubtract(keys(var.node_pools), ["core", "user", "dask"])) == 0
+    error_message = "Only three kinds of node pools are supported: 'core', 'user', and 'dask'."
+  }
 
-variable "dask_nodes" {
-  type = map(object({
-    min : number,
-    max : number,
-    vm_size : string,
-    labels : optional(map(string), {}),
-    taints : optional(list(string), []),
-    kubernetes_version : optional(string, "")
-  }))
-  description = "Dask node pools to create"
-  default     = {}
+  validation {
+    condition     = length(setintersection(keys(var.node_pools), ["core", "user", "dask"])) == 3
+    error_message = "All three kinds of node pools ('core', 'user', and 'dask') must be declared, even if they are empty lists of node pools."
+  }
 }
 
 variable "create_service_principal" {
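# ---------------------------------------------------------------------------
# Illustrative sketch only, not part of the patch above: how the for_each
# expressions in the user_pool/dask_pool resources consume the new node_pools
# variable. The local names below (example_*) are hypothetical and exist only
# for this sketch; the values mirror the utoronto.tfvars entry.
# ---------------------------------------------------------------------------
locals {
  # Input shaped like var.node_pools: a map of lists of pool objects.
  example_node_pools = {
    core = [{ name = "core", vm_size = "Standard_E4s_v3", min = 1, max = 10 }]
    user = [{ name = "usere8sv5", vm_size = "Standard_E8s_v5", min = 0, max = 100 }]
    dask = []
  }

  # The same transform used by the node pool resources: the list of pool
  # objects becomes a map keyed by pool name, so each pool gets a stable
  # resource address such as
  # azurerm_kubernetes_cluster_node_pool.user_pool["usere8sv5"].
  example_user_pools = { for i, v in local.example_node_pools["user"] : v.name => v }

  # An empty list ("dask" above) yields an empty map, so no dask_pool
  # resources are created.
  example_dask_pools = { for i, v in local.example_node_pools["dask"] : v.name => v }
}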
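# ---------------------------------------------------------------------------
# Illustrative sketch only, not part of the patch above: the key-set checks
# behind the node_pools validations. setsubtract() rejects unsupported pool
# kinds, while setintersection() requires all three kinds to be present,
# which is why utoronto.tfvars declares `dask : []` instead of omitting the
# key. The local names below are hypothetical.
# ---------------------------------------------------------------------------
locals {
  example_supported_kinds = ["core", "user", "dask"]
  example_declared_keys   = ["core", "user", "dask"] # as declared in utoronto.tfvars

  # Empty result: no unsupported pool kinds were declared.
  example_unknown_kinds = setsubtract(local.example_declared_keys, local.example_supported_kinds)

  # Contains all three kinds; dropping "dask" entirely would leave this with
  # length 2 and fail the corresponding validation.
  example_present_kinds = setintersection(local.example_declared_keys, local.example_supported_kinds)
}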