Merge pull request #3596 from consideRatio/pr/utor-maint
terraform, azure and utoronto: an upgrade, misc to support it, and misc opportunistic details
consideRatio authored Jan 12, 2024
2 parents 9bb637a + d0ad3a2 commit 479b04b
Showing 3 changed files with 143 additions and 104 deletions.
108 changes: 58 additions & 50 deletions terraform/azure/main.tf
@@ -37,6 +37,7 @@ terraform {
provider "azuread" {
tenant_id = var.tenant_id
}

provider "azurerm" {
subscription_id = var.subscription_id
features {}
@@ -93,24 +94,6 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
}
}

# Core node-pool
default_node_pool {
# Unfortunately, changing anything about VM type / size recreates the *whole* cluster
name = "core"
vm_size = var.core_node_vm_size
os_disk_size_gb = 40
enable_auto_scaling = true
min_count = 1
max_count = 10
vnet_subnet_id = azurerm_subnet.node_subnet.id
node_labels = {
"hub.jupyter.org/node-purpose" = "core",
"k8s.dask.org/node-purpose" = "core"
}

orchestrator_version = var.kubernetes_version
}

auto_scaler_profile {
skip_nodes_with_local_storage = true
}
@@ -120,7 +103,8 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
}

network_profile {
# I don't trust Azure CNI
# Azure CNI is the default, but we don't trust it to be reliable, so we've
# opted to use kubenet instead
network_plugin = "kubenet"
network_policy = "calico"
}
@@ -133,76 +117,100 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" {
client_secret = azuread_service_principal_password.service_principal_password[0].value
}
}
}

# default_node_pool must be set, and it must be a node pool of system type
# that can't scale to zero. Due to that we are forced to use it, and have
# decided to use it as our core node pool.
#
# Most changes to this node pool force a replace operation on the entire
# cluster. This can be avoided with v3.47.0+ of this provider by declaring
# temporary_name_for_rotation = "coreb" (a sketch of this follows below).
#
# ref: https://github.com/hashicorp/terraform-provider-azurerm/pull/20628
# ref: https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/kubernetes_cluster#temporary_name_for_rotation
#
default_node_pool {
name = var.node_pools["core"][0].name
vm_size = var.node_pools["core"][0].vm_size
os_disk_size_gb = var.node_pools["core"][0].os_disk_size_gb
kubelet_disk_type = var.node_pools["core"][0].kubelet_disk_type
enable_auto_scaling = true
min_count = var.node_pools["core"][0].min
max_count = var.node_pools["core"][0].max

node_labels = merge({
"hub.jupyter.org/node-purpose" = "core",
"k8s.dask.org/node-purpose" = "core"
}, var.node_pools["core"][0].labels)
node_taints = concat([], var.node_pools["core"][0].taints)

resource "azurerm_kubernetes_cluster_node_pool" "user_pool" {
for_each = var.notebook_nodes
orchestrator_version = coalesce(var.node_pools["core"][0].kubernetes_version, var.kubernetes_version)

name = "nb${each.key}"
kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
enable_auto_scaling = true
os_disk_size_gb = 200
vnet_subnet_id = azurerm_subnet.node_subnet.id
}
}
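For illustration only (not part of this diff): a minimal sketch of the in-place rotation mentioned in the comment above, assuming azurerm v3.47.0+ and the variables already defined in this file. Only the temporary_name_for_rotation line is new relative to the block above; the remaining attributes are abbreviated placeholders.

# Sketch: when a change would otherwise force recreation, the provider spins up
# a temporary pool named "coreb", recreates "core" with the new settings, and
# then removes "coreb". The cluster itself is not replaced.
default_node_pool {
  name                        = var.node_pools["core"][0].name
  temporary_name_for_rotation = "coreb"
  vm_size                     = var.node_pools["core"][0].vm_size
  enable_auto_scaling         = true
  min_count                   = var.node_pools["core"][0].min
  max_count                   = var.node_pools["core"][0].max
  vnet_subnet_id              = azurerm_subnet.node_subnet.id
}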

orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version

vm_size = each.value.vm_size
resource "azurerm_kubernetes_cluster_node_pool" "user_pool" {
for_each = { for i, v in var.node_pools["user"] : v.name => v }

name = each.value.name
vm_size = each.value.vm_size
os_disk_size_gb = each.value.os_disk_size_gb
kubelet_disk_type = each.value.kubelet_disk_type
enable_auto_scaling = true
min_count = each.value.min
max_count = each.value.max

node_labels = merge({
"hub.jupyter.org/node-purpose" = "user",
"k8s.dask.org/node-purpose" = "scheduler"
"hub.jupyter.org/node-size" = each.value.vm_size
}, each.value.labels)

node_taints = concat([
"hub.jupyter.org_dedicated=user:NoSchedule"
], each.value.taints)

orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version

min_count = each.value.min
max_count = each.value.max
kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
vnet_subnet_id = azurerm_subnet.node_subnet.id
}

resource "azurerm_kubernetes_cluster_node_pool" "dask_pool" {
# If dask_nodes is set, we use that. If it isn't, we use notebook_nodes.
# This lets us set dask_nodes to an empty array to get no dask nodes
for_each = var.dask_nodes

name = "dask${each.key}"
kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
enable_auto_scaling = true
os_disk_size_gb = 200
vnet_subnet_id = azurerm_subnet.node_subnet.id
resource "azurerm_kubernetes_cluster_node_pool" "dask_pool" {
for_each = { for i, v in var.node_pools["dask"] : v.name => v }

orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version
name = each.value.name
vm_size = each.value.vm_size
os_disk_size_gb = each.value.os_disk_size_gb
kubelet_disk_type = each.value.kubelet_disk_type
enable_auto_scaling = true
min_count = each.value.min
max_count = each.value.max

vm_size = each.value.vm_size
node_labels = merge({
"k8s.dask.org/node-purpose" = "worker",
"hub.jupyter.org/node-size" = each.value.vm_size
}, each.value.labels)

node_taints = concat([
"k8s.dask.org_dedicated=worker:NoSchedule"
], each.value.taints)

orchestrator_version = each.value.kubernetes_version == "" ? var.kubernetes_version : each.value.kubernetes_version

min_count = each.value.min
max_count = each.value.max
kubernetes_cluster_id = azurerm_kubernetes_cluster.jupyterhub.id
vnet_subnet_id = azurerm_subnet.node_subnet.id
}

# Azure container registry

resource "azurerm_container_registry" "container_registry" {
# meh, only alphanumeric chars. No separators. BE CONSISTENT, AZURE
name = var.global_container_registry_name
resource_group_name = azurerm_resource_group.jupyterhub.name
location = azurerm_resource_group.jupyterhub.location
sku = "Premium"
admin_enabled = true
}


locals {
registry_creds = {
"imagePullSecret" = {
65 changes: 48 additions & 17 deletions terraform/azure/projects/utoronto.tfvars
@@ -1,3 +1,11 @@
# IMPORTANT: Due to a restrictive network rule from storage.tf, we can't perform
# "terraform plan" or "terraform apply" without a workaround.
#
# One known workaround is to allow your public IP temporarily as
# discussed in https://github.com/2i2c-org/infrastructure/issues/890#issuecomment-1879072422.
# This workaround is problematic, as it may temporarily allow other actors
# behind the same public IP to access the storage.
#
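To make the note above concrete, here is a hypothetical sketch of the kind of rule storage.tf presumably declares; the resource and reference names below are assumptions, since storage.tf is not part of this diff:

# Hypothetical sketch only: the storage account denies traffic by default and
# allows only the cluster's subnet, so "terraform plan"/"apply" run from a
# workstation is blocked until its public IP is allowed.
resource "azurerm_storage_account_network_rules" "homes" {
  storage_account_id         = azurerm_storage_account.homes.id
  default_action             = "Deny"
  virtual_network_subnet_ids = [azurerm_subnet.node_subnet.id]

  # The workaround from the linked issue amounts to temporarily adding your
  # public IP here (or via the Azure portal / CLI) and removing it afterwards.
  # ip_rules = ["203.0.113.4"]
}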
tenant_id = "78aac226-2f03-4b4d-9037-b46d56c55210"
subscription_id = "ead3521a-d994-4a44-a68d-b16e35642d5b"
resourcegroup_name = "2i2c-utoronto-cluster"
@@ -8,21 +16,44 @@ location = "canadacentral"
storage_size = 8192
ssh_pub_key = "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDQJ4h39UYNi1wybxAH+jCFkNK2aqRcuhDkQSMx0Hak5xkbt3KnT3cOwAgUP1Vt/SjhltSTuxpOHxiAKCRnjwRk60SxKhUNzPHih2nkfYTmBBjmLfdepDPSke/E0VWvTDIEXz/L8vW8aI0QGPXnXyqzEDO9+U1buheBlxB0diFAD3vEp2SqBOw+z7UgrGxXPdP+2b3AV+X6sOtd6uSzpV8Qvdh+QAkd4r7h9JrkFvkrUzNFAGMjlTb0Lz7qAlo4ynjEwzVN2I1i7cVDKgsGz9ZG/8yZfXXx+INr9jYtYogNZ63ajKR/dfjNPovydhuz5zQvQyxpokJNsTqt1CiWEUNj georgiana@georgiana"

# FIXME: upgrade to 1.27.7, and then 1.28.3, based on the latest versions
# available via: az aks get-versions --location westus2 -o table
#
kubernetes_version = "1.26.3"

# FIXME: upgrade core_node_vm_size to Standard_E4s_v5
core_node_vm_size = "Standard_E4s_v3"

notebook_nodes = {
"default" : {
# NOTE: min-max below was set to 0-86 retroactively to align with
# observed state without understanding on why 0-86 was picked.
min : 0,
max : 86,
# FIXME: upgrade user nodes vm_size to Standard_E8s_v5
vm_size : "Standard_E8s_v3",
}
# List available versions via: az aks get-versions --location westus2 -o table
kubernetes_version = "1.28.3"

node_pools = {
core : [
{
name : "core",

# FIXME: Transition to "Standard_E2s_v5" nodes as they are large enough
# for the biggest workload (prometheus-server) and can handle high
# availability requirements better.
#
# We are currently forced to handle three calico-typha pods that
# can't schedule on the same node, see https://github.com/2i2c-org/infrastructure/issues/3592#issuecomment-1883269632.
#
vm_size : "Standard_E4s_v3",

# core nodes don't need much disk space
os_disk_size_gb : 40,

# FIXME: Stop using persistent disks for the nodes, use the variable default
# "Temporary" instead by removing this line.
kubelet_disk_type : "OS",

min : 1,
max : 10,
},
],

user : [
{
name : "usere8sv5",
vm_size : "Standard_E8s_v5",
os_disk_size_gb : 200,
min : 0,
max : 100,
},
],

dask : [],
}
74 changes: 37 additions & 37 deletions terraform/azure/variables.tf
@@ -48,20 +48,6 @@ variable "kubernetes_version" {
}


variable "core_node_vm_size" {
type = string
description = <<-EOT
VM Size to use for core nodes
Core nodes will always be on, and count as 'base cost'
for a cluster. We should try to run with as few of them
as possible.
WARNING: CHANGING THIS WILL DESTROY AND RECREATE THE CLUSTER!
EOT
}


variable "global_container_registry_name" {
type = string
description = <<-EOT
@@ -92,30 +78,44 @@ variable "ssh_pub_key" {
EOT
}

variable "notebook_nodes" {
type = map(object({
min : number,
max : number,
vm_size : string,
labels : optional(map(string), {}),
taints : optional(list(string), []),
kubernetes_version : optional(string, "")
}))
description = "Notebook node pools to create"
default = {}
}
variable "node_pools" {
type = map(
list(
object({
name : string,
vm_size : string,
os_disk_size_gb : optional(number, 100),
kubelet_disk_type : optional(string, "Temporary"),
min : number,
max : number,
labels : optional(map(string), {}),
taints : optional(list(string), []),
kubernetes_version : optional(string, ""),
})
)
)
description = <<-EOT
Node pools to create, grouped under the keys 'core', 'user', and 'dask'.
There must be exactly one core node pool. The core node pool is given
special treatment by being listed directly in the cluster resource's
'default_node_pool' field.
EOT

validation {
condition = length(var.node_pools["core"]) == 1
error_message = "The core node pool is mapped to the cluster resource's `default_node_pool`, due to this we require exactly one core node pool to be specified."
}

validation {
condition = length(setsubtract(keys(var.node_pools), ["core", "user", "dask"])) == 0
error_message = "Only three kinds of node pools are supported: 'core', 'user', and 'dask'."
}

variable "dask_nodes" {
type = map(object({
min : number,
max : number,
vm_size : string,
labels : optional(map(string), {}),
taints : optional(list(string), []),
kubernetes_version : optional(string, "")
}))
description = "Dask node pools to create"
default = {}
validation {
condition = length(setintersection(keys(var.node_pools), ["core", "user", "dask"])) == 3
error_message = "All three kinds of node pools ('core', 'user', and 'dask') must be declared, even if some are empty lists."
}
}
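To illustrate the shape this variable and its validations expect, a minimal hypothetical tfvars value could look like the following; all values are placeholders, all three keys must be present, exactly one core pool is allowed, and user/dask may be empty lists:

node_pools = {
  core : [
    {
      # name, vm_size, min and max are required; the other fields fall back to
      # the optional() defaults declared above.
      name : "core",
      vm_size : "Standard_E4s_v3",
      min : 1,
      max : 10,
    },
  ],

  user : [],

  dask : [],
}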

variable "create_service_principal" {
