From 572f4352bec8d8969481c474134d9bfff4ae3714 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Fri, 5 Jan 2024 16:36:18 +0100 Subject: [PATCH 01/11] docs: fix notes about AKS machine types to use --- docs/topic/infrastructure/cluster-design.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/topic/infrastructure/cluster-design.md b/docs/topic/infrastructure/cluster-design.md index 39f77ccc72..089cc63c5f 100644 --- a/docs/topic/infrastructure/cluster-design.md +++ b/docs/topic/infrastructure/cluster-design.md @@ -122,9 +122,9 @@ The three machine types based on the cloud provider are the following: - r5.4xlarge - r5.16xlarge - [AKS](https://learn.microsoft.com/en-us/azure/virtual-machines/eav4-easv4-series) - - Standard_E4a_v4 - - Standard_E16_v4 - - Standard_E64_v4 + - Standard_E4s_v5 + - Standard_E16s_v5 + - Standard_E64s_v5 ## Network Policy From 8563929fb45c6b5ad694898a294ffff39303b4f9 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Fri, 5 Jan 2024 16:37:25 +0100 Subject: [PATCH 02/11] Add docstring to terraform/azure/proxycommand.py --- terraform/azure/proxycommand.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/terraform/azure/proxycommand.py b/terraform/azure/proxycommand.py index 4d2b11ca37..a4cea7175b 100755 --- a/terraform/azure/proxycommand.py +++ b/terraform/azure/proxycommand.py @@ -1,4 +1,11 @@ #!/usr/bin/env python3 +""" +This script can be used to migrate Azure Files storage from one cluster to +another. + +Learn more at https://infrastructure.2i2c.org/hub-deployment-guide/hubs/other-hub-ops/move-hubs/across-clusters/#azure-files. 
+""" + import subprocess import sys import time From 0ee4273ad9d63f4de4add595740fd20de2a6810c Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Fri, 5 Jan 2024 16:38:02 +0100 Subject: [PATCH 03/11] terraform, utoronto: reorganize for readability and add fixme notes --- terraform/azure/projects/utoronto.tfvars | 25 +++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/terraform/azure/projects/utoronto.tfvars b/terraform/azure/projects/utoronto.tfvars index 2e360a0dcd..cd60187005 100644 --- a/terraform/azure/projects/utoronto.tfvars +++ b/terraform/azure/projects/utoronto.tfvars @@ -1,23 +1,26 @@ -tenant_id = "78aac226-2f03-4b4d-9037-b46d56c55210" -subscription_id = "ead3521a-d994-4a44-a68d-b16e35642d5b" -resourcegroup_name = "2i2c-utoronto-cluster" - - -kubernetes_version = "1.26.3" +tenant_id = "78aac226-2f03-4b4d-9037-b46d56c55210" +subscription_id = "ead3521a-d994-4a44-a68d-b16e35642d5b" +resourcegroup_name = "2i2c-utoronto-cluster" +global_container_registry_name = "2i2cutorontohubregistry" +global_storage_account_name = "2i2cutorontohubstorage" +location = "canadacentral" storage_size = 8192 +ssh_pub_key = "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDQJ4h39UYNi1wybxAH+jCFkNK2aqRcuhDkQSMx0Hak5xkbt3KnT3cOwAgUP1Vt/SjhltSTuxpOHxiAKCRnjwRk60SxKhUNzPHih2nkfYTmBBjmLfdepDPSke/E0VWvTDIEXz/L8vW8aI0QGPXnXyqzEDO9+U1buheBlxB0diFAD3vEp2SqBOw+z7UgrGxXPdP+2b3AV+X6sOtd6uSzpV8Qvdh+QAkd4r7h9JrkFvkrUzNFAGMjlTb0Lz7qAlo4ynjEwzVN2I1i7cVDKgsGz9ZG/8yZfXXx+INr9jYtYogNZ63ajKR/dfjNPovydhuz5zQvQyxpokJNsTqt1CiWEUNj georgiana@georgiana" -ssh_pub_key = "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDQJ4h39UYNi1wybxAH+jCFkNK2aqRcuhDkQSMx0Hak5xkbt3KnT3cOwAgUP1Vt/SjhltSTuxpOHxiAKCRnjwRk60SxKhUNzPHih2nkfYTmBBjmLfdepDPSke/E0VWvTDIEXz/L8vW8aI0QGPXnXyqzEDO9+U1buheBlxB0diFAD3vEp2SqBOw+z7UgrGxXPdP+2b3AV+X6sOtd6uSzpV8Qvdh+QAkd4r7h9JrkFvkrUzNFAGMjlTb0Lz7qAlo4ynjEwzVN2I1i7cVDKgsGz9ZG/8yZfXXx+INr9jYtYogNZ63ajKR/dfjNPovydhuz5zQvQyxpokJNsTqt1CiWEUNj georgiana@georgiana" - 
-global_container_registry_name = "2i2cutorontohubregistry" -global_storage_account_name = "2i2cutorontohubstorage" +# FIXME: upgrade to 1.27.7, and then 1.28.3, based on the latest versions +# available via: az aks get-versions --location westus2 -o table +# +kubernetes_version = "1.26.3" -location = "canadacentral" +# FIXME: upgrade core_node_vm_size to Standard_E4s_v5 core_node_vm_size = "Standard_E4s_v3" + notebook_nodes = { "default" : { min : 1, max : 100, + # FIXME: upgrade user nodes vm_size to Standard_E8s_v5 vm_size : "Standard_E8s_v3", } } From f008c7c8cb876426f5374c418303a4b50a529f8f Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Fri, 5 Jan 2024 17:52:18 +0100 Subject: [PATCH 04/11] terraform, utoronto: align files with current state --- terraform/azure/projects/utoronto.tfvars | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/terraform/azure/projects/utoronto.tfvars b/terraform/azure/projects/utoronto.tfvars index cd60187005..dacffb9cd6 100644 --- a/terraform/azure/projects/utoronto.tfvars +++ b/terraform/azure/projects/utoronto.tfvars @@ -18,8 +18,8 @@ core_node_vm_size = "Standard_E4s_v3" notebook_nodes = { "default" : { - min : 1, - max : 100, + min : 0, + max : 86, # FIXME: upgrade user nodes vm_size to Standard_E8s_v5 vm_size : "Standard_E8s_v3", } From dc54258bbe78264a2db3752dd49bdc37e8f473b6 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Fri, 5 Jan 2024 17:53:40 +0100 Subject: [PATCH 05/11] terraform, azure: complete earlier cleanup by hardcoding NFS over SMB --- terraform/azure/storage.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/azure/storage.tf b/terraform/azure/storage.tf index cd2d0eedaf..335fca2ae0 100644 --- a/terraform/azure/storage.tf +++ b/terraform/azure/storage.tf @@ -23,7 +23,7 @@ resource "azurerm_storage_share" "homes" { name = "homes" storage_account_name = azurerm_storage_account.homes.name quota = var.storage_size - enabled_protocol = var.storage_protocol + 
enabled_protocol = "NFS" lifecycle { # Additional safeguard against deleting the share # as this causes irreversible data loss! From 593ed03d8c75ec93a8101025a0e97f4e397d5681 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Fri, 5 Jan 2024 17:55:43 +0100 Subject: [PATCH 06/11] terraform, azure: stop declaring core node pool node count --- terraform/azure/main.tf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/terraform/azure/main.tf b/terraform/azure/main.tf index 8ada0eb218..d21147e71f 100644 --- a/terraform/azure/main.tf +++ b/terraform/azure/main.tf @@ -92,9 +92,8 @@ resource "azurerm_kubernetes_cluster" "jupyterhub" { # Core node-pool default_node_pool { - name = "core" - node_count = 1 # Unfortunately, changing anything about VM type / size recreates *whole cluster + name = "core" vm_size = var.core_node_vm_size os_disk_size_gb = 40 enable_auto_scaling = true From ce6084a8f45e4b20a2dc0f753c75abd204a123d7 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Fri, 5 Jan 2024 17:56:35 +0100 Subject: [PATCH 07/11] terraform, azure: fix capitalization --- terraform/azure/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/azure/main.tf b/terraform/azure/main.tf index d21147e71f..b002bd005f 100644 --- a/terraform/azure/main.tf +++ b/terraform/azure/main.tf @@ -196,7 +196,7 @@ resource "azurerm_container_registry" "container_registry" { name = var.global_container_registry_name resource_group_name = azurerm_resource_group.jupyterhub.name location = azurerm_resource_group.jupyterhub.location - sku = "premium" + sku = "Premium" admin_enabled = true } From 2898b4396a4ce08ed78323a7e8a32e5a941b7560 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Fri, 5 Jan 2024 18:32:29 +0100 Subject: [PATCH 08/11] terraform, azure: bump lower bound on some providers --- terraform/azure/main.tf | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/terraform/azure/main.tf b/terraform/azure/main.tf index 
b002bd005f..ce605763ac 100644 --- a/terraform/azure/main.tf +++ b/terraform/azure/main.tf @@ -8,6 +8,9 @@ terraform { # FIXME: v3 has been released and we are still at v2, see release notes: # https://github.com/hashicorp/terraform-provider-azurerm/releases/tag/v3.0.0 # + # We may need to remove old state and then import it according to + # https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/guides/3.0-upgrade-guide#migrating-to-new--renamed-resources. + # source = "hashicorp/azurerm" version = "~> 2.99" } @@ -15,13 +18,13 @@ terraform { azuread = { # ref: https://registry.terraform.io/providers/hashicorp/azuread/latest source = "hashicorp/azuread" - version = "~> 2.35" + version = "~> 2.47" } kubernetes = { # ref: https://registry.terraform.io/providers/hashicorp/kubernetes/latest source = "hashicorp/kubernetes" - version = "~> 2.18" + version = "~> 2.25" } } From a6436985a7720f82e671f90ad98d6693ad3e7a63 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Fri, 5 Jan 2024 18:32:52 +0100 Subject: [PATCH 09/11] terraform, azure: cleanup unused azure-file k8s namespace --- terraform/azure/storage.tf | 6 ------ 1 file changed, 6 deletions(-) diff --git a/terraform/azure/storage.tf b/terraform/azure/storage.tf index 335fca2ae0..7f6b43cedb 100644 --- a/terraform/azure/storage.tf +++ b/terraform/azure/storage.tf @@ -34,9 +34,3 @@ resource "azurerm_storage_share" "homes" { output "azure_fileshare_url" { value = azurerm_storage_share.homes.url } - -resource "kubernetes_namespace" "homes" { - metadata { - name = "azure-file" - } -} From c8bb78dbf29fea7d017d86119d45000a38d42c31 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Fri, 5 Jan 2024 19:14:06 +0100 Subject: [PATCH 10/11] terraform, utoronto: add comment about min-max nodes --- terraform/azure/projects/utoronto.tfvars | 2 ++ 1 file changed, 2 insertions(+) diff --git a/terraform/azure/projects/utoronto.tfvars b/terraform/azure/projects/utoronto.tfvars index dacffb9cd6..eaf3d01c87 100644 --- 
a/terraform/azure/projects/utoronto.tfvars +++ b/terraform/azure/projects/utoronto.tfvars @@ -18,6 +18,8 @@ core_node_vm_size = "Standard_E4s_v3" notebook_nodes = { "default" : { + # NOTE: min-max below was set to 0-86 retroactively to align with + # observed state without an understanding of why 0-86 was picked. min : 0, max : 86, # FIXME: upgrade user nodes vm_size to Standard_E8s_v5 From d9edf6b4472bd4549c2deed328ef346eb7289bb9 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Fri, 5 Jan 2024 19:16:34 +0100 Subject: [PATCH 11/11] terraform, azure: add note on handling a 403 when using terraform --- terraform/azure/storage.tf | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/terraform/azure/storage.tf b/terraform/azure/storage.tf index 7f6b43cedb..2fc97b7ea9 100644 --- a/terraform/azure/storage.tf +++ b/terraform/azure/storage.tf @@ -12,6 +12,12 @@ resource "azurerm_storage_account" "homes" { network_rules { # Allow NFS access only from our nodes, deny access from all other networks + # + # Use of terraform plan or apply can run into issues due to this, but they + # can be handled by temporarily adding your public IP to a firewall + # exception as described in + # https://github.com/2i2c-org/infrastructure/issues/890#issuecomment-1879072422. + # default_action = "Deny" virtual_network_subnet_ids = [ azurerm_subnet.node_subnet.id