diff --git a/config/clusters/cloudbank/common.values.yaml b/config/clusters/cloudbank/common.values.yaml index ad9d644c60..12680f56e8 100644 --- a/config/clusters/cloudbank/common.values.yaml +++ b/config/clusters/cloudbank/common.values.yaml @@ -21,5 +21,5 @@ jupyterhub: name: quay.io/2i2c/cloudbank-data8-image tag: d2746e55a4ee nodeSelector: - # Put everything on the most appropriate nodepool for these users - cloud.google.com/gke-nodepool: nb-n2-highmem-4 + # Put everything on the most appropriate instance type for these users + node.kubernetes.io/instance-type: n2-highmem-4 diff --git a/config/clusters/leap/common.values.yaml b/config/clusters/leap/common.values.yaml index 1090165911..84f8fc275d 100644 --- a/config/clusters/leap/common.values.yaml +++ b/config/clusters/leap/common.values.yaml @@ -284,7 +284,8 @@ basehub: environment: NVIDIA_DRIVER_CAPABILITIES: compute,utility node_selector: - cloud.google.com/gke-nodepool: nb-gpu-t4 + node.kubernetes.io/instance-type: n1-standard-8 + cloud.google.com/gke-accelerator: nvidia-tesla-t4 mem_limit: 30G mem_guarantee: 24G extra_resource_limits: diff --git a/terraform/gcp/projects/awi-ciroh.tfvars b/terraform/gcp/projects/awi-ciroh.tfvars index 26bcc7f5eb..ead74af2ce 100644 --- a/terraform/gcp/projects/awi-ciroh.tfvars +++ b/terraform/gcp/projects/awi-ciroh.tfvars @@ -9,9 +9,9 @@ filestore_capacity_gb = 2048 k8s_versions = { min_master_version : "1.29.1-gke.1589018", - core_nodes_version : "1.27.4-gke.900", - notebook_nodes_version : "1.27.4-gke.900", - dask_nodes_version : "1.27.4-gke.900", + core_nodes_version : "1.29.1-gke.1589018", + notebook_nodes_version : "1.29.1-gke.1589018", + dask_nodes_version : "1.29.1-gke.1589018", } user_buckets = { @@ -31,7 +31,7 @@ user_buckets = { # Setup notebook node pools notebook_nodes = { - "n2-highmem-4" : { + "n2-highmem-4-b" : { min : 0, max : 100, machine_type : "n2-highmem-4", diff --git a/terraform/gcp/projects/cloudbank.tfvars b/terraform/gcp/projects/cloudbank.tfvars 
index af442ac19a..4f15bdc345 100644 --- a/terraform/gcp/projects/cloudbank.tfvars +++ b/terraform/gcp/projects/cloudbank.tfvars @@ -10,9 +10,9 @@ k8s_versions = { # upgrading the control plane, there will be ~5 minutes of k8s not being # available making new server launches error etc. min_master_version : "1.29.1-gke.1589018", - core_nodes_version : "1.27.5-gke.200", - notebook_nodes_version : "1.27.5-gke.200", - dask_nodes_version : "1.27.5-gke.200", + core_nodes_version : "1.29.1-gke.1589018", + notebook_nodes_version : "1.29.1-gke.1589018", + dask_nodes_version : "1.29.1-gke.1589018", } core_node_machine_type = "n2-highmem-2" @@ -22,10 +22,17 @@ enable_filestore = true filestore_capacity_gb = 1024 notebook_nodes = { + # FIXME: tainted, to be deleted when empty, replaced by k8s upgraded variant "n2-highmem-4" : { min : 0, max : 100, machine_type : "n2-highmem-4", + node_version : "1.27.5-gke.200", + }, + "n2-highmem-4-b" : { + min : 0, + max : 100, + machine_type : "n2-highmem-4", }, "n2-highmem-16" : { min : 0, @@ -36,7 +43,7 @@ notebook_nodes = { min : 0, max : 100, machine_type : "n2-highmem-64", - } + }, } # Setup a single node pool for dask workers. 
diff --git a/terraform/gcp/projects/leap.tfvars b/terraform/gcp/projects/leap.tfvars index 1dff1b8e81..c9ff66f82a 100644 --- a/terraform/gcp/projects/leap.tfvars +++ b/terraform/gcp/projects/leap.tfvars @@ -6,9 +6,9 @@ core_node_machine_type = "n2-highmem-4" k8s_versions = { min_master_version : "1.29.1-gke.1589018", - core_nodes_version : "1.27.4-gke.900", - notebook_nodes_version : "1.27.4-gke.900", - dask_nodes_version : "1.27.4-gke.900", + core_nodes_version : "1.29.1-gke.1589018", + notebook_nodes_version : "1.29.1-gke.1589018", + dask_nodes_version : "1.29.1-gke.1589018", } # GPUs not available in us-central1-b @@ -79,19 +79,48 @@ notebook_nodes = { max : 100, machine_type : "n2-highmem-4", }, + # FIXME: tainted, to be deleted when empty, replaced by k8s upgraded variant "n2-highmem-16" : { # A minimum of one is configured for LEAP to ensure quick startups at all # time. Cost is not a greater concern than optimizing startup times. min : 1, max : 100, machine_type : "n2-highmem-16", + node_version : "1.27.4-gke.900", + }, + "n2-highmem-16-b" : { + # A minimum of one is configured for LEAP to ensure quick startups at all + # time. Cost is not a greater concern than optimizing startup times. + min : 1, + max : 100, + machine_type : "n2-highmem-16", }, "n2-highmem-64" : { min : 0, max : 100, machine_type : "n2-highmem-64" } + # FIXME: tainted, to be deleted when empty, replaced by k8s upgraded variant "gpu-t4" : { + min : 0, + max : 100, + machine_type : "n1-standard-8", + node_version : "1.27.4-gke.900", + gpu : { + enabled : true, + type : "nvidia-tesla-t4", + count : 1 + }, + zones : [ + # Get GPUs wherever they are available, as sometimes a single + # zone might be out of GPUs. 
+ "us-central1-a", + "us-central1-b", + "us-central1-c", + "us-central1-f" + ] + }, + "gpu-t4-b" : { min : 0, max : 100, machine_type : "n1-standard-8", diff --git a/terraform/gcp/projects/qcl.tfvars b/terraform/gcp/projects/qcl.tfvars index 141a389d23..a80455af28 100644 --- a/terraform/gcp/projects/qcl.tfvars +++ b/terraform/gcp/projects/qcl.tfvars @@ -6,8 +6,8 @@ region = "europe-west1" k8s_versions = { min_master_version : "1.29.1-gke.1589018", - core_nodes_version : "1.27.4-gke.900", - notebook_nodes_version : "1.27.4-gke.900", + core_nodes_version : "1.29.1-gke.1589018", + notebook_nodes_version : "1.29.1-gke.1589018", } core_node_machine_type = "n2-highmem-2" @@ -26,10 +26,17 @@ user_buckets = { } notebook_nodes = { + # FIXME: tainted, to be deleted when empty, replaced by k8s upgraded variant "n2-highmem-4" : { min : 0, max : 100, machine_type : "n2-highmem-4", + node_version : "1.27.4-gke.900", + }, + "n2-highmem-4-b" : { + min : 0, + max : 100, + machine_type : "n2-highmem-4", }, "n2-highmem-16" : { min : 0, @@ -56,11 +63,18 @@ notebook_nodes = { max : 100, machine_type : "n2-highcpu-32", }, + # FIXME: tainted, to be deleted when empty, replaced by k8s upgraded variant "n2-highcpu-96" : { min : 0, max : 100, machine_type : "n2-highcpu-96", - } + node_version : "1.27.4-gke.900", + }, + "n2-highcpu-96-b" : { + min : 0, + max : 100, + machine_type : "n2-highcpu-96", + }, } hub_cloud_permissions = {