From 152bc83a9439fa9cbe1c70d729b33174830afdac Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Mon, 29 May 2023 08:10:05 +0200 Subject: [PATCH 1/9] 2i2c-aws-us, researchdelight: fix gpu option config --- config/clusters/2i2c-aws-us/common.values.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/clusters/2i2c-aws-us/common.values.yaml b/config/clusters/2i2c-aws-us/common.values.yaml index 0519a6e277..3c1bdf6223 100644 --- a/config/clusters/2i2c-aws-us/common.values.yaml +++ b/config/clusters/2i2c-aws-us/common.values.yaml @@ -186,17 +186,17 @@ basehub: display_name: Pangeo Tensorflow ML Notebook slug: "tensorflow" kubespawner_override: - node.kubernetes.io/instance-type: g4dn.xlarge image: "pangeo/ml-notebook:b9584f6" pytorch: display_name: Pangeo PyTorch ML Notebook default: true slug: "pytorch" kubespawner_override: - node.kubernetes.io/instance-type: g4dn.xlarge image: "pangeo/pytorch-notebook:b9584f6" kubespawner_override: mem_limit: null mem_guarantee: 14G + node_selector: + node.kubernetes.io/instance-type: g4dn.xlarge extra_resource_limits: nvidia.com/gpu: "1" From 7bd16d283bd8d5fec227f938ebb5bb16e82da9d8 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Mon, 29 May 2023 08:12:39 +0200 Subject: [PATCH 2/9] 2i2c-aws-us, researchdelight: remove gpu image choices from common values --- config/clusters/2i2c-aws-us/common.values.yaml | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/config/clusters/2i2c-aws-us/common.values.yaml b/config/clusters/2i2c-aws-us/common.values.yaml index 3c1bdf6223..6a16cbc9c4 100644 --- a/config/clusters/2i2c-aws-us/common.values.yaml +++ b/config/clusters/2i2c-aws-us/common.values.yaml @@ -178,21 +178,6 @@ basehub: - display_name: NVIDIA Tesla T4, ~16 GB, ~4 CPUs slug: gpu description: "Start a container on a dedicated node with a GPU" - profile_options: - image: - display_name: Image - choices: - tensorflow: - display_name: Pangeo Tensorflow ML Notebook - slug: "tensorflow" - 
kubespawner_override: - image: "pangeo/ml-notebook:b9584f6" - pytorch: - display_name: Pangeo PyTorch ML Notebook - default: true - slug: "pytorch" - kubespawner_override: - image: "pangeo/pytorch-notebook:b9584f6" kubespawner_override: mem_limit: null mem_guarantee: 14G From 6ce9ffced01ff1b66b35a2551a81480cc4673ad0 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Mon, 29 May 2023 08:27:23 +0200 Subject: [PATCH 3/9] 2i2c-aws-us: put gpu related env vars in common values --- config/clusters/2i2c-aws-us/common.values.yaml | 4 ++++ config/clusters/2i2c-aws-us/researchdelight.values.yaml | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/config/clusters/2i2c-aws-us/common.values.yaml b/config/clusters/2i2c-aws-us/common.values.yaml index 6a16cbc9c4..30d7af29a2 100644 --- a/config/clusters/2i2c-aws-us/common.values.yaml +++ b/config/clusters/2i2c-aws-us/common.values.yaml @@ -4,6 +4,10 @@ basehub: userScheduler: enabled: true singleuser: + extraEnv: + # Temporarily set for *all* pods, including pods without any GPUs, + # to work around https://github.com/2i2c-org/infrastructure/issues/1530 + NVIDIA_DRIVER_CAPABILITIES: compute,utility profileList: # NOTE: About node sharing # diff --git a/config/clusters/2i2c-aws-us/researchdelight.values.yaml b/config/clusters/2i2c-aws-us/researchdelight.values.yaml index cc5689533f..7d13d88900 100644 --- a/config/clusters/2i2c-aws-us/researchdelight.values.yaml +++ b/config/clusters/2i2c-aws-us/researchdelight.values.yaml @@ -46,10 +46,6 @@ basehub: image: name: quay.io/2i2c/researchdelight-image tag: "872f0c4578af" - extraEnv: - # Temporarily set for *all* pods, including pods without any GPUs, - # to work around https://github.com/2i2c-org/infrastructure/issues/1530 - NVIDIA_DRIVER_CAPABILITIES: compute,utility hub: config: JupyterHub: From 85eff48101a19d429e74452a00ab0b38c726b17f Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Mon, 29 May 2023 08:31:19 +0200 Subject: [PATCH 4/9] 2i2c-aws-us, ncar-cisl: add 
shared cluster daskhub --- config/clusters/2i2c-aws-us/cluster.yaml | 8 +++ .../enc-ncar-cisl.secret.values.yaml | 21 ++++++ .../2i2c-aws-us/ncar-cisl.values.yaml | 65 +++++++++++++++++++ eksctl/2i2c-aws-us.jsonnet | 2 +- terraform/aws/projects/2i2c-aws-us.tfvars | 8 +++ 5 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 config/clusters/2i2c-aws-us/enc-ncar-cisl.secret.values.yaml create mode 100644 config/clusters/2i2c-aws-us/ncar-cisl.values.yaml diff --git a/config/clusters/2i2c-aws-us/cluster.yaml b/config/clusters/2i2c-aws-us/cluster.yaml index b1d4c57ae3..62f921394c 100644 --- a/config/clusters/2i2c-aws-us/cluster.yaml +++ b/config/clusters/2i2c-aws-us/cluster.yaml @@ -33,3 +33,11 @@ hubs: - common.values.yaml - researchdelight.values.yaml - enc-researchdelight.secret.values.yaml + - name: ncar-cisl + display_name: "NCAR-CISL for UCAR" + domain: ncar-cisl.2i2c.cloud + helm_chart: daskhub + helm_chart_values_files: + - common.values.yaml + - ncar-cisl.values.yaml + - enc-ncar-cisl.secret.values.yaml diff --git a/config/clusters/2i2c-aws-us/enc-ncar-cisl.secret.values.yaml b/config/clusters/2i2c-aws-us/enc-ncar-cisl.secret.values.yaml new file mode 100644 index 0000000000..2b80143bad --- /dev/null +++ b/config/clusters/2i2c-aws-us/enc-ncar-cisl.secret.values.yaml @@ -0,0 +1,21 @@ +basehub: + jupyterhub: + hub: + config: + GitHubOAuthenticator: + client_id: ENC[AES256_GCM,data:w8TiVOOw38P4ZzVLlDjMwSg/31k=,iv:uWPJZpGMcOmOtsRtj+/P7NdnKxQt3fekJE5VrTx7cuY=,tag:Gh/vNCb74pE25FMyVSeJ5A==,type:str] + client_secret: ENC[AES256_GCM,data:cJGGF0B84gcdsyG5dxG4l5EelVdC95cXie6j4z4wdtLLKa7McjXETA==,iv:saFdxwLsn+MJy9jWUiuDVIq8prVAj2rLIn8cBcai5I4=,tag:gUtfeUBTrD61B+oT1B/lVw==,type:str] +sops: + kms: [] + gcp_kms: + - resource_id: projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs + created_at: "2023-05-29T06:00:51Z" + enc: 
CiUA4OM7eEzeTOTJHKWQmB1JmagZWrA4RFy2NgQrh0IRUhvfBqpgEkkAyiwFHHJQTkDoCjoAG9xqZImH9v4mc13lIh8qX4ixg5KHBC7RFL3jY7VglROpS+3lAYvxS+VtscuysmPKg72FVDD1EaGU59Do + azure_kv: [] + hc_vault: [] + age: [] + lastmodified: "2023-05-29T06:00:51Z" + mac: ENC[AES256_GCM,data:iAGbHj/lnENLexJVp7zWIXiYzReY0CWmU/OI1c/iz7Q7d3G+aYpNb1nP872mrSRMb6Ssojor2AqOWpVnBxUWzV29NAFT1hr2mf6d5LEnrB2g4Mg0ZqQhTw+gKi8niIR88jEd0/scuu+54CR874HJ3JBCePtY8Q31PXcreQtTrU8=,iv:TeEYZHZxTZ7rZwS7tWw8laramN+JJzRvxuJFH/xkXTM=,tag:7hVl2kLj7dnXZS/Pyc+5tg==,type:str] + pgp: [] + unencrypted_suffix: _unencrypted + version: 3.7.2 diff --git a/config/clusters/2i2c-aws-us/ncar-cisl.values.yaml b/config/clusters/2i2c-aws-us/ncar-cisl.values.yaml new file mode 100644 index 0000000000..cf82b0b12b --- /dev/null +++ b/config/clusters/2i2c-aws-us/ncar-cisl.values.yaml @@ -0,0 +1,65 @@ +basehub: + userServiceAccount: + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::790657130469:role/2i2c-aws-us-ncar-cisl + nfs: + pv: + # from https://docs.aws.amazon.com/efs/latest/ug/mounting-fs-nfs-mount-settings.html + mountOptions: + - rsize=1048576 + - wsize=1048576 + - timeo=600 + - soft # We pick soft over hard, so NFS lockups don't lead to hung processes + - retrans=2 + - noresvport + serverIP: fs-0b70db2b65209a77d.efs.us-west-2.amazonaws.com + baseShareName: / + jupyterhub: + ingress: + hosts: [ncar-cisl.2i2c.cloud] + tls: + - hosts: [ncar-cisl.2i2c.cloud] + secretName: https-auto-tls + custom: + 2i2c: + add_staff_user_ids_to_admin_users: true + add_staff_user_ids_of_type: "github" + homepage: + templateVars: + org: + name: NCAR-CISL for UCAR + url: https://www2.cisl.ucar.edu/ + logo_url: https://www.vmcdn.ca/f/files/longmontleader/import/2017_06_ncar_highres_transparent.png + designed_by: + name: 2i2c + url: https://2i2c.org + operated_by: + name: 2i2c + url: https://2i2c.org + funded_by: + name: NCAR-CISL for UCAR + url: https://www2.cisl.ucar.edu/ + hub: + config: + JupyterHub: + authenticator_class: github + 
GitHubOAuthenticator: + oauth_callback_url: https://ncar-cisl.2i2c.cloud/hub/oauth_callback + allowed_organizations: + - 2i2c-org + - NCAR:2i2c-cloud-users + scope: + - read:org + Authenticator: + admin_users: + - kcote-ncar # Ken Cote, Initial administrator + - NicholasCote # Nicholas Cote, Initial administrator + - nwehrheim # Nick Wehrheim, Community representative + singleuser: + image: + # image choice preliminary and is expected to be setup via + # https://ncar-cisl.2i2c.cloud/services/configurator/ by the community + # + # pangeo/pangeo-notebook is maintained at: https://github.com/pangeo-data/pangeo-docker-images + name: pangeo/pangeo-notebook + tag: "2023.05.18" diff --git a/eksctl/2i2c-aws-us.jsonnet b/eksctl/2i2c-aws-us.jsonnet index 3ea843262b..e5f21122fc 100644 --- a/eksctl/2i2c-aws-us.jsonnet +++ b/eksctl/2i2c-aws-us.jsonnet @@ -142,4 +142,4 @@ local daskNodes = [ } + n for n in daskNodes ] else [] ) -} \ No newline at end of file +} diff --git a/terraform/aws/projects/2i2c-aws-us.tfvars b/terraform/aws/projects/2i2c-aws-us.tfvars index c0e82b4902..d0c1243070 100644 --- a/terraform/aws/projects/2i2c-aws-us.tfvars +++ b/terraform/aws/projects/2i2c-aws-us.tfvars @@ -14,6 +14,9 @@ user_buckets = { "scratch-researchdelight": { "delete_after": 7 }, + "scratch-ncar-cisl": { + "delete_after": 7 + }, } @@ -33,4 +36,9 @@ hub_cloud_permissions = { bucket_admin_access: ["scratch-researchdelight"], extra_iam_policy: "" }, + "ncar-cisl" : { + requestor_pays: true, + bucket_admin_access: ["scratch-ncar-cisl"], + extra_iam_policy: "" + }, } From 367a0b2dcb1669e97049c5f5422edf2e362e0086 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Mon, 29 May 2023 08:48:00 +0200 Subject: [PATCH 5/9] docs: fix node_selector for gpu setup example --- docs/howto/features/gpu.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/howto/features/gpu.md b/docs/howto/features/gpu.md index 20cfc7c70f..84d38b26fe 100644 --- a/docs/howto/features/gpu.md +++ 
b/docs/howto/features/gpu.md @@ -101,18 +101,18 @@ jupyterhub: display_name: Pangeo Tensorflow ML Notebook slug: "tensorflow" kubespawner_override: - node.kubernetes.io/instance-type: g4dn.xlarge image: "pangeo/ml-notebook:" pytorch: display_name: Pangeo PyTorch ML Notebook default: true slug: "pytorch" kubespawner_override: - node.kubernetes.io/instance-type: g4dn.xlarge image: "pangeo/pytorch-notebook:" kubespawner_override: mem_limit: null mem_guarantee: 14G + node_selector: + node.kubernetes.io/instance-type: g4dn.xlarge extra_resource_limits: nvidia.com/gpu: "1" ``` From 8636626ea5325bceac2d49986af8c833ec0fdcd1 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Mon, 29 May 2023 08:49:02 +0200 Subject: [PATCH 6/9] docs: avoid ambiguous reference to Large in GPU setup example --- docs/howto/features/gpu.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/howto/features/gpu.md b/docs/howto/features/gpu.md index 84d38b26fe..77385d5c00 100644 --- a/docs/howto/features/gpu.md +++ b/docs/howto/features/gpu.md @@ -91,8 +91,8 @@ jupyterhub: # to work around https://github.com/2i2c-org/infrastructure/issues/1530 NVIDIA_DRIVER_CAPABILITIES: compute,utility profileList: - - display_name: Large + GPU - description: 14GB RAM, 4 CPUs, T4 GPU + - display_name: NVIDIA Tesla T4, ~16 GB, ~4 CPUs + description: "Start a container on a dedicated node with a GPU" profile_options: image: display_name: Image From 710bc23d5e63d8d52b06dca988d17454d12b7f54 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Mon, 29 May 2023 10:31:25 +0200 Subject: [PATCH 7/9] 2i2c-aws-us: separate profileList to individual hubs --- .../clusters/2i2c-aws-us/common.values.yaml | 181 ----------------- .../2i2c-aws-us/ncar-cisl.values.yaml | 181 +++++++++++++++++ .../2i2c-aws-us/researchdelight.values.yaml | 189 +++++++++++++++++- 3 files changed, 366 insertions(+), 185 deletions(-) diff --git a/config/clusters/2i2c-aws-us/common.values.yaml 
b/config/clusters/2i2c-aws-us/common.values.yaml index 30d7af29a2..47f7817119 100644 --- a/config/clusters/2i2c-aws-us/common.values.yaml +++ b/config/clusters/2i2c-aws-us/common.values.yaml @@ -8,184 +8,3 @@ basehub: # Temporarily set for *all* pods, including pods without any GPUs, # to work around https://github.com/2i2c-org/infrastructure/issues/1530 NVIDIA_DRIVER_CAPABILITIES: compute,utility - profileList: - # NOTE: About node sharing - # - # CPU/Memory requests/limits are actively considered still. This - # profile list is setup to involve node sharing as considered in - # https://github.com/2i2c-org/infrastructure/issues/2121. - # - # - Memory requests are different from the description, based on: - # whats found to remain allocate in k8s, subtracting 1GiB - # overhead for misc system pods, and transitioning from GB in - # description to GiB in mem_guarantee. - # - CPU requests are lower than the description, with a factor of - # 10%. - # - - display_name: "Small: up to 4 CPU / 32 GB RAM" - description: &profile_list_description "Start a container with at least a chosen share of capacity on a node of this type" - slug: small - default: true - profile_options: - requests: - # NOTE: Node share choices are in active development, see comment - # next to profileList: above. 
- display_name: Node share - choices: - mem_1: - default: true - display_name: ~1 GB, ~0.125 CPU - kubespawner_override: - mem_guarantee: 0.904G - cpu_guarantee: 0.013 - mem_2: - display_name: ~2 GB, ~0.25 CPU - kubespawner_override: - mem_guarantee: 1.809G - cpu_guarantee: 0.025 - mem_4: - display_name: ~4 GB, ~0.5 CPU - kubespawner_override: - mem_guarantee: 3.617G - cpu_guarantee: 0.05 - mem_8: - display_name: ~8 GB, ~1.0 CPU - kubespawner_override: - mem_guarantee: 7.234G - cpu_guarantee: 0.1 - mem_16: - display_name: ~16 GB, ~2.0 CPU - kubespawner_override: - mem_guarantee: 14.469G - cpu_guarantee: 0.2 - mem_32: - display_name: ~32 GB, ~4.0 CPU - kubespawner_override: - mem_guarantee: 28.937G - cpu_guarantee: 0.4 - kubespawner_override: - cpu_limit: null - mem_limit: null - node_selector: - node.kubernetes.io/instance-type: r5.xlarge - - display_name: "Medium: up to 16 CPU / 128 GB RAM" - description: *profile_list_description - slug: medium - profile_options: - requests: - # NOTE: Node share choices are in active development, see comment - # next to profileList: above. 
- display_name: Node share - choices: - mem_1: - display_name: ~1 GB, ~0.125 CPU - kubespawner_override: - mem_guarantee: 0.942G - cpu_guarantee: 0.013 - mem_2: - display_name: ~2 GB, ~0.25 CPU - kubespawner_override: - mem_guarantee: 1.883G - cpu_guarantee: 0.025 - mem_4: - default: true - display_name: ~4 GB, ~0.5 CPU - kubespawner_override: - mem_guarantee: 3.766G - cpu_guarantee: 0.05 - mem_8: - display_name: ~8 GB, ~1.0 CPU - kubespawner_override: - mem_guarantee: 7.532G - cpu_guarantee: 0.1 - mem_16: - display_name: ~16 GB, ~2.0 CPU - kubespawner_override: - mem_guarantee: 15.064G - cpu_guarantee: 0.2 - mem_32: - display_name: ~32 GB, ~4.0 CPU - kubespawner_override: - mem_guarantee: 30.128G - cpu_guarantee: 0.4 - mem_64: - display_name: ~64 GB, ~8.0 CPU - kubespawner_override: - mem_guarantee: 60.257G - cpu_guarantee: 0.8 - mem_128: - display_name: ~128 GB, ~16.0 CPU - kubespawner_override: - mem_guarantee: 120.513G - cpu_guarantee: 1.6 - kubespawner_override: - cpu_limit: null - mem_limit: null - node_selector: - node.kubernetes.io/instance-type: r5.4xlarge - - display_name: "Large: up to 64 CPU / 512 GB RAM" - description: *profile_list_description - slug: large - profile_options: - requests: - # NOTE: Node share choices are in active development, see comment - # next to profileList: above. 
- display_name: Node share - choices: - mem_4: - display_name: ~4 GB, ~0.5 CPU - kubespawner_override: - mem_guarantee: 3.821G - cpu_guarantee: 0.05 - mem_8: - display_name: ~8 GB, ~1.0 CPU - kubespawner_override: - mem_guarantee: 7.643G - cpu_guarantee: 0.1 - mem_16: - default: true - display_name: ~16 GB, ~2.0 CPU - kubespawner_override: - mem_guarantee: 15.285G - cpu_guarantee: 0.2 - mem_32: - display_name: ~32 GB, ~4.0 CPU - kubespawner_override: - mem_guarantee: 30.571G - cpu_guarantee: 0.4 - mem_64: - display_name: ~64 GB, ~8.0 CPU - kubespawner_override: - mem_guarantee: 61.141G - cpu_guarantee: 0.8 - mem_128: - display_name: ~128 GB, ~16.0 CPU - kubespawner_override: - mem_guarantee: 122.282G - cpu_guarantee: 1.6 - mem_256: - display_name: ~256 GB, ~32.0 CPU - kubespawner_override: - mem_guarantee: 244.565G - cpu_guarantee: 3.2 - mem_512: - display_name: ~512 GB, ~64.0 CPU - kubespawner_override: - mem_guarantee: 489.13G - cpu_guarantee: 6.4 - kubespawner_override: - cpu_limit: null - mem_limit: null - node_selector: - node.kubernetes.io/instance-type: r5.16xlarge - - - display_name: NVIDIA Tesla T4, ~16 GB, ~4 CPUs - slug: gpu - description: "Start a container on a dedicated node with a GPU" - kubespawner_override: - mem_limit: null - mem_guarantee: 14G - node_selector: - node.kubernetes.io/instance-type: g4dn.xlarge - extra_resource_limits: - nvidia.com/gpu: "1" diff --git a/config/clusters/2i2c-aws-us/ncar-cisl.values.yaml b/config/clusters/2i2c-aws-us/ncar-cisl.values.yaml index cf82b0b12b..1ad63998d5 100644 --- a/config/clusters/2i2c-aws-us/ncar-cisl.values.yaml +++ b/config/clusters/2i2c-aws-us/ncar-cisl.values.yaml @@ -63,3 +63,184 @@ basehub: # pangeo/pangeo-notebook is maintained at: https://github.com/pangeo-data/pangeo-docker-images name: pangeo/pangeo-notebook tag: "2023.05.18" + profileList: + # NOTE: About node sharing + # + # CPU/Memory requests/limits are actively considered still. 
This + # profile list is setup to involve node sharing as considered in + # https://github.com/2i2c-org/infrastructure/issues/2121. + # + # - Memory requests are different from the description, based on: + # whats found to remain allocate in k8s, subtracting 1GiB + # overhead for misc system pods, and transitioning from GB in + # description to GiB in mem_guarantee. + # - CPU requests are lower than the description, with a factor of + # 10%. + # + - display_name: "Small: up to 4 CPU / 32 GB RAM" + description: &profile_list_description "Start a container with at least a chosen share of capacity on a node of this type" + slug: small + default: true + profile_options: + requests: + # NOTE: Node share choices are in active development, see comment + # next to profileList: above. + display_name: Node share + choices: + mem_1: + default: true + display_name: ~1 GB, ~0.125 CPU + kubespawner_override: + mem_guarantee: 0.904G + cpu_guarantee: 0.013 + mem_2: + display_name: ~2 GB, ~0.25 CPU + kubespawner_override: + mem_guarantee: 1.809G + cpu_guarantee: 0.025 + mem_4: + display_name: ~4 GB, ~0.5 CPU + kubespawner_override: + mem_guarantee: 3.617G + cpu_guarantee: 0.05 + mem_8: + display_name: ~8 GB, ~1.0 CPU + kubespawner_override: + mem_guarantee: 7.234G + cpu_guarantee: 0.1 + mem_16: + display_name: ~16 GB, ~2.0 CPU + kubespawner_override: + mem_guarantee: 14.469G + cpu_guarantee: 0.2 + mem_32: + display_name: ~32 GB, ~4.0 CPU + kubespawner_override: + mem_guarantee: 28.937G + cpu_guarantee: 0.4 + kubespawner_override: + cpu_limit: null + mem_limit: null + node_selector: + node.kubernetes.io/instance-type: r5.xlarge + - display_name: "Medium: up to 16 CPU / 128 GB RAM" + description: *profile_list_description + slug: medium + profile_options: + requests: + # NOTE: Node share choices are in active development, see comment + # next to profileList: above. 
+ display_name: Node share + choices: + mem_1: + display_name: ~1 GB, ~0.125 CPU + kubespawner_override: + mem_guarantee: 0.942G + cpu_guarantee: 0.013 + mem_2: + display_name: ~2 GB, ~0.25 CPU + kubespawner_override: + mem_guarantee: 1.883G + cpu_guarantee: 0.025 + mem_4: + default: true + display_name: ~4 GB, ~0.5 CPU + kubespawner_override: + mem_guarantee: 3.766G + cpu_guarantee: 0.05 + mem_8: + display_name: ~8 GB, ~1.0 CPU + kubespawner_override: + mem_guarantee: 7.532G + cpu_guarantee: 0.1 + mem_16: + display_name: ~16 GB, ~2.0 CPU + kubespawner_override: + mem_guarantee: 15.064G + cpu_guarantee: 0.2 + mem_32: + display_name: ~32 GB, ~4.0 CPU + kubespawner_override: + mem_guarantee: 30.128G + cpu_guarantee: 0.4 + mem_64: + display_name: ~64 GB, ~8.0 CPU + kubespawner_override: + mem_guarantee: 60.257G + cpu_guarantee: 0.8 + mem_128: + display_name: ~128 GB, ~16.0 CPU + kubespawner_override: + mem_guarantee: 120.513G + cpu_guarantee: 1.6 + kubespawner_override: + cpu_limit: null + mem_limit: null + node_selector: + node.kubernetes.io/instance-type: r5.4xlarge + - display_name: "Large: up to 64 CPU / 512 GB RAM" + description: *profile_list_description + slug: large + profile_options: + requests: + # NOTE: Node share choices are in active development, see comment + # next to profileList: above. 
+ display_name: Node share + choices: + mem_4: + display_name: ~4 GB, ~0.5 CPU + kubespawner_override: + mem_guarantee: 3.821G + cpu_guarantee: 0.05 + mem_8: + display_name: ~8 GB, ~1.0 CPU + kubespawner_override: + mem_guarantee: 7.643G + cpu_guarantee: 0.1 + mem_16: + default: true + display_name: ~16 GB, ~2.0 CPU + kubespawner_override: + mem_guarantee: 15.285G + cpu_guarantee: 0.2 + mem_32: + display_name: ~32 GB, ~4.0 CPU + kubespawner_override: + mem_guarantee: 30.571G + cpu_guarantee: 0.4 + mem_64: + display_name: ~64 GB, ~8.0 CPU + kubespawner_override: + mem_guarantee: 61.141G + cpu_guarantee: 0.8 + mem_128: + display_name: ~128 GB, ~16.0 CPU + kubespawner_override: + mem_guarantee: 122.282G + cpu_guarantee: 1.6 + mem_256: + display_name: ~256 GB, ~32.0 CPU + kubespawner_override: + mem_guarantee: 244.565G + cpu_guarantee: 3.2 + mem_512: + display_name: ~512 GB, ~64.0 CPU + kubespawner_override: + mem_guarantee: 489.13G + cpu_guarantee: 6.4 + kubespawner_override: + cpu_limit: null + mem_limit: null + node_selector: + node.kubernetes.io/instance-type: r5.16xlarge + + - display_name: NVIDIA Tesla T4, ~16 GB, ~4 CPUs + slug: gpu + description: "Start a container on a dedicated node with a GPU" + kubespawner_override: + mem_limit: null + mem_guarantee: 14G + node_selector: + node.kubernetes.io/instance-type: g4dn.xlarge + extra_resource_limits: + nvidia.com/gpu: "1" diff --git a/config/clusters/2i2c-aws-us/researchdelight.values.yaml b/config/clusters/2i2c-aws-us/researchdelight.values.yaml index 7d13d88900..ee910fd804 100644 --- a/config/clusters/2i2c-aws-us/researchdelight.values.yaml +++ b/config/clusters/2i2c-aws-us/researchdelight.values.yaml @@ -42,10 +42,6 @@ basehub: funded_by: name: 2i2c url: https://2i2c.org - singleuser: - image: - name: quay.io/2i2c/researchdelight-image - tag: "872f0c4578af" hub: config: JupyterHub: @@ -58,3 +54,188 @@ basehub: # Only show the option to login with GitHub shown_idps: - http://github.com/login/oauth/authorize + 
singleuser: + image: + name: quay.io/2i2c/researchdelight-image + tag: "872f0c4578af" + profileList: + # NOTE: About node sharing + # + # CPU/Memory requests/limits are actively considered still. This + # profile list is setup to involve node sharing as considered in + # https://github.com/2i2c-org/infrastructure/issues/2121. + # + # - Memory requests are different from the description, based on: + # whats found to remain allocate in k8s, subtracting 1GiB + # overhead for misc system pods, and transitioning from GB in + # description to GiB in mem_guarantee. + # - CPU requests are lower than the description, with a factor of + # 10%. + # + - display_name: "Small: up to 4 CPU / 32 GB RAM" + description: &profile_list_description "Start a container with at least a chosen share of capacity on a node of this type" + slug: small + default: true + profile_options: + requests: + # NOTE: Node share choices are in active development, see comment + # next to profileList: above. + display_name: Node share + choices: + mem_1: + default: true + display_name: ~1 GB, ~0.125 CPU + kubespawner_override: + mem_guarantee: 0.904G + cpu_guarantee: 0.013 + mem_2: + display_name: ~2 GB, ~0.25 CPU + kubespawner_override: + mem_guarantee: 1.809G + cpu_guarantee: 0.025 + mem_4: + display_name: ~4 GB, ~0.5 CPU + kubespawner_override: + mem_guarantee: 3.617G + cpu_guarantee: 0.05 + mem_8: + display_name: ~8 GB, ~1.0 CPU + kubespawner_override: + mem_guarantee: 7.234G + cpu_guarantee: 0.1 + mem_16: + display_name: ~16 GB, ~2.0 CPU + kubespawner_override: + mem_guarantee: 14.469G + cpu_guarantee: 0.2 + mem_32: + display_name: ~32 GB, ~4.0 CPU + kubespawner_override: + mem_guarantee: 28.937G + cpu_guarantee: 0.4 + kubespawner_override: + cpu_limit: null + mem_limit: null + node_selector: + node.kubernetes.io/instance-type: r5.xlarge + - display_name: "Medium: up to 16 CPU / 128 GB RAM" + description: *profile_list_description + slug: medium + profile_options: + requests: + # NOTE: Node share 
choices are in active development, see comment + # next to profileList: above. + display_name: Node share + choices: + mem_1: + display_name: ~1 GB, ~0.125 CPU + kubespawner_override: + mem_guarantee: 0.942G + cpu_guarantee: 0.013 + mem_2: + display_name: ~2 GB, ~0.25 CPU + kubespawner_override: + mem_guarantee: 1.883G + cpu_guarantee: 0.025 + mem_4: + default: true + display_name: ~4 GB, ~0.5 CPU + kubespawner_override: + mem_guarantee: 3.766G + cpu_guarantee: 0.05 + mem_8: + display_name: ~8 GB, ~1.0 CPU + kubespawner_override: + mem_guarantee: 7.532G + cpu_guarantee: 0.1 + mem_16: + display_name: ~16 GB, ~2.0 CPU + kubespawner_override: + mem_guarantee: 15.064G + cpu_guarantee: 0.2 + mem_32: + display_name: ~32 GB, ~4.0 CPU + kubespawner_override: + mem_guarantee: 30.128G + cpu_guarantee: 0.4 + mem_64: + display_name: ~64 GB, ~8.0 CPU + kubespawner_override: + mem_guarantee: 60.257G + cpu_guarantee: 0.8 + mem_128: + display_name: ~128 GB, ~16.0 CPU + kubespawner_override: + mem_guarantee: 120.513G + cpu_guarantee: 1.6 + kubespawner_override: + cpu_limit: null + mem_limit: null + node_selector: + node.kubernetes.io/instance-type: r5.4xlarge + - display_name: "Large: up to 64 CPU / 512 GB RAM" + description: *profile_list_description + slug: large + profile_options: + requests: + # NOTE: Node share choices are in active development, see comment + # next to profileList: above. 
+ display_name: Node share + choices: + mem_4: + display_name: ~4 GB, ~0.5 CPU + kubespawner_override: + mem_guarantee: 3.821G + cpu_guarantee: 0.05 + mem_8: + display_name: ~8 GB, ~1.0 CPU + kubespawner_override: + mem_guarantee: 7.643G + cpu_guarantee: 0.1 + mem_16: + default: true + display_name: ~16 GB, ~2.0 CPU + kubespawner_override: + mem_guarantee: 15.285G + cpu_guarantee: 0.2 + mem_32: + display_name: ~32 GB, ~4.0 CPU + kubespawner_override: + mem_guarantee: 30.571G + cpu_guarantee: 0.4 + mem_64: + display_name: ~64 GB, ~8.0 CPU + kubespawner_override: + mem_guarantee: 61.141G + cpu_guarantee: 0.8 + mem_128: + display_name: ~128 GB, ~16.0 CPU + kubespawner_override: + mem_guarantee: 122.282G + cpu_guarantee: 1.6 + mem_256: + display_name: ~256 GB, ~32.0 CPU + kubespawner_override: + mem_guarantee: 244.565G + cpu_guarantee: 3.2 + mem_512: + display_name: ~512 GB, ~64.0 CPU + kubespawner_override: + mem_guarantee: 489.13G + cpu_guarantee: 6.4 + kubespawner_override: + cpu_limit: null + mem_limit: null + node_selector: + node.kubernetes.io/instance-type: r5.16xlarge + + - display_name: NVIDIA Tesla T4, ~16 GB, ~4 CPUs + slug: gpu + description: "Start a container on a dedicated node with a GPU" + kubespawner_override: + mem_limit: null + mem_guarantee: 14G + node_selector: + node.kubernetes.io/instance-type: g4dn.xlarge + extra_resource_limits: + nvidia.com/gpu: "1" From 4aed868f2116ca1748a1e1bd64afebd47d69ed7d Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Mon, 29 May 2023 10:32:10 +0200 Subject: [PATCH 8/9] 2i2c-aws-us, researchdelight: revert remove gpu image choices --- .../2i2c-aws-us/researchdelight.values.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/config/clusters/2i2c-aws-us/researchdelight.values.yaml b/config/clusters/2i2c-aws-us/researchdelight.values.yaml index ee910fd804..d8a1fbafe2 100644 --- a/config/clusters/2i2c-aws-us/researchdelight.values.yaml +++ b/config/clusters/2i2c-aws-us/researchdelight.values.yaml @@ 
-232,6 +232,21 @@ basehub: - display_name: NVIDIA Tesla T4, ~16 GB, ~4 CPUs slug: gpu description: "Start a container on a dedicated node with a GPU" + profile_options: + image: + display_name: Image + choices: + tensorflow: + display_name: Pangeo Tensorflow ML Notebook + slug: "tensorflow" + kubespawner_override: + image: "pangeo/ml-notebook:b9584f6" + pytorch: + display_name: Pangeo PyTorch ML Notebook + default: true + slug: "pytorch" + kubespawner_override: + image: "pangeo/pytorch-notebook:b9584f6" kubespawner_override: mem_limit: null mem_guarantee: 14G From e35dcd1f584931ef644a31f57264024cdd55499c Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Mon, 29 May 2023 10:33:42 +0200 Subject: [PATCH 9/9] 2i2c-aws-us, ncar-cisl: add tf/pytorch notebooks for gpu node --- config/clusters/2i2c-aws-us/ncar-cisl.values.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/config/clusters/2i2c-aws-us/ncar-cisl.values.yaml b/config/clusters/2i2c-aws-us/ncar-cisl.values.yaml index 1ad63998d5..8fb0ccb402 100644 --- a/config/clusters/2i2c-aws-us/ncar-cisl.values.yaml +++ b/config/clusters/2i2c-aws-us/ncar-cisl.values.yaml @@ -237,6 +237,21 @@ basehub: - display_name: NVIDIA Tesla T4, ~16 GB, ~4 CPUs slug: gpu description: "Start a container on a dedicated node with a GPU" + profile_options: + image: + display_name: Image + choices: + tensorflow: + display_name: Pangeo Tensorflow ML Notebook + slug: "tensorflow" + kubespawner_override: + image: "pangeo/ml-notebook:2023.05.18" + pytorch: + display_name: Pangeo PyTorch ML Notebook + default: true + slug: "pytorch" + kubespawner_override: + image: "pangeo/pytorch-notebook:2023.05.18" kubespawner_override: mem_limit: null mem_guarantee: 14G