From 487669f0f350558785e6a849173cdbd2e4044ec6 Mon Sep 17 00:00:00 2001
From: Erik Sundell
Date: Sun, 19 Mar 2023 20:18:34 +0100
Subject: [PATCH 1/4] nasa-cryo: k8s 1.22 to 1.25, node sharing

---
 eksctl/nasa-cryo.jsonnet | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/eksctl/nasa-cryo.jsonnet b/eksctl/nasa-cryo.jsonnet
index 74b6df443b..4ea828f38d 100644
--- a/eksctl/nasa-cryo.jsonnet
+++ b/eksctl/nasa-cryo.jsonnet
@@ -25,10 +25,9 @@ local nodeAz = "us-west-2a";
 // A `node.kubernetes.io/instance-type` label is added, so pods
 // can request a particular kind of node with a nodeSelector
 local notebookNodes = [
-    { instanceType: "m5.large" },
-    { instanceType: "m5.xlarge" },
-    { instanceType: "m5.2xlarge" },
-    { instanceType: "m5.8xlarge" },
+    { instanceType: "r5.xlarge" },
+    { instanceType: "r5.4xlarge" },
+    { instanceType: "r5.16xlarge" },
 ];

 local daskNodes = [
@@ -38,10 +37,7 @@ local daskNodes = [
     // *first* item in instanceDistribution.instanceTypes, to match
    // what we do with notebook nodes. Pods can request a particular
     // kind of node with a nodeSelector
-    { instancesDistribution+: { instanceTypes: ["m5.large"] }},
-    { instancesDistribution+: { instanceTypes: ["m5.xlarge"] }},
-    { instancesDistribution+: { instanceTypes: ["m5.2xlarge"] }},
-    { instancesDistribution+: { instanceTypes: ["m5.8xlarge"] }},
+    { instancesDistribution+: { instanceTypes: ["r5.4xlarge"] }},
 ];


@@ -51,7 +47,7 @@ local daskNodes = [
     metadata+: {
         name: "nasa-cryo",
         region: clusterRegion,
-        version: '1.22'
+        version: '1.25'
     },
     availabilityZones: masterAzs,
     iam: {
@@ -83,7 +79,7 @@ local daskNodes = [
         ssh: {
             publicKeyPath: 'ssh-keys/nasa-cryo.key.pub'
         },
-        instanceType: "m5.xlarge",
+        instanceType: "r5.xlarge",
         minSize: 1,
         maxSize: 6,
         labels+: {
@@ -138,6 +134,4 @@ local daskNodes = [
         },
     } + n for n in daskNodes
 ]
-
-
-}
\ No newline at end of file
+}

From 1237f32131f54001657c15e177c71efc4ca1f894 Mon Sep 17 00:00:00 2001
From: Erik Sundell
Date: Sun, 19 Mar 2023 22:44:32 +0100
Subject: [PATCH 2/4] nasa-cryo: profile list for node sharing, small + medium
 for advanced users

---
 config/clusters/nasa-cryo/common.values.yaml | 207 ++++++++++++++++---
 1 file changed, 174 insertions(+), 33 deletions(-)

diff --git a/config/clusters/nasa-cryo/common.values.yaml b/config/clusters/nasa-cryo/common.values.yaml
index 6e33813e16..55dcea5654 100644
--- a/config/clusters/nasa-cryo/common.values.yaml
+++ b/config/clusters/nasa-cryo/common.values.yaml
@@ -95,53 +95,194 @@ basehub:
             subPath: _shared
             readOnly: true
     profileList:
-      # The mem-guarantees are here so k8s doesn't schedule other pods
-      # on these nodes.
-      - display_name: "Small: m5.large"
-        description: "~2 CPU, ~8G RAM"
+      # NOTE: About node sharing
+      #
+      #       CPU/Memory requests/limits are still being actively considered.
+      #       This profile list is set up to involve node sharing as
+      #       considered in https://github.com/2i2c-org/infrastructure/issues/2121.
+      #
+      #       - Memory requests are different from the description, based on
+      #         what's found to remain allocatable in k8s, subtracting 1GiB
+      #         overhead for misc system pods, and transitioning from GB in
+      #         the description to GiB in mem_guarantee.
+      #       - CPU requests are lower than the description, at 10% of the
+      #         described CPU count.
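+      #
+      #       As a worked example (our arithmetic, derived from the values
+      #       below rather than from the linked issue): on the r5.xlarge
+      #       nodes backing the small profile (4 CPU / 32 GiB), the
+      #       full-node share is given mem_guarantee: 28.937G (assuming
+      #       ~29.9G remains allocatable before the 1GiB overhead is
+      #       subtracted), halving with each smaller share, and
+      #       cpu_guarantee: 0.4 = 10% of 4 CPU.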
+      #
+      - display_name: "Small: up to 4 CPU / 32 GB RAM"
+        description: &profile_list_description "Start a container with at least a chosen share of capacity on a node of this type"
+        slug: small
         default: true
         allowed_teams:
           - 2i2c-org:hub-access-for-2i2c-staff
           - CryoInTheCloud:cryoclouduser
           - CryoInTheCloud:cryocloudadvanced
+        profile_options:
+          requests:
+            # NOTE: Node share choices are in active development, see comment
+            #       next to profileList: above.
+            display_name: Node share
+            choices:
+              mem_1:
+                default: true
+                display_name: ~1 GB, ~0.125 CPU
+                kubespawner_override:
+                  mem_guarantee: 0.904G
+                  cpu_guarantee: 0.013
+              mem_2:
+                display_name: ~2 GB, ~0.25 CPU
+                kubespawner_override:
+                  mem_guarantee: 1.809G
+                  cpu_guarantee: 0.025
+              mem_4:
+                display_name: ~4 GB, ~0.5 CPU
+                kubespawner_override:
+                  mem_guarantee: 3.617G
+                  cpu_guarantee: 0.05
+              mem_8:
+                display_name: ~8 GB, ~1.0 CPU
+                kubespawner_override:
+                  mem_guarantee: 7.234G
+                  cpu_guarantee: 0.1
+              mem_16:
+                display_name: ~16 GB, ~2.0 CPU
+                kubespawner_override:
+                  mem_guarantee: 14.469G
+                  cpu_guarantee: 0.2
+              mem_32:
+                display_name: ~32 GB, ~4.0 CPU
+                kubespawner_override:
+                  mem_guarantee: 28.937G
+                  cpu_guarantee: 0.4
         kubespawner_override:
-          # Explicitly unset mem_limit, so it overrides the default memory limit we set in
-          # basehub/values.yaml
+          cpu_limit: null
           mem_limit: null
-          mem_guarantee: 6.5G
         node_selector:
-          node.kubernetes.io/instance-type: m5.large
-      - display_name: "Medium: m5.xlarge"
-        description: "~4 CPU, ~15G RAM"
-        allowed_teams:
-          - 2i2c-org:hub-access-for-2i2c-staff
-          - CryoInTheCloud:cryoclouduser
-          - CryoInTheCloud:cryocloudadvanced
-        kubespawner_override:
-          mem_limit: null
-          mem_guarantee: 12G
-        node_selector:
-          node.kubernetes.io/instance-type: m5.xlarge
-      - display_name: "Large: m5.2xlarge"
-        description: "~8 CPU, ~30G RAM"
-        allowed_teams:
-          - 2i2c-org:hub-access-for-2i2c-staff
-          - CryoInTheCloud:cryocloudadvanced
-        kubespawner_override:
-          mem_limit: null
-          mem_guarantee: 26G
-        node_selector:
-          node.kubernetes.io/instance-type: m5.2xlarge
-      - display_name: "Huge: m5.8xlarge"
-        description: "~32 CPU, ~128G RAM"
+          node.kubernetes.io/instance-type: r5.xlarge
+
+      - display_name: "Medium: up to 16 CPU / 128 GB RAM"
+        description: *profile_list_description
+        slug: medium
         allowed_teams:
           - 2i2c-org:hub-access-for-2i2c-staff
           - CryoInTheCloud:cryocloudadvanced
+        profile_options:
+          requests:
+            # NOTE: Node share choices are in active development, see comment
+            #       next to profileList: above.
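+            # As a sketch of the same arithmetic for this node type
+            # (r5.4xlarge, 16 CPU / 128 GiB): the full-node mem_128 choice
+            # below guarantees 120.513G of memory and 1.6 CPU, i.e. 10% of
+            # 16 CPU, halving with each smaller share.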
+            display_name: Node share
+            choices:
+              mem_1:
+                display_name: ~1 GB, ~0.125 CPU
+                kubespawner_override:
+                  mem_guarantee: 0.942G
+                  cpu_guarantee: 0.013
+              mem_2:
+                display_name: ~2 GB, ~0.25 CPU
+                kubespawner_override:
+                  mem_guarantee: 1.883G
+                  cpu_guarantee: 0.025
+              mem_4:
+                default: true
+                display_name: ~4 GB, ~0.5 CPU
+                kubespawner_override:
+                  mem_guarantee: 3.766G
+                  cpu_guarantee: 0.05
+              mem_8:
+                display_name: ~8 GB, ~1.0 CPU
+                kubespawner_override:
+                  mem_guarantee: 7.532G
+                  cpu_guarantee: 0.1
+              mem_16:
+                display_name: ~16 GB, ~2.0 CPU
+                kubespawner_override:
+                  mem_guarantee: 15.064G
+                  cpu_guarantee: 0.2
+              mem_32:
+                display_name: ~32 GB, ~4.0 CPU
+                kubespawner_override:
+                  mem_guarantee: 30.128G
+                  cpu_guarantee: 0.4
+              mem_64:
+                display_name: ~64 GB, ~8.0 CPU
+                kubespawner_override:
+                  mem_guarantee: 60.257G
+                  cpu_guarantee: 0.8
+              mem_128:
+                display_name: ~128 GB, ~16.0 CPU
+                kubespawner_override:
+                  mem_guarantee: 120.513G
+                  cpu_guarantee: 1.6
         kubespawner_override:
+          cpu_limit: null
           mem_limit: null
-          mem_guarantee: 115G
         node_selector:
-          node.kubernetes.io/instance-type: m5.8xlarge
+          node.kubernetes.io/instance-type: r5.4xlarge
+
+      # NOTE: The large option is added as a comment for now. It may become
+      #       relevant in the future for advanced users running a workshop,
+      #       and then it's easy to enable.
+      #
+      #       This setup was discussed with Tasha Snow in March 2023 at
+      #       https://2i2c.freshdesk.com/a/tickets/543.
+      #
+      # - display_name: "Large: up to 64 CPU / 512 GB RAM"
+      #   description: *profile_list_description
+      #   slug: large
+      #   allowed_teams:
+      #     - 2i2c-org:hub-access-for-2i2c-staff
+      #     - CryoInTheCloud:cryocloudadvanced
+      #   profile_options:
+      #     requests:
+      #       # NOTE: Node share choices are in active development, see comment
+      #       #       next to profileList: above.
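+      #       # If this profile is re-enabled, the same arithmetic would
+      #       # apply to r5.16xlarge (64 CPU / 512 GiB): the full-node
+      #       # mem_512 choice guarantees 489.13G and 6.4 CPU = 10% of 64.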
+ # display_name: Node share + # choices: + # mem_4: + # display_name: ~4 GB, ~0.5 CPU + # kubespawner_override: + # mem_guarantee: 3.821G + # cpu_guarantee: 0.05 + # mem_8: + # display_name: ~8 GB, ~1.0 CPU + # kubespawner_override: + # mem_guarantee: 7.643G + # cpu_guarantee: 0.1 + # mem_16: + # default: true + # display_name: ~16 GB, ~2.0 CPU + # kubespawner_override: + # mem_guarantee: 15.285G + # cpu_guarantee: 0.2 + # mem_32: + # display_name: ~32 GB, ~4.0 CPU + # kubespawner_override: + # mem_guarantee: 30.571G + # cpu_guarantee: 0.4 + # mem_64: + # display_name: ~64 GB, ~8.0 CPU + # kubespawner_override: + # mem_guarantee: 61.141G + # cpu_guarantee: 0.8 + # mem_128: + # display_name: ~128 GB, ~16.0 CPU + # kubespawner_override: + # mem_guarantee: 122.282G + # cpu_guarantee: 1.6 + # mem_256: + # display_name: ~256 GB, ~32.0 CPU + # kubespawner_override: + # mem_guarantee: 244.565G + # cpu_guarantee: 3.2 + # mem_512: + # display_name: ~512 GB, ~64.0 CPU + # kubespawner_override: + # mem_guarantee: 489.13G + # cpu_guarantee: 6.4 + # kubespawner_override: + # cpu_limit: null + # mem_limit: null + # node_selector: + # node.kubernetes.io/instance-type: r5.16xlarge scheduling: userScheduler: enabled: true From a3ac1effc7f3c60f6ab56012692f5099e1bf2861 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Sun, 19 Mar 2023 22:44:50 +0100 Subject: [PATCH 3/4] nasa-cryo: comment out maintenance announcement --- config/clusters/nasa-cryo/common.values.yaml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/config/clusters/nasa-cryo/common.values.yaml b/config/clusters/nasa-cryo/common.values.yaml index 55dcea5654..6afef3c914 100644 --- a/config/clusters/nasa-cryo/common.values.yaml +++ b/config/clusters/nasa-cryo/common.values.yaml @@ -63,15 +63,15 @@ basehub: # ref: https://github.com/2i2c-org/infrastructure/issues/1501 # ref: https://jupyterhub.readthedocs.io/en/stable/reference/templates.html#announcement-configuration-variables # - template_vars: - announcement: >- - - Service maintenance is scheduled Sunday March 19, to Monday 8AM - EST. - -
-          Running servers may be forcefully stopped and service disruption
-          is expected.
+      # template_vars:
+      #   announcement: >-
+      #
+      #     Service maintenance is scheduled Sunday March 19, to Monday 8AM
+      #     EST.
+      #
+      #     <br/>
+      #     Running servers may be forcefully stopped and service disruption
+      #     is expected.
       GitHubOAuthenticator:
         # We are restricting profiles based on GitHub Team membership and
         # so need to populate the teams in the auth state

From 1ea720f3183c87f0c799fa7e22c950e60b51bc2c Mon Sep 17 00:00:00 2001
From: Erik Sundell
Date: Mon, 20 Mar 2023 13:28:53 +0100
Subject: [PATCH 4/4] nasa-cryo: remove commented out maintenance announcement

---
 config/clusters/nasa-cryo/common.values.yaml | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/config/clusters/nasa-cryo/common.values.yaml b/config/clusters/nasa-cryo/common.values.yaml
index 6afef3c914..ccae60f6f2 100644
--- a/config/clusters/nasa-cryo/common.values.yaml
+++ b/config/clusters/nasa-cryo/common.values.yaml
@@ -56,22 +56,6 @@ basehub:
         - fperez
       JupyterHub:
         authenticator_class: github
-      # Announcement is a JupyterHub feature to present messages to users in
-      # web pages under the /hub path (JupyterHub responds), but not via the
-      # /user path (single-user server responds).
-      #
-      # ref: https://github.com/2i2c-org/infrastructure/issues/1501
-      # ref: https://jupyterhub.readthedocs.io/en/stable/reference/templates.html#announcement-configuration-variables
-      #
-      # template_vars:
-      #   announcement: >-
-      #
-      #     Service maintenance is scheduled Sunday March 19, to Monday 8AM
-      #     EST.
-      #
-      #     <br/>
-      #     Running servers may be forcefully stopped and service disruption
-      #     is expected.
       GitHubOAuthenticator:
         # We are restricting profiles based on GitHub Team membership and
         # so need to populate the teams in the auth state
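         # (a minimal sketch of that setting, assuming OAuthenticator's
         # standard trait name for it:)
         # populate_teams_in_auth_state: true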