nasa-cryo: k8s 1.22 to 1.25, node sharing setup #2374

Merged: 4 commits, Mar 20, 2023
Changes from 3 commits
225 changes: 183 additions & 42 deletions config/clusters/nasa-cryo/common.values.yaml
@@ -63,15 +63,15 @@ basehub:
# ref: https://github.com/2i2c-org/infrastructure/issues/1501
# ref: https://jupyterhub.readthedocs.io/en/stable/reference/templates.html#announcement-configuration-variables
#
template_vars:
announcement: >-
<strong>
Service maintenance is scheduled Sunday March 19, to Monday 8AM
EST.
</strong>
<br/>
Running servers may be forcefully stopped and service disruption
is expected.
# template_vars:
# announcement: >-
# <strong>
# Service maintenance is scheduled Sunday March 19, to Monday 8AM
# EST.
# </strong>
# <br/>
# Running servers may be forcefully stopped and service disruption
# is expected.
GitHubOAuthenticator:
Contributor


Do we need to leave this commented in? I'd suggest adding an example to the docs, maybe under SRE Guide > Support Tasks or a how-to on setting maintenance announcements, rather than leaving commented-out code lying around.

Contributor Author


Removed in 1ea720f.

Makes sense to document this; I'm opening an issue about it.

Contributor Author


Opened #2379!

# We are restricting profiles based on GitHub Team membership and
# so need to populate the teams in the auth state
@@ -95,53 +95,194 @@ basehub:
subPath: _shared
readOnly: true
profileList:
# The mem-guarantees are here so k8s doesn't schedule other pods
# on these nodes.
- display_name: "Small: m5.large"
description: "~2 CPU, ~8G RAM"
# NOTE: About node sharing
#
# CPU/Memory requests/limits are still being actively considered. This
# profile list is set up to involve node sharing as considered in
# https://github.com/2i2c-org/infrastructure/issues/2121.
#
# - Memory requests differ from the description, based on: what's
#   found to remain allocatable in k8s, subtracting 1 GiB of overhead
#   for misc system pods, and transitioning from GB in the description
#   to GiB in mem_guarantee.
# - CPU requests are set to 10% of what the description suggests.
#
- display_name: "Small: up to 4 CPU / 32 GB RAM"
description: &profile_list_description "Start a container with at least a chosen share of capacity on a node of this type"
slug: small
default: true
allowed_teams:
- 2i2c-org:hub-access-for-2i2c-staff
- CryoInTheCloud:cryoclouduser
- CryoInTheCloud:cryocloudadvanced
profile_options:
requests:
# NOTE: Node share choices are in active development, see comment
# next to profileList: above.
display_name: Node share
choices:
mem_1:
default: true
display_name: ~1 GB, ~0.125 CPU
kubespawner_override:
mem_guarantee: 0.904G
cpu_guarantee: 0.013
mem_2:
display_name: ~2 GB, ~0.25 CPU
kubespawner_override:
mem_guarantee: 1.809G
cpu_guarantee: 0.025
mem_4:
display_name: ~4 GB, ~0.5 CPU
kubespawner_override:
mem_guarantee: 3.617G
cpu_guarantee: 0.05
mem_8:
display_name: ~8 GB, ~1.0 CPU
kubespawner_override:
mem_guarantee: 7.234G
cpu_guarantee: 0.1
mem_16:
display_name: ~16 GB, ~2.0 CPU
kubespawner_override:
mem_guarantee: 14.469G
cpu_guarantee: 0.2
mem_32:
display_name: ~32 GB, ~4.0 CPU
kubespawner_override:
mem_guarantee: 28.937G
cpu_guarantee: 0.4
kubespawner_override:
# Explicitly unset mem_limit, so it overrides the default memory limit we set in
# basehub/values.yaml
cpu_limit: null
mem_limit: null
mem_guarantee: 6.5G
node_selector:
node.kubernetes.io/instance-type: m5.large
- display_name: "Medium: m5.xlarge"
description: "~4 CPU, ~15G RAM"
allowed_teams:
- 2i2c-org:hub-access-for-2i2c-staff
- CryoInTheCloud:cryoclouduser
- CryoInTheCloud:cryocloudadvanced
kubespawner_override:
mem_limit: null
mem_guarantee: 12G
node_selector:
node.kubernetes.io/instance-type: m5.xlarge
- display_name: "Large: m5.2xlarge"
description: "~8 CPU, ~30G RAM"
allowed_teams:
- 2i2c-org:hub-access-for-2i2c-staff
- CryoInTheCloud:cryocloudadvanced
kubespawner_override:
mem_limit: null
mem_guarantee: 26G
node_selector:
node.kubernetes.io/instance-type: m5.2xlarge
- display_name: "Huge: m5.8xlarge"
description: "~32 CPU, ~128G RAM"
node.kubernetes.io/instance-type: r5.xlarge

- display_name: "Medium: up to 16 CPU / 128 GB RAM"
description: *profile_list_description
slug: medium
allowed_teams:
- 2i2c-org:hub-access-for-2i2c-staff
- CryoInTheCloud:cryocloudadvanced
profile_options:
requests:
# NOTE: Node share choices are in active development, see comment
# next to profileList: above.
display_name: Node share
choices:
mem_1:
display_name: ~1 GB, ~0.125 CPU
kubespawner_override:
mem_guarantee: 0.942G
cpu_guarantee: 0.013
mem_2:
display_name: ~2 GB, ~0.25 CPU
kubespawner_override:
mem_guarantee: 1.883G
cpu_guarantee: 0.025
mem_4:
default: true
display_name: ~4 GB, ~0.5 CPU
kubespawner_override:
mem_guarantee: 3.766G
cpu_guarantee: 0.05
mem_8:
display_name: ~8 GB, ~1.0 CPU
kubespawner_override:
mem_guarantee: 7.532G
cpu_guarantee: 0.1
mem_16:
display_name: ~16 GB, ~2.0 CPU
kubespawner_override:
mem_guarantee: 15.064G
cpu_guarantee: 0.2
mem_32:
display_name: ~32 GB, ~4.0 CPU
kubespawner_override:
mem_guarantee: 30.128G
cpu_guarantee: 0.4
mem_64:
display_name: ~64 GB, ~8.0 CPU
kubespawner_override:
mem_guarantee: 60.257G
cpu_guarantee: 0.8
mem_128:
display_name: ~128 GB, ~16.0 CPU
kubespawner_override:
mem_guarantee: 120.513G
cpu_guarantee: 1.6
kubespawner_override:
cpu_limit: null
mem_limit: null
mem_guarantee: 115G
node_selector:
node.kubernetes.io/instance-type: m5.8xlarge
node.kubernetes.io/instance-type: r5.4xlarge

# NOTE: The large option is added as a comment for now. It may become
# relevant in the future for advanced users having a workshop, and
# then it's possible to enable it more easily.
#
# This setup was discussed with Tasha Snow in March 2023 at
# https://2i2c.freshdesk.com/a/tickets/543.
#
# - display_name: "Large: up to 64 CPU / 512 GB RAM"
# description: *profile_list_description
# slug: large
# allowed_teams:
# - 2i2c-org:hub-access-for-2i2c-staff
# - CryoInTheCloud:cryocloudadvanced
# profile_options:
# requests:
# # NOTE: Node share choices are in active development, see comment
# # next to profileList: above.
# display_name: Node share
# choices:
# mem_4:
# display_name: ~4 GB, ~0.5 CPU
# kubespawner_override:
# mem_guarantee: 3.821G
# cpu_guarantee: 0.05
# mem_8:
# display_name: ~8 GB, ~1.0 CPU
# kubespawner_override:
# mem_guarantee: 7.643G
# cpu_guarantee: 0.1
# mem_16:
# default: true
# display_name: ~16 GB, ~2.0 CPU
# kubespawner_override:
# mem_guarantee: 15.285G
# cpu_guarantee: 0.2
# mem_32:
# display_name: ~32 GB, ~4.0 CPU
# kubespawner_override:
# mem_guarantee: 30.571G
# cpu_guarantee: 0.4
# mem_64:
# display_name: ~64 GB, ~8.0 CPU
# kubespawner_override:
# mem_guarantee: 61.141G
# cpu_guarantee: 0.8
# mem_128:
# display_name: ~128 GB, ~16.0 CPU
# kubespawner_override:
# mem_guarantee: 122.282G
# cpu_guarantee: 1.6
# mem_256:
# display_name: ~256 GB, ~32.0 CPU
# kubespawner_override:
# mem_guarantee: 244.565G
# cpu_guarantee: 3.2
# mem_512:
# display_name: ~512 GB, ~64.0 CPU
# kubespawner_override:
# mem_guarantee: 489.13G
# cpu_guarantee: 6.4
# kubespawner_override:
# cpu_limit: null
# mem_limit: null
# node_selector:
# node.kubernetes.io/instance-type: r5.16xlarge
scheduling:
userScheduler:
enabled: true
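
The NOTE in the profileList above describes how the node-share guarantees were chosen, but not the arithmetic behind the numbers. Below is a minimal sketch of how they could be derived, assuming the 28.937 GB (small, r5.xlarge) and 120.513 GB (medium, r5.4xlarge) figures are roughly what remains allocatable once ~1 GiB is set aside for system pods, with each smaller share halving both guarantees:

```jsonnet
// Hypothetical sketch, not part of this PR: deriving the "Node share" guarantees
// for the small (r5.xlarge) profile. The allocatable memory figure is an assumption
// read back from the values above; the full-node CPU guarantee is 10% of 4 vCPUs.
local allocatableMemGB = 28.937;
local fullNodeCpuGuarantee = 0.4;

local share(fraction) = {
  mem_guarantee: '%.3fG' % (allocatableMemGB * fraction),
  cpu_guarantee: fullNodeCpuGuarantee * fraction,
};

{
  mem_32: share(1),
  mem_16: share(1 / 2),
  mem_8: share(1 / 4),
  mem_4: share(1 / 8),
  mem_2: share(1 / 16),
  mem_1: share(1 / 32),
}
```

Evaluating this with the jsonnet CLI reproduces the small profile's table above up to rounding; swapping in 120.513 GB and 16 vCPUs (a 1.6 full-node CPU guarantee) gives the medium profile's values.
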
20 changes: 7 additions & 13 deletions eksctl/nasa-cryo.jsonnet
@@ -25,10 +25,9 @@ local nodeAz = "us-west-2a";
// A `node.kubernetes.io/instance-type` label is added, so pods
// can request a particular kind of node with a nodeSelector
local notebookNodes = [
{ instanceType: "m5.large" },
{ instanceType: "m5.xlarge" },
{ instanceType: "m5.2xlarge" },
{ instanceType: "m5.8xlarge" },
{ instanceType: "r5.xlarge" },
{ instanceType: "r5.4xlarge" },
{ instanceType: "r5.16xlarge" },
];

local daskNodes = [
@@ -38,10 +37,7 @@ local daskNodes = [
// *first* item in instanceDistribution.instanceTypes, to match
// what we do with notebook nodes. Pods can request a particular
// kind of node with a nodeSelector
{ instancesDistribution+: { instanceTypes: ["m5.large"] }},
{ instancesDistribution+: { instanceTypes: ["m5.xlarge"] }},
{ instancesDistribution+: { instanceTypes: ["m5.2xlarge"] }},
{ instancesDistribution+: { instanceTypes: ["m5.8xlarge"] }},
{ instancesDistribution+: { instanceTypes: ["r5.4xlarge"] }},
];


@@ -51,7 +47,7 @@
metadata+: {
name: "nasa-cryo",
region: clusterRegion,
version: '1.22'
version: '1.25'
},
availabilityZones: masterAzs,
iam: {
@@ -83,7 +79,7 @@
ssh: {
publicKeyPath: 'ssh-keys/nasa-cryo.key.pub'
},
instanceType: "m5.xlarge",
instanceType: "r5.xlarge",
minSize: 1,
maxSize: 6,
labels+: {
@@ -138,6 +134,4 @@
},
} + n for n in daskNodes
]


}
}