Skip to content

Commit

Permalink
Merge pull request #5022 from GeorgianaElena/victor-cost
Browse files Browse the repository at this point in the history
[victor] Upgrade k8s version, separate nodepools per hub, enable cost-allocation
  • Loading branch information
GeorgianaElena authored Nov 1, 2024
2 parents df02bbe + 1b0f8f8 commit 4d89929
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 18 deletions.
2 changes: 2 additions & 0 deletions config/clusters/victor/prod.values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ basehub:
CILogonOAuthenticator:
oauth_callback_url: https://hub.victorproject.org/hub/oauth_callback
singleuser:
nodeSelector:
2i2c/hub-name: prod
profileList:
# IMPORTANT: Staging's and prod's profileLists are meant to be kept
# equivalent with the exception that staging adds
Expand Down
2 changes: 2 additions & 0 deletions config/clusters/victor/staging.values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ basehub:
CILogonOAuthenticator:
oauth_callback_url: https://staging.hub.victorproject.org/hub/oauth_callback
singleuser:
nodeSelector:
2i2c/hub-name: staging
profileList:
# IMPORTANT: Staging's and prod's profileLists are meant to be kept
# equivalent with the exception that staging adds
Expand Down
8 changes: 8 additions & 0 deletions config/clusters/victor/support.values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,14 @@ prometheus:
hosts:
- prometheus.victor.2i2c.cloud

aws-ce-grafana-backend:
enabled: true
envBasedConfig:
clusterName: victor
serviceAccount:
annotations:
eks.amazonaws.com/role-arn: arn:aws:iam::129856558350:role/aws_ce_grafana_backend_iam_role

redirects:
rules:
- from: victor.2i2c.cloud
Expand Down
94 changes: 78 additions & 16 deletions eksctl/victor.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -25,20 +25,71 @@ local nodeAz = "us-west-2a";
// A `node.kubernetes.io/instance-type` label is added, so pods
// can request a particular kind of node with a nodeSelector
local notebookNodes = [
{ instanceType: "r5.xlarge" },
{ instanceType: "r5.4xlarge" },
{ instanceType: "r5.16xlarge" },
{
instanceType: "g4dn.xlarge",
tags+: {
"k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1"
},
taints+: {
"nvidia.com/gpu": "present:NoSchedule"
},
// Allow provisioning GPUs across all AZs, to prevent situation where all
// GPUs in a single AZ are in use and no new nodes can be spawned
availabilityZones: masterAzs,
instanceType: "r5.xlarge",
namePrefix: "nb-staging",
labels+: { "2i2c/hub-name": "staging" },
tags+: { "2i2c:hub-name": "staging" }
},
{
instanceType: "r5.4xlarge",
namePrefix: "nb-staging",
labels+: { "2i2c/hub-name": "staging" },
tags+: { "2i2c:hub-name": "staging" }
},
{
instanceType: "r5.16xlarge",
namePrefix: "nb-staging",
labels+: { "2i2c/hub-name": "staging" },
tags+: { "2i2c:hub-name": "staging" }
},
{
instanceType: "r5.xlarge",
namePrefix: "nb-prod",
labels+: { "2i2c/hub-name": "prod" },
tags+: { "2i2c:hub-name": "prod" }
},
{
instanceType: "r5.4xlarge",
namePrefix: "nb-prod",
labels+: { "2i2c/hub-name": "prod" },
tags+: { "2i2c:hub-name": "prod" }
},
{
instanceType: "r5.16xlarge",
namePrefix: "nb-prod",
labels+: { "2i2c/hub-name": "prod" },
tags+: { "2i2c:hub-name": "prod" }
},
{
instanceType: "g4dn.xlarge",
namePrefix: "gpu-staging",
labels+: { "2i2c/hub-name": "staging" },
tags+: {
"2i2c:hub-name": "staging",
"k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1"
},
taints+: {
"nvidia.com/gpu": "present:NoSchedule"
},
// Allow provisioning GPUs across all AZs, to prevent situation where all
// GPUs in a single AZ are in use and no new nodes can be spawned
availabilityZones: masterAzs,
},
{
instanceType: "g4dn.xlarge",
namePrefix: "gpu-prod",
labels+: { "2i2c/hub-name": "prod" },
tags+: {
"2i2c:hub-name": "prod",
"k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1"
},
taints+: {
"nvidia.com/gpu": "present:NoSchedule"
},
// Allow provisioning GPUs across all AZs, to prevent situation where all
// GPUs in a single AZ are in use and no new nodes can be spawned
availabilityZones: masterAzs,
},
];

Expand All @@ -53,7 +104,18 @@ local daskNodes = [
// A not yet fully established policy is being developed about using a single
// node pool, see https://github.com/2i2c-org/infrastructure/issues/2687.
//
{ instancesDistribution+: { instanceTypes: ["r5.4xlarge"] }},
{
namePrefix: "dask-staging",
labels+: { "2i2c/hub-name": "staging" },
tags+: { "2i2c:hub-name": "staging" },
instancesDistribution+: { instanceTypes: ["r5.4xlarge"] }
},
{
namePrefix: "dask-prod",
labels+: { "2i2c/hub-name": "prod" },
tags+: { "2i2c:hub-name": "prod" },
instancesDistribution+: { instanceTypes: ["r5.4xlarge"] }
},
];


Expand All @@ -63,7 +125,7 @@ local daskNodes = [
metadata+: {
name: "victor",
region: clusterRegion,
version: "1.29",
version: "1.30",
},
availabilityZones: masterAzs,
iam: {
Expand Down Expand Up @@ -94,7 +156,7 @@ local daskNodes = [
[
ng + {
namePrefix: 'core',
nameSuffix: 'b',
nameSuffix: 'a',
nameIncludeInstanceType: false,
availabilityZones: [nodeAz],
ssh: {
Expand Down
8 changes: 6 additions & 2 deletions terraform/aws/projects/victor.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,16 @@ region = "us-west-2"
cluster_name = "victor"
cluster_nodes_location = "us-west-2a"

enable_aws_ce_grafana_backend_iam = true

user_buckets = {
"scratch-staging" : {
"delete_after" : 7
"delete_after" : 7,
"tags" : { "2i2c:hub-name" : "staging" }
},
"scratch" : {
"delete_after" : 7
"delete_after" : 7,
"tags" : { "2i2c:hub-name" : "prod" }
},
}

Expand Down

0 comments on commit 4d89929

Please sign in to comment.