Skip to content

Commit

Permalink
Merge pull request #5022 from GeorgianaElena/victor-cost
Browse files Browse the repository at this point in the history
[victor] Upgrade k8s version, separate nodepools per hub, enable cost-allocation
  • Loading branch information
GeorgianaElena authored Nov 1, 2024
2 parents df02bbe + 1b0f8f8 commit 4d89929
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 18 deletions.
2 changes: 2 additions & 0 deletions config/clusters/victor/prod.values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ basehub:
CILogonOAuthenticator:
oauth_callback_url: https://hub.victorproject.org/hub/oauth_callback
singleuser:
nodeSelector:
2i2c/hub-name: prod
profileList:
# IMPORTANT: Staging's and prod's profileLists are meant to be kept
# equivalent with the exception that staging adds
Expand Down
2 changes: 2 additions & 0 deletions config/clusters/victor/staging.values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ basehub:
CILogonOAuthenticator:
oauth_callback_url: https://staging.hub.victorproject.org/hub/oauth_callback
singleuser:
nodeSelector:
2i2c/hub-name: staging
profileList:
# IMPORTANT: Staging's and prod's profileLists are meant to be kept
# equivalent with the exception that staging adds
Expand Down
8 changes: 8 additions & 0 deletions config/clusters/victor/support.values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,14 @@ prometheus:
hosts:
- prometheus.victor.2i2c.cloud

aws-ce-grafana-backend:
enabled: true
envBasedConfig:
clusterName: victor
serviceAccount:
annotations:
eks.amazonaws.com/role-arn: arn:aws:iam::129856558350:role/aws_ce_grafana_backend_iam_role

redirects:
rules:
- from: victor.2i2c.cloud
Expand Down
94 changes: 78 additions & 16 deletions eksctl/victor.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -25,20 +25,71 @@ local nodeAz = "us-west-2a";
// A `node.kubernetes.io/instance-type` label is added, so pods
// can request a particular kind of node with a nodeSelector
local notebookNodes = [
{ instanceType: "r5.xlarge" },
{ instanceType: "r5.4xlarge" },
{ instanceType: "r5.16xlarge" },
{
instanceType: "g4dn.xlarge",
tags+: {
"k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1"
},
taints+: {
"nvidia.com/gpu": "present:NoSchedule"
},
// Allow provisioning GPUs across all AZs, to prevent situation where all
// GPUs in a single AZ are in use and no new nodes can be spawned
availabilityZones: masterAzs,
instanceType: "r5.xlarge",
namePrefix: "nb-staging",
labels+: { "2i2c/hub-name": "staging" },
tags+: { "2i2c:hub-name": "staging" }
},
{
instanceType: "r5.4xlarge",
namePrefix: "nb-staging",
labels+: { "2i2c/hub-name": "staging" },
tags+: { "2i2c:hub-name": "staging" }
},
{
instanceType: "r5.16xlarge",
namePrefix: "nb-staging",
labels+: { "2i2c/hub-name": "staging" },
tags+: { "2i2c:hub-name": "staging" }
},
{
instanceType: "r5.xlarge",
namePrefix: "nb-prod",
labels+: { "2i2c/hub-name": "prod" },
tags+: { "2i2c:hub-name": "prod" }
},
{
instanceType: "r5.4xlarge",
namePrefix: "nb-prod",
labels+: { "2i2c/hub-name": "prod" },
tags+: { "2i2c:hub-name": "prod" }
},
{
instanceType: "r5.16xlarge",
namePrefix: "nb-prod",
labels+: { "2i2c/hub-name": "prod" },
tags+: { "2i2c:hub-name": "prod" }
},
{
instanceType: "g4dn.xlarge",
namePrefix: "gpu-staging",
labels+: { "2i2c/hub-name": "staging" },
tags+: {
"2i2c:hub-name": "staging",
"k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1"
},
taints+: {
"nvidia.com/gpu": "present:NoSchedule"
},
// Allow provisioning GPUs across all AZs, to prevent situation where all
// GPUs in a single AZ are in use and no new nodes can be spawned
availabilityZones: masterAzs,
},
{
instanceType: "g4dn.xlarge",
namePrefix: "gpu-prod",
labels+: { "2i2c/hub-name": "prod" },
tags+: {
"2i2c:hub-name": "prod",
"k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1"
},
taints+: {
"nvidia.com/gpu": "present:NoSchedule"
},
// Allow provisioning GPUs across all AZs, to prevent situation where all
// GPUs in a single AZ are in use and no new nodes can be spawned
availabilityZones: masterAzs,
},
];

Expand All @@ -53,7 +104,18 @@ local daskNodes = [
// A not yet fully established policy is being developed about using a single
// node pool, see https://github.com/2i2c-org/infrastructure/issues/2687.
//
{ instancesDistribution+: { instanceTypes: ["r5.4xlarge"] }},
{
namePrefix: "dask-staging",
labels+: { "2i2c/hub-name": "staging" },
tags+: { "2i2c:hub-name": "staging" },
instancesDistribution+: { instanceTypes: ["r5.4xlarge"] }
},
{
namePrefix: "dask-prod",
labels+: { "2i2c/hub-name": "prod" },
tags+: { "2i2c:hub-name": "prod" },
instancesDistribution+: { instanceTypes: ["r5.4xlarge"] }
},
];


Expand All @@ -63,7 +125,7 @@ local daskNodes = [
metadata+: {
name: "victor",
region: clusterRegion,
version: "1.29",
version: "1.30",
},
availabilityZones: masterAzs,
iam: {
Expand Down Expand Up @@ -94,7 +156,7 @@ local daskNodes = [
[
ng + {
namePrefix: 'core',
nameSuffix: 'b',
nameSuffix: 'a',
nameIncludeInstanceType: false,
availabilityZones: [nodeAz],
ssh: {
Expand Down
8 changes: 6 additions & 2 deletions terraform/aws/projects/victor.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,16 @@ region = "us-west-2"
cluster_name = "victor"
cluster_nodes_location = "us-west-2a"

enable_aws_ce_grafana_backend_iam = true

user_buckets = {
"scratch-staging" : {
"delete_after" : 7
"delete_after" : 7,
"tags" : { "2i2c:hub-name" : "staging" }
},
"scratch" : {
"delete_after" : 7
"delete_after" : 7,
"tags" : { "2i2c:hub-name" : "prod" }
},
}

Expand Down

0 comments on commit 4d89929

Please sign in to comment.