From 520cf330f737d02942e65cf01de07ecb586e8fbc Mon Sep 17 00:00:00 2001 From: Georgiana Dolocan Date: Fri, 1 Nov 2024 12:04:09 +0200 Subject: [PATCH 1/3] Add tags and labels for cost allocation --- eksctl/victor.jsonnet | 94 +++++++++++++++++++++++----- terraform/aws/projects/victor.tfvars | 6 +- 2 files changed, 82 insertions(+), 18 deletions(-) diff --git a/eksctl/victor.jsonnet b/eksctl/victor.jsonnet index d3e645ce2b..ef47109b21 100644 --- a/eksctl/victor.jsonnet +++ b/eksctl/victor.jsonnet @@ -25,20 +25,71 @@ local nodeAz = "us-west-2a"; // A `node.kubernetes.io/instance-type label is added, so pods // can request a particular kind of node with a nodeSelector local notebookNodes = [ - { instanceType: "r5.xlarge" }, - { instanceType: "r5.4xlarge" }, - { instanceType: "r5.16xlarge" }, { - instanceType: "g4dn.xlarge", - tags+: { - "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1" - }, - taints+: { - "nvidia.com/gpu": "present:NoSchedule" - }, - // Allow provisioning GPUs across all AZs, to prevent situation where all - // GPUs in a single AZ are in use and no new nodes can be spawned - availabilityZones: masterAzs, + instanceType: "r5.xlarge", + namePrefix: "nb-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { "2i2c:hub-name": "staging" } + }, + { + instanceType: "r5.4xlarge", + namePrefix: "nb-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { "2i2c:hub-name": "staging" } + }, + { + instanceType: "r5.16xlarge", + namePrefix: "nb-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { "2i2c:hub-name": "staging" } + }, + { + instanceType: "r5.xlarge", + namePrefix: "nb-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { "2i2c:hub-name": "prod" } + }, + { + instanceType: "r5.4xlarge", + namePrefix: "nb-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { "2i2c:hub-name": "prod" } + }, + { + instanceType: "r5.16xlarge", + namePrefix: "nb-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { 
"2i2c:hub-name": "prod" } + }, + { + instanceType: "g4dn.xlarge", + namePrefix: "gpu-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { + "2i2c:hub-name": "staging", + "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1" + }, + taints+: { + "nvidia.com/gpu": "present:NoSchedule" + }, + // Allow provisioning GPUs across all AZs, to prevent a situation where all + // GPUs in a single AZ are in use and no new nodes can be spawned + availabilityZones: masterAzs, + }, + { + instanceType: "g4dn.xlarge", + namePrefix: "gpu-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { + "2i2c:hub-name": "prod", + "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1" + }, + taints+: { + "nvidia.com/gpu": "present:NoSchedule" + }, + // Allow provisioning GPUs across all AZs, to prevent a situation where all + // GPUs in a single AZ are in use and no new nodes can be spawned + availabilityZones: masterAzs, }, ]; @@ -53,7 +104,18 @@ local daskNodes = [ // A not yet fully established policy is being developed about using a single // node pool, see https://github.com/2i2c-org/infrastructure/issues/2687. 
// - { instancesDistribution+: { instanceTypes: ["r5.4xlarge"] }}, + { + namePrefix: "dask-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { "2i2c:hub-name": "staging" }, + instancesDistribution+: { instanceTypes: ["r5.4xlarge"] } + }, + { + namePrefix: "dask-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { "2i2c:hub-name": "prod" }, + instancesDistribution+: { instanceTypes: ["r5.4xlarge"] } + }, ]; @@ -63,7 +125,7 @@ local daskNodes = [ metadata+: { name: "victor", region: clusterRegion, - version: "1.29", + version: "1.30", }, availabilityZones: masterAzs, iam: { @@ -94,7 +156,7 @@ local daskNodes = [ [ ng + { namePrefix: 'core', - nameSuffix: 'b', + nameSuffix: 'a', nameIncludeInstanceType: false, availabilityZones: [nodeAz], ssh: { diff --git a/terraform/aws/projects/victor.tfvars b/terraform/aws/projects/victor.tfvars index 3282c67c6b..ab14da562b 100644 --- a/terraform/aws/projects/victor.tfvars +++ b/terraform/aws/projects/victor.tfvars @@ -4,10 +4,12 @@ cluster_nodes_location = "us-west-2a" user_buckets = { "scratch-staging" : { - "delete_after" : 7 + "delete_after" : 7, + "tags" : { "2i2c:hub-name" : "staging" } }, "scratch" : { - "delete_after" : 7 + "delete_after" : 7, + "tags" : { "2i2c:hub-name" : "prod" } }, } From 382edd654fcb5165b78e42d3e5a0f9a6730600cd Mon Sep 17 00:00:00 2001 From: Georgiana Dolocan Date: Fri, 1 Nov 2024 12:17:11 +0200 Subject: [PATCH 2/3] Set node selectors on the user servers --- config/clusters/victor/prod.values.yaml | 2 ++ config/clusters/victor/staging.values.yaml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/config/clusters/victor/prod.values.yaml b/config/clusters/victor/prod.values.yaml index 97ac23e474..576253c09f 100644 --- a/config/clusters/victor/prod.values.yaml +++ b/config/clusters/victor/prod.values.yaml @@ -14,6 +14,8 @@ basehub: CILogonOAuthenticator: oauth_callback_url: https://hub.victorproject.org/hub/oauth_callback singleuser: + nodeSelector: + 2i2c/hub-name: prod profileList: # 
IMPORTANT: Staging and prod's profileList's are meant to be kept # equivalent with the exception that staging adds diff --git a/config/clusters/victor/staging.values.yaml b/config/clusters/victor/staging.values.yaml index 39be389371..d2d0112dcb 100644 --- a/config/clusters/victor/staging.values.yaml +++ b/config/clusters/victor/staging.values.yaml @@ -14,6 +14,8 @@ basehub: CILogonOAuthenticator: oauth_callback_url: https://staging.hub.victorproject.org/hub/oauth_callback singleuser: + nodeSelector: + 2i2c/hub-name: staging profileList: # IMPORTANT: Staging and prod's profileList's are meant to be kept # equivalent with the exception that staging adds From 1b0f8f8511d930e9566bcd8626ee7918b306d5fa Mon Sep 17 00:00:00 2001 From: Georgiana Dolocan Date: Fri, 1 Nov 2024 13:00:04 +0200 Subject: [PATCH 3/3] Enable aws grafana backend --- config/clusters/victor/support.values.yaml | 8 ++++++++ terraform/aws/projects/victor.tfvars | 2 ++ 2 files changed, 10 insertions(+) diff --git a/config/clusters/victor/support.values.yaml b/config/clusters/victor/support.values.yaml index 246a0e7f0e..c301a634d3 100644 --- a/config/clusters/victor/support.values.yaml +++ b/config/clusters/victor/support.values.yaml @@ -33,6 +33,14 @@ prometheus: hosts: - prometheus.victor.2i2c.cloud +aws-ce-grafana-backend: + enabled: true + envBasedConfig: + clusterName: victor + serviceAccount: + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::129856558350:role/aws_ce_grafana_backend_iam_role + redirects: rules: - from: victor.2i2c.cloud diff --git a/terraform/aws/projects/victor.tfvars b/terraform/aws/projects/victor.tfvars index ab14da562b..9a3c18a101 100644 --- a/terraform/aws/projects/victor.tfvars +++ b/terraform/aws/projects/victor.tfvars @@ -2,6 +2,8 @@ region = "us-west-2" cluster_name = "victor" cluster_nodes_location = "us-west-2a" +enable_aws_ce_grafana_backend_iam = true + user_buckets = { "scratch-staging" : { "delete_after" : 7,