diff --git a/config/clusters/victor/prod.values.yaml b/config/clusters/victor/prod.values.yaml index 97ac23e47..576253c09 100644 --- a/config/clusters/victor/prod.values.yaml +++ b/config/clusters/victor/prod.values.yaml @@ -14,6 +14,8 @@ basehub: CILogonOAuthenticator: oauth_callback_url: https://hub.victorproject.org/hub/oauth_callback singleuser: + nodeSelector: + 2i2c/hub-name: prod profileList: # IMPORTANT: Staging and prod's profileList's are meant to be kept # equivalent with the exception that staging adds diff --git a/config/clusters/victor/staging.values.yaml b/config/clusters/victor/staging.values.yaml index 39be38937..d2d0112dc 100644 --- a/config/clusters/victor/staging.values.yaml +++ b/config/clusters/victor/staging.values.yaml @@ -14,6 +14,8 @@ basehub: CILogonOAuthenticator: oauth_callback_url: https://staging.hub.victorproject.org/hub/oauth_callback singleuser: + nodeSelector: + 2i2c/hub-name: staging profileList: # IMPORTANT: Staging and prod's profileList's are meant to be kept # equivalent with the exception that staging adds diff --git a/config/clusters/victor/support.values.yaml b/config/clusters/victor/support.values.yaml index 246a0e7f0..c301a634d 100644 --- a/config/clusters/victor/support.values.yaml +++ b/config/clusters/victor/support.values.yaml @@ -33,6 +33,14 @@ prometheus: hosts: - prometheus.victor.2i2c.cloud +aws-ce-grafana-backend: + enabled: true + envBasedConfig: + clusterName: victor + serviceAccount: + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::129856558350:role/aws_ce_grafana_backend_iam_role + redirects: rules: - from: victor.2i2c.cloud diff --git a/eksctl/victor.jsonnet b/eksctl/victor.jsonnet index d3e645ce2..ef47109b2 100644 --- a/eksctl/victor.jsonnet +++ b/eksctl/victor.jsonnet @@ -25,20 +25,71 @@ local nodeAz = "us-west-2a"; // A `node.kubernetes.io/instance-type label is added, so pods // can request a particular kind of node with a nodeSelector local notebookNodes = [ - { instanceType: "r5.xlarge" }, - { instanceType: "r5.4xlarge" }, - { instanceType: "r5.16xlarge" }, { - instanceType: "g4dn.xlarge", - tags+: { - "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1" - }, - taints+: { - "nvidia.com/gpu": "present:NoSchedule" - }, - // Allow provisioning GPUs across all AZs, to prevent situation where all - // GPUs in a single AZ are in use and no new nodes can be spawned - availabilityZones: masterAzs, + instanceType: "r5.xlarge", + namePrefix: "nb-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { "2i2c:hub-name": "staging" } + }, + { + instanceType: "r5.4xlarge", + namePrefix: "nb-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { "2i2c:hub-name": "staging" } + }, + { + instanceType: "r5.16xlarge", + namePrefix: "nb-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { "2i2c:hub-name": "staging" } + }, + { + instanceType: "r5.xlarge", + namePrefix: "nb-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { "2i2c:hub-name": "prod" } + }, + { + instanceType: "r5.4xlarge", + namePrefix: "nb-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { "2i2c:hub-name": "prod" } + }, + { + instanceType: "r5.16xlarge", + namePrefix: "nb-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { "2i2c:hub-name": "prod" } + }, + { + instanceType: "g4dn.xlarge", + namePrefix: "gpu-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { + "2i2c:hub-name": "staging", + "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1" + }, + taints+: { + "nvidia.com/gpu": "present:NoSchedule" + }, + // Allow provisioning GPUs across all AZs, to prevent situation where all + // GPUs in a single AZ are in use and no new nodes can be spawned + availabilityZones: masterAzs, + }, + { + instanceType: "g4dn.xlarge", + namePrefix: "gpu-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { + "2i2c:hub-name": "prod", + "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1" + }, + taints+: { + "nvidia.com/gpu": "present:NoSchedule" + }, + // Allow provisioning GPUs across all AZs, to prevent situation where all + // GPUs in a single AZ are in use and no new nodes can be spawned + availabilityZones: masterAzs, }, ]; @@ -53,7 +104,18 @@ local daskNodes = [ // A not yet fully established policy is being developed about using a single // node pool, see https://github.com/2i2c-org/infrastructure/issues/2687. // - { instancesDistribution+: { instanceTypes: ["r5.4xlarge"] }}, + { + namePrefix: "dask-staging", + labels+: { "2i2c/hub-name": "staging" }, + tags+: { "2i2c:hub-name": "staging" }, + instancesDistribution+: { instanceTypes: ["r5.4xlarge"] } + }, + { + namePrefix: "dask-prod", + labels+: { "2i2c/hub-name": "prod" }, + tags+: { "2i2c:hub-name": "prod" }, + instancesDistribution+: { instanceTypes: ["r5.4xlarge"] } + }, ]; @@ -63,7 +125,7 @@ local daskNodes = [ metadata+: { name: "victor", region: clusterRegion, - version: "1.29", + version: "1.30", }, availabilityZones: masterAzs, iam: { @@ -94,7 +156,7 @@ local daskNodes = [ [ ng + { namePrefix: 'core', - nameSuffix: 'b', + nameSuffix: 'a', nameIncludeInstanceType: false, availabilityZones: [nodeAz], ssh: { diff --git a/terraform/aws/projects/victor.tfvars b/terraform/aws/projects/victor.tfvars index 3282c67c6..9a3c18a10 100644 --- a/terraform/aws/projects/victor.tfvars +++ b/terraform/aws/projects/victor.tfvars @@ -2,12 +2,16 @@ region = "us-west-2" cluster_name = "victor" cluster_nodes_location = "us-west-2a" +enable_aws_ce_grafana_backend_iam = true + user_buckets = { "scratch-staging" : { - "delete_after" : 7 + "delete_after" : 7, + "tags" : { "2i2c:hub-name" : "staging" } }, "scratch" : { - "delete_after" : 7 + "delete_after" : 7, + "tags" : { "2i2c:hub-name" : "prod" } }, }