From 506f4653d67fae3d86dc604282e69c0eb47a3d02 Mon Sep 17 00:00:00 2001 From: Georgiana Dolocan Date: Fri, 22 Nov 2024 15:58:38 +0200 Subject: [PATCH 1/6] Iterate over hubs for notebook nodes --- eksctl/Udder-Cranberry1-Various | 10 +++ eksctl/template.jsonnet | 25 +++++- eksctl/ubc-eoas copy.jsonnet | 136 ++++++++++++++++++++++++++++++++ 3 files changed, 168 insertions(+), 3 deletions(-) create mode 100644 eksctl/Udder-Cranberry1-Various create mode 100644 eksctl/ubc-eoas copy.jsonnet diff --git a/eksctl/Udder-Cranberry1-Various b/eksctl/Udder-Cranberry1-Various new file mode 100644 index 000000000..1f5a57054 --- /dev/null +++ b/eksctl/Udder-Cranberry1-Various @@ -0,0 +1,10 @@ +Udder-Cranberry1-Various + + + +ghg: Gloater-Stash3-Vice + +A6yK$EJ7v#^vaLPhX3&hJ3&n9tKM^2ga + + +Semicolon6-Area \ No newline at end of file diff --git a/eksctl/template.jsonnet b/eksctl/template.jsonnet index a5ce2bdd2..6ae95a91c 100644 --- a/eksctl/template.jsonnet +++ b/eksctl/template.jsonnet @@ -36,10 +36,29 @@ local nodeAz = "<< cluster_region >>a"; // A `node.kubernetes.io/instance-type label is added, so pods // can request a particular kind of node with a nodeSelector local notebookNodes = [ - { instanceType: "r5.xlarge" }, - { instanceType: "r5.4xlarge" }, - { instanceType: "r5.16xlarge" }, +<% for hub in hubs %> + // << hub >> + { + instanceType: "r5.xlarge", + namePrefix: "nb-<< hub >>", + labels+: { "2i2c/hub-name": "<< hub >>" }, + tags+: { "2i2c:hub-name": "<< hub >>" }, + }, + { + instanceType: "r5.4xlarge", + namePrefix: "nb-<< hub >>", + labels+: { "2i2c/hub-name": "<< hub >>" }, + tags+: { "2i2c:hub-name": "<< hub >>" }, + }, + { + instanceType: "r5.16xlarge", + namePrefix: "nb-<< hub >>", + labels+: { "2i2c/hub-name": "<< hub >>" }, + tags+: { "2i2c:hub-name": "<< hub >>" }, + }, +<% endfor %> ]; + <% if hub_type == "daskhub" %> local daskNodes = [ // Node definitions for dask worker nodes. 
Config here is merged diff --git a/eksctl/ubc-eoas copy.jsonnet b/eksctl/ubc-eoas copy.jsonnet new file mode 100644 index 000000000..975f4d256 --- /dev/null +++ b/eksctl/ubc-eoas copy.jsonnet @@ -0,0 +1,136 @@ +/* + This file is a jsonnet template of a eksctl's cluster configuration file, + that is used with the eksctl CLI to both update and initialize an AWS EKS + based cluster. + + This file has in turn been generated from eksctl/template.jsonnet which is + relevant to compare with for changes over time. + + To use jsonnet to generate an eksctl configuration file from this, do: + + jsonnet ubc-eoas.jsonnet > ubc-eoas.eksctl.yaml + + References: + - https://eksctl.io/usage/schema/ +*/ +local ng = import "./libsonnet/nodegroup.jsonnet"; + +// place all cluster nodes here +local clusterRegion = "ca-central-1"; +local masterAzs = ["ca-central-1a", "ca-central-1b", "ca-central-1d"]; +local nodeAz = "ca-central-1a"; + +// Node definitions for notebook nodes. Config here is merged +// with our notebook node definition. +// A `node.kubernetes.io/instance-type label is added, so pods +// can request a particular kind of node with a nodeSelector +local notebookNodes = [ + { instanceType: "r5.xlarge" }, + { instanceType: "r5.2xlarge" }, + { instanceType: "r5.4xlarge" }, + { instanceType: "r5.16xlarge" }, +]; + +local daskNodes = []; + + +{ + apiVersion: 'eksctl.io/v1alpha5', + kind: 'ClusterConfig', + metadata+: { + name: "ubc-eoas", + region: clusterRegion, + version: "1.30", + }, + availabilityZones: masterAzs, + iam: { + withOIDC: true, + }, + // If you add an addon to this config, run the create addon command. + // + // eksctl create addon --config-file=ubc-eoas.eksctl.yaml + // + addons: [ + { + // aws-ebs-csi-driver ensures that our PVCs are bound to PVs that + // couple to AWS EBS based storage, without it expect to see pods + // mounting a PVC failing to schedule and PVC resources that are + // unbound. 
+ // + // Related docs: https://docs.aws.amazon.com/eks/latest/userguide/managing-ebs-csi.html + // + name: 'aws-ebs-csi-driver', + version: "latest", + wellKnownPolicies: { + ebsCSIController: true, + }, + }, + ], + nodeGroups: [ + n + {clusterName: $.metadata.name} for n in + [ + ng + { + namePrefix: 'core', + nameSuffix: 'a', + nameIncludeInstanceType: false, + availabilityZones: [nodeAz], + ssh: { + publicKeyPath: 'ssh-keys/ubc-eoas.key.pub' + }, + instanceType: "m5.xlarge", + minSize: 1, + maxSize: 6, + labels+: { + "hub.jupyter.org/node-purpose": "core", + "k8s.dask.org/node-purpose": "core" + }, + tags+: { + "2i2c:node-purpose": "core" + }, + }, + ] + [ + ng + { + namePrefix: "nb", + availabilityZones: [nodeAz], + minSize: 0, + maxSize: 500, + instanceType: n.instanceType, + ssh: { + publicKeyPath: 'ssh-keys/ubc-eoas.key.pub' + }, + labels+: { + "hub.jupyter.org/node-purpose": "user", + "k8s.dask.org/node-purpose": "scheduler" + }, + taints+: { + "hub.jupyter.org_dedicated": "user:NoSchedule", + "hub.jupyter.org/dedicated": "user:NoSchedule" + }, + } + n for n in notebookNodes + ] + ( if daskNodes != null then + [ + ng + { + namePrefix: "dask", + availabilityZones: [nodeAz], + minSize: 0, + maxSize: 500, + ssh: { + publicKeyPath: 'ssh-keys/ubc-eoas.key.pub' + }, + labels+: { + "k8s.dask.org/node-purpose": "worker" + }, + taints+: { + "k8s.dask.org_dedicated" : "worker:NoSchedule", + "k8s.dask.org/dedicated" : "worker:NoSchedule" + }, + instancesDistribution+: { + onDemandBaseCapacity: 0, + onDemandPercentageAboveBaseCapacity: 0, + spotAllocationStrategy: "capacity-optimized", + }, + } + n for n in daskNodes + ] else [] + ) + ] +} From 2db8d7d07e02068d31b766d881bbfd176d117769 Mon Sep 17 00:00:00 2001 From: Georgiana Dolocan Date: Fri, 22 Nov 2024 16:03:01 +0200 Subject: [PATCH 2/6] Tag each nodepool with a purpose tag --- eksctl/template.jsonnet | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/eksctl/template.jsonnet b/eksctl/template.jsonnet index 
6ae95a91c..04909568c 100644 --- a/eksctl/template.jsonnet +++ b/eksctl/template.jsonnet @@ -164,6 +164,9 @@ local daskNodes = []; "hub.jupyter.org/node-purpose": "core", "k8s.dask.org/node-purpose": "core", }, + tags+: { + "2i2c:node-purpose": "core" + }, }, ] + [ ng + { @@ -183,6 +186,9 @@ local daskNodes = []; "hub.jupyter.org_dedicated": "user:NoSchedule", "hub.jupyter.org/dedicated": "user:NoSchedule", }, + tags+: { + "2i2c:node-purpose": "user" + }, } + n for n in notebookNodes ] + ( if daskNodes != null then [ @@ -201,6 +207,9 @@ local daskNodes = []; "k8s.dask.org_dedicated" : "worker:NoSchedule", "k8s.dask.org/dedicated" : "worker:NoSchedule", }, + tags+: { + "2i2c:node-purpose": "worker" + }, instancesDistribution+: { onDemandBaseCapacity: 0, onDemandPercentageAboveBaseCapacity: 0, From 5f494e7876b90251134cc5742d9293f8a471d8b5 Mon Sep 17 00:00:00 2001 From: Georgiana Dolocan Date: Fri, 22 Nov 2024 16:27:39 +0200 Subject: [PATCH 3/6] Iterate over hubs for dask nodes --- eksctl/template.jsonnet | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/eksctl/template.jsonnet b/eksctl/template.jsonnet index 04909568c..aad355def 100644 --- a/eksctl/template.jsonnet +++ b/eksctl/template.jsonnet @@ -59,7 +59,7 @@ local notebookNodes = [ <% endfor %> ]; -<% if hub_type == "daskhub" %> +<% if dask_nodes %> local daskNodes = [ // Node definitions for dask worker nodes. Config here is merged // with our dask worker node definition, which uses spot instances. @@ -71,7 +71,14 @@ local daskNodes = [ // A not yet fully established policy is being developed about using a single // node pool, see https://github.com/2i2c-org/infrastructure/issues/2687. 
// - { instancesDistribution+: { instanceTypes: ["r5.4xlarge"] }}, +<% for hub in hubs %> + { + namePrefix: "dask-<< hub >>", + labels+: { "2i2c/hub-name": "<< hub >>" }, + tags+: { "2i2c:hub-name": "<< hub >>" }, + instancesDistribution+: { instanceTypes: ["r5.4xlarge"] } + }, +<% endfor %> ]; <% else %> local daskNodes = []; From 6953b1c8dfdcce66cd2a764b528d5203257c77cc Mon Sep 17 00:00:00 2001 From: Georgiana Dolocan Date: Fri, 22 Nov 2024 16:41:47 +0200 Subject: [PATCH 4/6] Allow passing in a list of hubs and reenable dask check --- deployer/commands/generate/dedicated_cluster/aws.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/deployer/commands/generate/dedicated_cluster/aws.py b/deployer/commands/generate/dedicated_cluster/aws.py index 6aa86bb12..a033e6ef1 100644 --- a/deployer/commands/generate/dedicated_cluster/aws.py +++ b/deployer/commands/generate/dedicated_cluster/aws.py @@ -110,6 +110,14 @@ def aws( ..., prompt="The AWS account id or alias. Declare 2i2c for 2i2c's SSO based accounts and paid_by_us=true", ), + hubs: str = typer.Option( + "staging", + prompt="The list of hubs that will be deployed in the cluster separated by a comma. 
Example: staging, prod.", + ), + dask_nodes: bool = typer.Option( + False, + prompt='If this cluster needs dask nodes, please type "y", otherwise hit ENTER.', + ), force: bool = typer.Option( False, "--force", @@ -134,9 +142,12 @@ def aws( # Also store the provider, as it's useful for some jinja templates # to differentiate between them when rendering the configuration "provider": "aws", - "hub_type": "basehub", + "dask_nodes": dask_nodes, "cluster_name": cluster_name, "cluster_region": cluster_region, + "hubs": hubs.replace( + ",", " " + ).split(), # Convert the comma separated string to a list "sign_in_url": sign_in_url, "paid_by_us": str(paid_by_us).lower(), } From b92be1ca284008c9683b29fc480e8797e7461b07 Mon Sep 17 00:00:00 2001 From: Georgiana Dolocan Date: Fri, 22 Nov 2024 16:51:52 +0200 Subject: [PATCH 5/6] Exclude template for check --- .pre-commit-config.yaml | 1 + config/clusters/templates/aws/cluster.yaml | 14 ++++++----- terraform/aws/projects/template.tfvars | 29 +++++++++++----------- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index cc64c2547..93374ca97 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -50,6 +50,7 @@ repos: rev: v3.0.0 hooks: - id: terraform-fmt + exclude: terraform/aws/projects/template.tfvars # Prevent unencrypted files from being committed - repo: https://github.com/yuvipanda/pre-commit-hook-ensure-sops diff --git a/config/clusters/templates/aws/cluster.yaml b/config/clusters/templates/aws/cluster.yaml index c50bcec9a..e17d6b513 100644 --- a/config/clusters/templates/aws/cluster.yaml +++ b/config/clusters/templates/aws/cluster.yaml @@ -15,12 +15,14 @@ hubs: [] # Uncomment the lines below once the support infrastructure was deployed and # you are ready to add the first cluster - # - name: +{% for hub in hubs %} + # - name: {{ hub }} # # Tip: consider changing this to something more human friendly - # display_name: "{{ cluster_name }} - " - # 
domain: .{{ cluster_name }}.2i2c.cloud - # helm_chart: {{ hub_type }} + # display_name: "{{ cluster_name }} - {{ hub }}" + # domain: {{ hub }}.{{ cluster_name }}.2i2c.cloud + # helm_chart: basehub # helm_chart_values_files: # - common.values.yaml - # - .values.yaml - # - enc-.secret.values.yaml + # - {{ hub }}.values.yaml + # - enc-{{ hub }}.secret.values.yaml +{% endfor %} \ No newline at end of file diff --git a/terraform/aws/projects/template.tfvars b/terraform/aws/projects/template.tfvars index 028548f4a..b365c249d 100644 --- a/terraform/aws/projects/template.tfvars +++ b/terraform/aws/projects/template.tfvars @@ -9,26 +9,25 @@ cluster_nodes_location = "{{ cluster_region }}a" enable_aws_ce_grafana_backend_iam = true -# Tip: uncomment and fill the missing info in the lines below if you want +# Tip: uncomment and verify any missing info in the lines below if you want # to setup scratch buckets for the hubs on this cluster. # -#user_buckets = { -# "scratch-staging" : { -# "delete_after" : 7, -# "tags" : { "2i2c:hub-name" : "staging" }, -# }, -# # Tip: add more scratch buckets below, if this cluster will be multi-tenant -#} -# Tip: uncomment and fill the missing info in the lines below if you want +{% for hub in hubs %} +# "scratch-{{ hub }}" : { +# "delete_after" : 7, +# "tags" : { "2i2c:hub-name" : "{{ hub }}" }, +# }, +{% endfor %} + +# Tip: uncomment and verify any missing info in the lines below if you want # to setup specific cloud permissions for the buckets in this cluster. 
# -#hub_cloud_permissions = { -# "staging" : { +# hub_cloud_permissions = { +{% for hub in hubs %} +# "{{ hub }}" : { # "user-sa" : { -# bucket_admin_access : ["scratch-staging"], +# bucket_admin_access : ["scratch-{{ hub }}"], # }, # }, -# # Tip: add more namespaces below, if this cluster will be multi-tenant -#} - +{% endfor %} From fa086046c43c2f71e4c959e45bcb3105b22ae8d5 Mon Sep 17 00:00:00 2001 From: Georgiana Dolocan Date: Fri, 22 Nov 2024 16:53:02 +0200 Subject: [PATCH 6/6] Replace hub_type for dask_nodes now that there is no daskhub helm chart --- .../templates/common/cluster-entry.yaml | 2 +- config/clusters/templates/gcp/cluster.yaml | 2 +- deployer/README.md | 1 - .../generate/dedicated_cluster/gcp.py | 6 +- .../generate/hub_asset/cluster_entry.py | 1 - eksctl/Udder-Cranberry1-Various | 10 -- eksctl/ubc-eoas copy.jsonnet | 136 ------------------ .../gcp/projects/cluster.tfvars.template | 2 +- 8 files changed, 8 insertions(+), 152 deletions(-) delete mode 100644 eksctl/Udder-Cranberry1-Various delete mode 100644 eksctl/ubc-eoas copy.jsonnet diff --git a/config/clusters/templates/common/cluster-entry.yaml b/config/clusters/templates/common/cluster-entry.yaml index ca4642fb2..13008d601 100644 --- a/config/clusters/templates/common/cluster-entry.yaml +++ b/config/clusters/templates/common/cluster-entry.yaml @@ -2,7 +2,7 @@ hubs: - name: {{ hub_name }} display_name: {{ cluster_name }} {{ hub_name }} domain: {{ hub_name }}.{{ cluster_name }}.2i2c.cloud - helm_chart: {{ hub_type }} + helm_chart: "basehub" helm_chart_values_files: - common.values.yaml - {{ hub_name }}.values.yaml diff --git a/config/clusters/templates/gcp/cluster.yaml b/config/clusters/templates/gcp/cluster.yaml index 0f56cf3b6..470a647c9 100644 --- a/config/clusters/templates/gcp/cluster.yaml +++ b/config/clusters/templates/gcp/cluster.yaml @@ -31,7 +31,7 @@ hubs: [] # # Tip: consider changing this to something more human friendly # display_name: "{{ cluster_name }} - " # domain: .{{ 
cluster_name }}.2i2c.cloud - # helm_chart: {{ hub_type }} + # helm_chart: basehub # helm_chart_values_files: # - common.values.yaml # - .values.yaml diff --git a/deployer/README.md b/deployer/README.md index 141dabfa8..95b14fa0b 100644 --- a/deployer/README.md +++ b/deployer/README.md @@ -260,7 +260,6 @@ for a GCP cluster. - `cluster_name` - the name of the cluster - `cluster_region`- the region where the cluster will be deployed - `project_id` - the project ID of the GCP project - - `hub_type` (basehub/daskhub) - whether the hub deployed there would need dask or not - `hub_name` - the name of the first hub which will be deployed in the cluster (usually `staging`) The templates have a set of default features and define some opinionated characteristics for the cluster. diff --git a/deployer/commands/generate/dedicated_cluster/gcp.py b/deployer/commands/generate/dedicated_cluster/gcp.py index c5622061a..532769485 100644 --- a/deployer/commands/generate/dedicated_cluster/gcp.py +++ b/deployer/commands/generate/dedicated_cluster/gcp.py @@ -63,6 +63,10 @@ def gcp( project_id: str = typer.Option( ..., prompt="Please insert the Project ID of the GCP project" ), + dask_nodes: bool = typer.Option( + False, + prompt='If this cluster needs dask nodes, please type "y", otherwise hit ENTER.', + ), force: bool = typer.Option( False, "--force", @@ -79,7 +83,7 @@ def gcp( # Also store the provider, as it's useful for some jinja templates # to differentiate between them when rendering the configuration "provider": "gcp", - "hub_type": "basehub", + "dask_nodes": dask_nodes, "cluster_name": cluster_name, "cluster_region": cluster_region, "project_id": project_id, diff --git a/deployer/commands/generate/hub_asset/cluster_entry.py b/deployer/commands/generate/hub_asset/cluster_entry.py index 3f9d094bb..28132dc95 100644 --- a/deployer/commands/generate/hub_asset/cluster_entry.py +++ b/deployer/commands/generate/hub_asset/cluster_entry.py @@ -18,7 +18,6 @@ def cluster_entry( """ vars = { 
- "hub_type": "basehub", "cluster_name": cluster_name, "hub_name": hub_name, } diff --git a/eksctl/Udder-Cranberry1-Various b/eksctl/Udder-Cranberry1-Various deleted file mode 100644 index 1f5a57054..000000000 --- a/eksctl/Udder-Cranberry1-Various +++ /dev/null @@ -1,10 +0,0 @@ -Udder-Cranberry1-Various - - - -ghg: Gloater-Stash3-Vice - -A6yK$EJ7v#^vaLPhX3&hJ3&n9tKM^2ga - - -Semicolon6-Area \ No newline at end of file diff --git a/eksctl/ubc-eoas copy.jsonnet b/eksctl/ubc-eoas copy.jsonnet deleted file mode 100644 index 975f4d256..000000000 --- a/eksctl/ubc-eoas copy.jsonnet +++ /dev/null @@ -1,136 +0,0 @@ -/* - This file is a jsonnet template of a eksctl's cluster configuration file, - that is used with the eksctl CLI to both update and initialize an AWS EKS - based cluster. - - This file has in turn been generated from eksctl/template.jsonnet which is - relevant to compare with for changes over time. - - To use jsonnet to generate an eksctl configuration file from this, do: - - jsonnet ubc-eoas.jsonnet > ubc-eoas.eksctl.yaml - - References: - - https://eksctl.io/usage/schema/ -*/ -local ng = import "./libsonnet/nodegroup.jsonnet"; - -// place all cluster nodes here -local clusterRegion = "ca-central-1"; -local masterAzs = ["ca-central-1a", "ca-central-1b", "ca-central-1d"]; -local nodeAz = "ca-central-1a"; - -// Node definitions for notebook nodes. Config here is merged -// with our notebook node definition. 
-// A `node.kubernetes.io/instance-type label is added, so pods -// can request a particular kind of node with a nodeSelector -local notebookNodes = [ - { instanceType: "r5.xlarge" }, - { instanceType: "r5.2xlarge" }, - { instanceType: "r5.4xlarge" }, - { instanceType: "r5.16xlarge" }, -]; - -local daskNodes = []; - - -{ - apiVersion: 'eksctl.io/v1alpha5', - kind: 'ClusterConfig', - metadata+: { - name: "ubc-eoas", - region: clusterRegion, - version: "1.30", - }, - availabilityZones: masterAzs, - iam: { - withOIDC: true, - }, - // If you add an addon to this config, run the create addon command. - // - // eksctl create addon --config-file=ubc-eoas.eksctl.yaml - // - addons: [ - { - // aws-ebs-csi-driver ensures that our PVCs are bound to PVs that - // couple to AWS EBS based storage, without it expect to see pods - // mounting a PVC failing to schedule and PVC resources that are - // unbound. - // - // Related docs: https://docs.aws.amazon.com/eks/latest/userguide/managing-ebs-csi.html - // - name: 'aws-ebs-csi-driver', - version: "latest", - wellKnownPolicies: { - ebsCSIController: true, - }, - }, - ], - nodeGroups: [ - n + {clusterName: $.metadata.name} for n in - [ - ng + { - namePrefix: 'core', - nameSuffix: 'a', - nameIncludeInstanceType: false, - availabilityZones: [nodeAz], - ssh: { - publicKeyPath: 'ssh-keys/ubc-eoas.key.pub' - }, - instanceType: "m5.xlarge", - minSize: 1, - maxSize: 6, - labels+: { - "hub.jupyter.org/node-purpose": "core", - "k8s.dask.org/node-purpose": "core" - }, - tags+: { - "2i2c:node-purpose": "core" - }, - }, - ] + [ - ng + { - namePrefix: "nb", - availabilityZones: [nodeAz], - minSize: 0, - maxSize: 500, - instanceType: n.instanceType, - ssh: { - publicKeyPath: 'ssh-keys/ubc-eoas.key.pub' - }, - labels+: { - "hub.jupyter.org/node-purpose": "user", - "k8s.dask.org/node-purpose": "scheduler" - }, - taints+: { - "hub.jupyter.org_dedicated": "user:NoSchedule", - "hub.jupyter.org/dedicated": "user:NoSchedule" - }, - } + n for n in 
notebookNodes - ] + ( if daskNodes != null then - [ - ng + { - namePrefix: "dask", - availabilityZones: [nodeAz], - minSize: 0, - maxSize: 500, - ssh: { - publicKeyPath: 'ssh-keys/ubc-eoas.key.pub' - }, - labels+: { - "k8s.dask.org/node-purpose": "worker" - }, - taints+: { - "k8s.dask.org_dedicated" : "worker:NoSchedule", - "k8s.dask.org/dedicated" : "worker:NoSchedule" - }, - instancesDistribution+: { - onDemandBaseCapacity: 0, - onDemandPercentageAboveBaseCapacity: 0, - spotAllocationStrategy: "capacity-optimized", - }, - } + n for n in daskNodes - ] else [] - ) - ] -} diff --git a/terraform/gcp/projects/cluster.tfvars.template b/terraform/gcp/projects/cluster.tfvars.template index 7ad5cd658..d521896ea 100644 --- a/terraform/gcp/projects/cluster.tfvars.template +++ b/terraform/gcp/projects/cluster.tfvars.template @@ -78,7 +78,7 @@ notebook_nodes = { } } -{% if hub_type == "daskhub" %} +{% if dask_nodes %} dask_nodes = { # A not yet fully established policy is being developed about using a single # node pool, see https://github.com/2i2c-org/infrastructure/issues/2687.