Initial commit of JMTE hub
- Uses 'jupyter-meets-the-earth' rather than 'jmte' as the name,
  because the existing cluster is already called 'jmte'.
- SFTP service is gone!
- Replicates config from
https://github.com/2i2c-org/infrastructure/pull/436/files
  to the extent possible
- Uses our IRSA config for AWS permissions, rather than the
  eksctl-created service account in use earlier.
- Uses CILogon+GitHub for authentication, rather than Auth0+GitHub
- Re-uses the same EFS filesystem as before, avoiding the need to
  copy a few terabytes of data around
- Hub is now at jmte.2i2c.cloud, and the old URL
  (hub.jupytearth.org) redirects here. Same for staging.

Ref 2i2c-org#2201
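A quick way to sanity-check the redirect mentioned above (a minimal Python sketch; assumes the old domain answers with a plain HTTP 30x redirect):

# Sketch: confirm that an old hub URL redirects to the new one.
import http.client

old_host = "hub.jupytearth.org"  # the URL expected to redirect, per the message above
conn = http.client.HTTPSConnection(old_host)
conn.request("HEAD", "/")
response = conn.getresponse()
print(response.status, response.getheader("Location"))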
yuvipanda committed Apr 10, 2023
1 parent dba8b14 commit 5f14abb
Showing 13 changed files with 793 additions and 0 deletions.
27 changes: 27 additions & 0 deletions config/clusters/jupyter-meets-the-earth/cluster.yaml
@@ -0,0 +1,27 @@
name: jupyter-meets-the-earth
provider: aws
aws:
  key: enc-deployer-credentials.secret.json
  clusterType: eks
  clusterName: jupyter-meets-the-earth
  region: us-west-2
support:
  helm_chart_values_files:
    - support.values.yaml
    - enc-support.secret.values.yaml
hubs:
  - name: staging
    domain: staging.hub.jupytearth.org
    helm_chart: daskhub
    helm_chart_values_files:
      - common.values.yaml
      - staging.values.yaml
      - enc-staging.secret.values.yaml
  - name: prod
    display_name: "Jupyter Meets the Earth"
    domain: hub.jupytearth.org
    helm_chart: daskhub
    helm_chart_values_files:
      - common.values.yaml
      - prod.values.yaml
      - enc-prod.secret.values.yaml
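For orientation, a minimal sketch of reading this file programmatically (assumes PyYAML; the 2i2c deployer's own loading logic may differ):

# Sketch: enumerate the hubs defined in cluster.yaml.
import yaml

with open("config/clusters/jupyter-meets-the-earth/cluster.yaml") as f:
    cluster = yaml.safe_load(f)

print(cluster["name"], "on", cluster["provider"])
for hub in cluster["hubs"]:
    print(f"{hub['name']}: https://{hub['domain']} ({hub['helm_chart']})")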
340 changes: 340 additions & 0 deletions config/clusters/jupyter-meets-the-earth/common.values.yaml
@@ -0,0 +1,340 @@
basehub:
  nfs:
    # enabled is adjusted by staging/prod values
    # enabled: true
    shareCreator:
      enabled: true
    pv:
      serverIP: fs-01707b06.efs.us-west-2.amazonaws.com
      # mountOptions from https://docs.aws.amazon.com/efs/latest/ug/mounting-fs-nfs-mount-settings.html
      mountOptions:
        - rsize=1048576
        - wsize=1048576
        - timeo=600
        - soft # We pick soft over hard, so NFS lockups don't lead to hung processes
        - retrans=2
        - noresvport
      # baseShareName must be just "/" so that the PV we use to access the
      # NFS server can successfully reference the various sub folders we
      # create in the filesystem: referencing a folder that doesn't yet
      # exist isn't supported. The folder creation is automated by the
      # nfs-share-creator resource that is part of the basehub Helm chart.
      baseShareName: /

  jupyterhub:
    custom:
      homepage:
        templateVars:
          org:
            name: Jupyter meets the Earth
            logo_url: https://pangeo-data.github.io/jupyter-earth/_static/jupyter-earth.png
            url: https://jupytearth.org
          designed_by:
            name: 2i2c
            url: https://2i2c.org
          operated_by:
            name: 2i2c
            url: https://2i2c.org
          funded_by:
            name: Jupyter meets the Earth
            url: https://jupytearth.org

    singleuser:
      # extraFiles ref: https://zero-to-jupyterhub.readthedocs.io/en/latest/resources/reference.html#singleuser-extrafiles
      extraFiles:
        jupyter_notebook_config.json:
          mountPath: /etc/jupyter/jupyter_notebook_config.json
          data:
            # Allow the JupyterLab option to show hidden files in the browser
            # https://github.com/berkeley-dsep-infra/datahub/issues/3160
            ContentsManager:
              allow_hidden: true
      initContainers:
        # Need to explicitly fix ownership here, since EFS doesn't do anonuid
        - name: volume-mount-ownership-fix
          image: busybox
          command:
            [
              "sh",
              "-c",
              "id && chown 1000:1000 /home/jovyan /home/jovyan/shared /home/jovyan/shared-public && ls -lhd /home/jovyan",
            ]
          securityContext:
            runAsUser: 0
          volumeMounts:
            - name: home
              mountPath: /home/jovyan
              subPath: "{username}"
            - name: home
              mountPath: /home/jovyan/shared
              subPath: _shared
            - name: home
              mountPath: /home/jovyan/shared-public
              subPath: _shared_public

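To make the mapping concrete: with baseShareName "/", the nfs-share-creator prepares sub folders on the EFS root, which the subPath values above then reference. A rough sketch of the implied layout (directory names inferred from this config, not from the basehub chart's actual implementation):

# Sketch: the folder layout implied by the subPath values above, as seen
# from the NFS root. Example usernames are hypothetical.
from pathlib import Path

nfs_root = Path("/")  # baseShareName
for username in ["abbyazari", "fperez"]:
    print(nfs_root / username)       # mounted at /home/jovyan for that user
print(nfs_root / "_shared")          # mounted at /home/jovyan/shared (read-only)
print(nfs_root / "_shared_public")   # mounted at /home/jovyan/shared-public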
      # /dev/shm is mounted as a filesystem path where writing means writing
      # to memory.
      #
      # How to: https://stackoverflow.com/questions/46085748/define-size-for-dev-shm-on-container-engine/46434614#46434614
      # Request for this by Ellie: https://fperezgroup.slack.com/archives/C020XCEFPEH/p1658168872788389
      #
      storage:
        extraVolumes:
          - name: dev-shm
            emptyDir:
              medium: Memory
        extraVolumeMounts:
          - name: dev-shm
            mountPath: /dev/shm
          # FIXME: We override the extraVolumeMounts list that is also set
          #        in the basehub chart, so we need to re-add its entries
          #        here. An option is to add hub.extraConfig entries that
          #        append the kubespawner configuration to include these
          #        extra volume mounts.
          #
          - name: home
            mountPath: /home/jovyan/shared
            subPath: _shared
            readOnly: true
          - name: home
            mountPath: /home/jovyan/shared-public
            subPath: _shared_public
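Writes to this memory-backed mount count against pod memory, not disk. For example, Python's multiprocessing.shared_memory allocates its segments under /dev/shm on Linux (a standalone sketch, not part of this config):

# Sketch: allocate a shared-memory block; on Linux it lives under /dev/shm,
# so it consumes memory rather than disk.
from multiprocessing import shared_memory

shm = shared_memory.SharedMemory(create=True, size=64 * 1024 * 1024)  # 64 MiB
try:
    shm.buf[:5] = b"hello"
    print(shm.name)  # visible as /dev/shm/<name>
finally:
    shm.close()
    shm.unlink()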

      # Increased as we have experienced a too-slow image pull at least
      # once. Our pods can take ~6-7 minutes to start on a new node, so
      # this gives us some margin.
      startTimeout: 1200

      extraEnv:
        GH_SCOPED_CREDS_APP_URL: https://github.com/apps/hub-jupytearth-org-github-integ
        GH_SCOPED_CREDS_CLIENT_ID: Iv1.a073b1649637af12

        # FIXME: Until we can set this just for the GPU nodes, we need to
        #        set it for everyone.
        NVIDIA_DRIVER_CAPABILITIES: compute,utility

      image:
        # NOTE: We use the jupyterhub-configurator, so this image/tag is not
        #       relevant. Visit its UI to configure the hub.
        #
        #       staging: https://staging.hub.jupytearth.org/services/configurator/
        #       prod: https://hub.jupytearth.org/services/configurator/
        pullPolicy: Always
        name: 286354552638.dkr.ecr.us-west-2.amazonaws.com/jmte/user-env
        tag: "latest"

      profileList:
        - display_name: "16th of Medium: 0.25-4 CPU, 1-16 GB"
          default: True
          description: "A shared machine, the recommended option until you experience a limitation."
          kubespawner_override:
            cpu_guarantee: 0.225
            mem_guarantee: 0.875G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: m5.xlarge
        - display_name: "4th of Medium: 1-4 CPU, 4-16 GB"
          description: "A shared machine."
          kubespawner_override:
            cpu_guarantee: 0.875
            mem_guarantee: 3.5G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: m5.xlarge
        - display_name: "Medium: 4 CPU, 16 GB"
          description: "A dedicated machine for you."
          kubespawner_override:
            cpu_guarantee: 3.5
            mem_guarantee: 14G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: m5.xlarge
        - display_name: "Large: 16 CPU, 64 GB"
          description: "A dedicated machine for you."
          kubespawner_override:
            mem_guarantee: 56G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: m5.4xlarge
        - display_name: "Massive: 64 CPU, 256 GB"
          description: "A dedicated machine for you."
          kubespawner_override:
            mem_guarantee: 224G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: m5.16xlarge
        - display_name: "Massive high-memory: 64 CPU, 976 GB"
          description: "A dedicated machine for you."
          kubespawner_override:
            mem_guarantee: 900G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: x1.16xlarge
        - display_name: "Medium GPU: 4 CPU, 16 GB, 1 T4 Tensor Core GPU"
          description: "A dedicated machine for you with one GPU attached."
          kubespawner_override:
            cpu_guarantee: 3.5
            mem_guarantee: 14G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: g4dn.xlarge
            extra_resource_limits:
              nvidia.com/gpu: "1"
        - display_name: "Large GPU: 16 CPU, 64 GB, 1 T4 Tensor Core GPU"
          description: "A dedicated machine for you with one GPU attached."
          kubespawner_override:
            mem_guarantee: 56G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: g4dn.4xlarge
            extra_resource_limits:
              nvidia.com/gpu: "1"
        - display_name: "Massive GPU: 64 CPU, 256 GB, 1 T4 Tensor Core GPU"
          description: "A dedicated machine for you with one GPU attached."
          kubespawner_override:
            mem_guarantee: 224G
            mem_limit: null
            node_selector:
              node.kubernetes.io/instance-type: g4dn.16xlarge
            extra_resource_limits:
              nvidia.com/gpu: "1"
        - display_name: "16th of Medium: 0.25-4 CPU, 1-16 GB - Test of latest image"
          description: "Helps us test an image before we make it the default"
          kubespawner_override:
            image: 286354552638.dkr.ecr.us-west-2.amazonaws.com/jmte/user-env:latest
            image_pull_policy: Always
            cpu_guarantee: 0.225
            mem_guarantee: 0.875G
            node_selector:
              node.kubernetes.io/instance-type: m5.xlarge
            mem_limit: null
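Most guarantees above follow one pattern: mem_guarantee is 0.875 × the profile's nominal share, leaving node headroom for system pods. The CPU guarantees are hand-rounded (0.225 rather than 0.875 × 0.25), and the high-memory profile deviates (900G of 976). A sketch of the arithmetic:

# Sketch: reproduce the mem_guarantee values from each profile's nominal
# share using the 0.875 factor visible above.
profiles = {
    "16th of Medium": (0.25, 1),
    "4th of Medium": (1, 4),
    "Medium": (4, 16),
    "Large": (16, 64),
    "Massive": (64, 256),
}
for name, (nominal_cpu, nominal_mem_gb) in profiles.items():
    print(f"{name}: mem_guarantee = {0.875 * nominal_mem_gb:g}G")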

    hub:
      config:
        JupyterHub:
          authenticator_class: cilogon
        CILogonOAuthenticator:
          scope:
            - "profile"
          username_claim: "preferred_username"
          # Only show the option to login with GitHub
          shown_idps:
            - http://github.com/login/oauth/authorize
        Authenticator:
          allowed_users: &users
            # This is just listing a few of the users/admins; a lot of
            # users have been added manually, see:
            # https://github.com/pangeo-data/jupyter-earth/issues/53
            - abbyazari # Abby Azari
            - andersy005 # Anderson Banihirwe
            - consideratio # Erik Sundell
            - choldgraf # Chris Holdgraf
            - elliesch # Ellie Abrahams
            - EMscience # Edom Moges
            - espg # Shane Grigsby
            - facusapienza21 # Facundo Sapienza
            - fperez # Fernando Pérez
            - kmpaul # Kevin Paul
            - lrennels # Lisa Rennels
            - mrsiegfried # Matthew Siegfried
            - tsnow03 # Tasha Snow
            - whyjz # Whyjay Zheng
            - yuvipanda # Yuvi Panda
            - jonathan-taylor # Jonathan Taylor
          admin_users: *users
      allowNamedServers: true
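The &users / *users pair is plain YAML anchor-and-alias syntax, so admin_users always stays in sync with allowed_users. A minimal sketch of the mechanism (assumes PyYAML):

# Sketch: a YAML alias (*) resolves to the very same list as its anchor (&),
# so a name added under allowed_users is automatically an admin here too.
import yaml

doc = """
allowed_users: &users
  - consideratio
  - yuvipanda
admin_users: *users
"""
parsed = yaml.safe_load(doc)
print(parsed["admin_users"] is parsed["allowed_users"])  # True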

dask-gateway:
  gateway:
    backend:
      scheduler:
        # IMPORTANT: We have experienced that the scheduler can fail with a
        #            1GB memory limit. This was observed as "stream closed"
        #            errors in the Python client working against the
        #            Dask-Gateway created DaskCluster:
        #
        #            CommClosedError: in <TLS (closed) ConnectionPool.gather local=tls://192.168.40.210:54296 remote=gateway://traefik-prod-dask-gateway.prod:80/prod.b9600f678bb747c1a5f038b5bef3eb90>: Stream is closed
        #
        cores:
          request: 1
          limit: 64
        memory:
          request: 2G
          limit: 500G
        extraPodConfig:
          nodeSelector:
            hub.jupyter.org/node-purpose: user
            k8s.dask.org/node-purpose: null
          # serviceAccountName is adjusted by staging/prod values
          # serviceAccountName: *user-sa
      worker:
        extraPodConfig:
          nodeSelector:
            k8s.dask.org/node-purpose: worker
          # serviceAccountName is adjusted by staging/prod values
          # serviceAccountName: *user-sa

    # Note that we are overriding options provided in 2i2c's helm chart that
    # provides default values for these config entries.
    #
    extraConfig:
      # This configuration represents options that can be presented to users
      # that want to create a Dask cluster using dask-gateway. For more
      # details, see https://gateway.dask.org/cluster-options.html
      #
      # The goal is to provide a simple configuration that allows the user
      # some flexibility while also fitting well on AWS nodes that all have
      # a 1:4 ratio between CPU and GB of memory. By providing the username
      # label, we help administrators to track user pods.
      option_handler: |
        from dask_gateway_server.options import Options, Select, String, Mapping

        def cluster_options(user):
            def option_handler(options):
                if ":" not in options.image:
                    raise ValueError("When specifying an image you must also provide a tag")
                extra_labels = {}
                extra_annotations = {
                    "prometheus.io/scrape": "true",
                    "prometheus.io/port": "8787",
                }
                chosen_worker_cpu = int(options.worker_specification.split("CPU")[0])
                chosen_worker_memory = 4 * chosen_worker_cpu
                # We multiply the requests by a fraction to ensure that the
                # workers fit well within a node that needs some resources
                # reserved for system pods.
                return {
                    # A default image is suggested via the
                    # DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE env variable
                    "image": options.image,
                    "scheduler_extra_pod_labels": extra_labels,
                    "scheduler_extra_pod_annotations": extra_annotations,
                    "worker_extra_pod_labels": extra_labels,
                    "worker_extra_pod_annotations": extra_annotations,
                    "worker_cores": 0.85 * chosen_worker_cpu,
                    "worker_cores_limit": chosen_worker_cpu,
                    "worker_memory": "%fG" % (0.85 * chosen_worker_memory),
                    "worker_memory_limit": "%fG" % chosen_worker_memory,
                    "environment": options.environment,
                }

            return Options(
                Select(
                    "worker_specification",
                    [
                        "1CPU, 4GB",
                        "2CPU, 8GB",
                        "4CPU, 16GB",
                        "8CPU, 32GB",
                        "16CPU, 64GB",
                        "32CPU, 128GB",
                        "64CPU, 256GB",
                    ],
                    default="1CPU, 4GB",
                    label="Worker specification",
                ),
                # The default image is set via the
                # DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE env variable
                String("image", label="Image"),
                Mapping("environment", {}, label="Environment variables"),
                handler=option_handler,
            )

        c.Backend.cluster_options = cluster_options
      idle: |
        # timeout after 30 minutes of inactivity
        c.KubeClusterConfig.idle_timeout = 1800
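From a user server on the hub, these options surface through the dask-gateway client roughly as follows (a sketch of standard dask-gateway usage, not code in this commit):

# Sketch: consume the cluster options defined above from a hub user server.
# The default image comes via DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE.
from dask_gateway import Gateway

gateway = Gateway()
options = gateway.cluster_options()
options.worker_specification = "4CPU, 16GB"  # -> 3.4 cores / 13.6G requests, 4 cores / 16G limits
cluster = gateway.new_cluster(options)
cluster.scale(4)
client = cluster.get_client()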
25 changes: 25 additions & 0 deletions config/clusters/jupyter-meets-the-earth/enc-deployer-credentials.secret.json
@@ -0,0 +1,25 @@
{
  "AccessKey": {
    "AccessKeyId": "ENC[AES256_GCM,data:A3+Abzcvq+I2hZq2u4coAYzNjvk=,iv:B4kPrUIM8nx/VTrEQI+tUxEySkDDe6eZHJqAJ9B4YcU=,tag:PtO2TdNEJsaYY0nQyvTHSw==,type:str]",
    "SecretAccessKey": "ENC[AES256_GCM,data:gfFXGESHTJn6tiQUpMkpbpqNJJ43KxkNvYaH8V7sC5lRKUPl85Dw7w==,iv:krcKBzv/Wzu+jjtd9MJiTQvj6ELo2JHXird+mn0Vt5c=,tag:jv4YANW0drzpjpVekpmzqg==,type:str]",
    "UserName": "ENC[AES256_GCM,data:8fWApCCT7IL+9E6t0FkRS3XTaHDL+XA=,iv:/rsHbqCvzulMvT6Jzj20zqfOb39ojUWprFbn8359ozA=,tag:Nc1L5ufStyZMOUxI8xVrzA==,type:str]"
  },
  "sops": {
    "kms": null,
    "gcp_kms": [
      {
        "resource_id": "projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs",
        "created_at": "2023-04-07T13:38:22Z",
        "enc": "CiUA4OM7eGDmmlUnGoSPNr9unRpxJ7GEcQ5/pXY2SrvhODPp9JWFEkkALQgViOWaFqYsRFv2FP6kqShPvabRqOC6KTPai4WGjiuK10rHIgiBbGNAfwQdenfi/vBU3h0rslaKojCN2qO4H+TAb4LG7eyO"
      }
    ],
    "azure_kv": null,
    "hc_vault": null,
    "age": null,
    "lastmodified": "2023-04-07T13:38:23Z",
    "mac": "ENC[AES256_GCM,data:HD/8swJpKnpElskOZXFjkJW6SjTIKChIZtHTqqlYexrj1x/HqrkLaGdHAuWIijZ91SOjxWlQxY67RzbpiJgdxG7XUcokrHqs+mEaWV65XVS087jucZo2tVC86wBFwNe4smlAEj6AF8n2gq/UAQbWoBE4fo3Vm/ojzhStqlLL0aQ=,iv:rrI6EO+c1LONQAHbsG7/TfEGlrrlKfzuriO+g29DFno=,tag:ZJqRJHVKlXOI+5S6cpsFtg==,type:str]",
    "pgp": null,
    "unencrypted_suffix": "_unencrypted",
    "version": "3.7.3"
  }
}
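The ENC[AES256_GCM,...] payloads are sops ciphertext; with access to the GCP KMS key listed under gcp_kms, the file decrypts via the sops CLI. A sketch (assumes sops is installed; the file name is inferred from cluster.yaml's aws.key entry):

# Sketch: decrypt the sops-encrypted deployer credentials with the sops CLI.
# Requires GCP credentials authorized for the sops-keys/similar-hubs KMS key.
import json
import subprocess

result = subprocess.run(
    ["sops", "--decrypt",
     "config/clusters/jupyter-meets-the-earth/enc-deployer-credentials.secret.json"],
    capture_output=True, check=True, text=True,
)
print(json.loads(result.stdout)["AccessKey"]["UserName"])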
