Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrate JMTE hub to our existing infrastructure #2474

Merged
merged 6 commits into from
Apr 11, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions config/clusters/jupyter-meets-the-earth/cluster.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
name: jupyter-meets-the-earth
yuvipanda marked this conversation as resolved.
Show resolved Hide resolved
provider: aws
aws:
key: enc-deployer-credentials.secret.json
clusterType: eks
clusterName: jupyter-meets-the-earth
region: us-west-2
support:
helm_chart_values_files:
- support.values.yaml
- enc-support.secret.values.yaml
hubs:
- name: staging
domain: staging.hub.jupytearth.org
helm_chart: daskhub
helm_chart_values_files:
- common.values.yaml
- staging.values.yaml
- enc-staging.secret.values.yaml
- name: prod
display_name: "Jupyter Meets the Earth"
domain: hub.jupytearth.org
yuvipanda marked this conversation as resolved.
Show resolved Hide resolved
helm_chart: daskhub
helm_chart_values_files:
- common.values.yaml
- prod.values.yaml
- enc-prod.secret.values.yaml
319 changes: 319 additions & 0 deletions config/clusters/jupyter-meets-the-earth/common.values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,319 @@
basehub:
nfs:
pv:
# from https://docs.aws.amazon.com/efs/latest/ug/mounting-fs-nfs-mount-settings.html
mountOptions:
- rsize=1048576
- wsize=1048576
- timeo=600
- soft # We pick soft over hard, so NFS lockups don't lead to hung processes
- retrans=2
- noresvport
serverIP: fs-01707b06.efs.us-west-2.amazonaws.com
      # This is different from the rest of our hubs!
baseShareName: /

jupyterhub:
custom:
homepage:
templateVars:
org:
name: Jupyter meets the Earth
logo_url: https://pangeo-data.github.io/jupyter-earth/_static/jupyter-earth.png
url: https://jupytearth.org
designed_by:
name: 2i2c
url: https://2i2c.org
operated_by:
name: 2i2c
url: https://2i2c.org
funded_by:
name: Jupyter meets the Earth
url: https://jupytearth.org

singleuser:
extraFiles:
jupyter_server_config.json:
mountPath: /etc/jupyter/jupyter_notebook_config.json
data:
# Allow jupyterlab option to show hidden files in browser
# https://github.com/berkeley-dsep-infra/datahub/issues/3160
ContentsManager:
allow_hidden: true
initContainers:
# Need to explicitly set this up and copy what's in basehub/values.yaml
# as we have an extra 'shared-public' directory here.
- name: volume-mount-ownership-fix
image: busybox
command:
[
"sh",
"-c",
"id && chown 1000:1000 /home/jovyan /home/jovyan/shared /home/jovyan/shared-public && ls -lhd /home/jovyan",
]
securityContext:
runAsUser: 0
volumeMounts:
- name: home
mountPath: /home/jovyan
subPath: "{username}"
- name: home
mountPath: /home/jovyan/shared
subPath: _shared
- name: home
mountPath: /home/jovyan/shared-public
subPath: _shared_public

# /dev/shm is mounted as a filesystem path, where writing to it means to
# write to memory.
#
# How to: https://stackoverflow.com/questions/46085748/define-size-for-dev-shm-on-container-engine/46434614#46434614
# Request for this by Ellie: https://fperezgroup.slack.com/archives/C020XCEFPEH/p1658168872788389
storage:
extraVolumes:
- name: dev-shm
emptyDir:
medium: Memory
extraVolumeMounts:
- name: dev-shm
mountPath: /dev/shm
# FIXME: we override the list extraVolumeMounts which is also set in
          #                the basehub chart, due to that, we need to add this here
# as well. An option is to add hub.extraConfig entries that
# append the kubespawner configuration to include these extra
# volume mounts.
#
- name: home
mountPath: /home/jovyan/shared
subPath: _shared
readOnly: true
- name: home
mountPath: /home/jovyan/shared-public
subPath: _shared_public

# Increased as we have experienced a too slow image pull at least
# once. Our pods can take ~6-7 minutes to start on a new node it
# seems, so this gives us some margin.
startTimeout: 1200

extraEnv:
GH_SCOPED_CREDS_APP_URL: https://github.com/apps/hub-jupytearth-org-github-integ
GH_SCOPED_CREDS_CLIENT_ID: Iv1.a073b1649637af12

        # FIXME: Until we can set this just for the GPU nodes, we need to set it for everyone
NVIDIA_DRIVER_CAPABILITIES: compute,utility

image:
# NOTE: We use the jupyterhub-configurator so this image/tag is not
# relevant. Visit its UI to configure the hub.
#
# staging: https://staging.hub.jupytearth.org/services/configurator/
# prod: https://hub.jupytearth.org/services/configurator/
pullPolicy: Always
name: 286354552638.dkr.ecr.us-west-2.amazonaws.com/jmte/user-env
tag: "latest"

profileList:
- display_name: "16th of Medium: 0.25-4 CPU, 1-16 GB"
default: True
description: "A shared machine, the recommended option until you experience a limitation."
kubespawner_override:
cpu_guarantee: 0.225
mem_guarantee: 0.875G
mem_limit: null
node_selector:
node.kubernetes.io/instance-type: m5.xlarge
- display_name: "4th of Medium: 1-4 CPU, 4-16 GB"
description: "A shared machine."
kubespawner_override:
cpu_guarantee: 0.875
mem_guarantee: 3.5G
mem_limit: null
node_selector:
node.kubernetes.io/instance-type: m5.xlarge
- display_name: "Medium: 4 CPU, 16 GB"
description: "A dedicated machine for you."
kubespawner_override:
cpu_guarantee: 3.5
mem_guarantee: 14G
mem_limit: null
node_selector:
node.kubernetes.io/instance-type: m5.xlarge
- display_name: "Large: 16 CPU, 64 GB"
description: "A dedicated machine for you."
kubespawner_override:
mem_guarantee: 56G
mem_limit: null
node_selector:
node.kubernetes.io/instance-type: m5.4xlarge
- display_name: "Massive: 64 CPU, 256 GB"
description: "A dedicated machine for you."
kubespawner_override:
mem_guarantee: 224G
mem_limit: null
node_selector:
node.kubernetes.io/instance-type: m5.16xlarge
- display_name: "Massive high-memory: 64 CPU, 976 GB"
description: "A dedicated machine for you."
kubespawner_override:
mem_guarantee: 900G
mem_limit: null
node_selector:
node.kubernetes.io/instance-type: x1.16xlarge
- display_name: "Medium GPU: 4 CPU, 16 GB, 1 T4 Tensor Core GPU"
description: "A dedicated machine for you with one GPU attached."
kubespawner_override:
cpu_guarantee: 3.5
mem_guarantee: 14G
mem_limit: null
node_selector:
node.kubernetes.io/instance-type: g4dn.xlarge
extra_resource_limits:
nvidia.com/gpu: "1"
- display_name: "Large GPU: 16 CPU, 64 GB, 1 T4 Tensor Core GPU"
description: "A dedicated machine for you with one GPU attached."
kubespawner_override:
mem_guarantee: 56G
mem_limit: null
node_selector:
node.kubernetes.io/instance-type: g4dn.4xlarge
extra_resource_limits:
nvidia.com/gpu: "1"
- display_name: "Massive GPU: 64 CPU, 256 GB, 1 T4 Tensor Core GPU"
description: "A dedicated machine for you with one GPU attached."
kubespawner_override:
mem_guarantee: 224G
mem_limit: null
node_selector:
node.kubernetes.io/instance-type: g4dn.16xlarge
extra_resource_limits:
nvidia.com/gpu: "1"
- display_name: "16th of Medium: 0.25-4 CPU, 1-16 GB - Test of latest image"
description: "Helps us test an image before we make it the default"
kubespawner_override:
image: 286354552638.dkr.ecr.us-west-2.amazonaws.com/jmte/user-env:latest
image_pull_policy: Always
cpu_guarantee: 0.225
mem_guarantee: 0.875G
node_selector:
node.kubernetes.io/instance-type: m5.xlarge
mem_limit: null

hub:
config:
JupyterHub:
authenticator_class: cilogon
CILogonOAuthenticator:
scope:
- "profile"
username_claim: "preferred_username"
# Only show the option to login with GitHub
shown_idps:
- http://github.com/login/oauth/authorize
Authenticator:
allowed_users: &users
# This is just listing a few of the users/admins, a lot of
            # users have been added manually, see:
# https://github.com/pangeo-data/jupyter-earth/issues/53
- abbyazari # Abby Azari
- andersy005 # Anderson Banihirwe
- consideratio # Erik Sundell
- choldgraf # Chris Holdgraf
- elliesch # Ellie Abrahams
- EMscience # Edom Moges
- espg # Shane Grigsby
- facusapienza21 # Facundo Sapienza
- fperez # Fernando Pérez
- kmpaul # Kevin Paul
- lrennels # Lisa Rennels
- mrsiegfried # Matthew Siegfried
- tsnow03 # Tasha Snow
- whyjz # Whyjay Zheng
- yuvipanda # Yuvi Panda
- jonathan-taylor # Jonathan Taylor
admin_users: *users
allowNamedServers: true

dask-gateway:
gateway:
backend:
scheduler:
# IMPORTANT: We have experienced that the scheduler can fail with
        #            1GB memory limit. This was observed as "stream closed"
# from the python client working against the
# Dask-Gateway created DaskCluster.
#
# CommClosedError: in <TLS (closed) ConnectionPool.gather local=tls://192.168.40.210:54296 remote=gateway://traefik-prod-dask-gateway.prod:80/prod.b9600f678bb747c1a5f038b5bef3eb90>: Stream is closed
#
cores:
request: 1
limit: 64
memory:
request: 2G
limit: 500G

# Note that we are overriding options provided in 2i2c's helm chart that has
# default values for these config entries.
#
extraConfig:
# This configuration represents options that can be presented to users
# that want to create a Dask cluster using dask-gateway. For more
# details, see https://gateway.dask.org/cluster-options.html
#
# The goal is to provide a simple configuration that allow the user some
      # flexibility while also fitting well on AWS nodes that are all
# having 1:4 ratio between CPU and GB of memory. By providing the
# username label, we help administrators to track user pods.
option_handler: |
from dask_gateway_server.options import Options, Select, String, Mapping
def cluster_options(user):
def option_handler(options):
if ":" not in options.image:
raise ValueError("When specifying an image you must also provide a tag")
extra_labels = {}
extra_annotations = {
"prometheus.io/scrape": "true",
"prometheus.io/port": "8787",
}
chosen_worker_cpu = int(options.worker_specification.split("CPU")[0])
chosen_worker_memory = 4 * chosen_worker_cpu
# We multiply the requests by a fraction to ensure that the
# worker fit well within a node that need some resources
# reserved for system pods.
return {
# A default image is suggested via DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE env variable
"image": options.image,
"scheduler_extra_pod_labels": extra_labels,
"scheduler_extra_pod_annotations": extra_annotations,
"worker_extra_pod_labels": extra_labels,
"worker_extra_pod_annotations": extra_annotations,
"worker_cores": 0.85 * chosen_worker_cpu,
"worker_cores_limit": chosen_worker_cpu,
"worker_memory": "%fG" % (0.85 * chosen_worker_memory),
"worker_memory_limit": "%fG" % chosen_worker_memory,
"environment": options.environment,
}
return Options(
Select(
"worker_specification",
[
"1CPU, 4GB",
"2CPU, 8GB",
"4CPU, 16GB",
"8CPU, 32GB",
"16CPU, 64GB",
"32CPU, 128GB",
"64CPU, 256GB",
],
default="1CPU, 4GB",
label="Worker specification",
),
# The default image is set via DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE env variable
String("image", label="Image"),
Mapping("environment", {}, label="Environment variables"),
handler=option_handler,
)
c.Backend.cluster_options = cluster_options
idle: |
# timeout after 30 minutes of inactivity
c.KubeClusterConfig.idle_timeout = 1800
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"AccessKey": {
"AccessKeyId": "ENC[AES256_GCM,data:A3+Abzcvq+I2hZq2u4coAYzNjvk=,iv:B4kPrUIM8nx/VTrEQI+tUxEySkDDe6eZHJqAJ9B4YcU=,tag:PtO2TdNEJsaYY0nQyvTHSw==,type:str]",
"SecretAccessKey": "ENC[AES256_GCM,data:gfFXGESHTJn6tiQUpMkpbpqNJJ43KxkNvYaH8V7sC5lRKUPl85Dw7w==,iv:krcKBzv/Wzu+jjtd9MJiTQvj6ELo2JHXird+mn0Vt5c=,tag:jv4YANW0drzpjpVekpmzqg==,type:str]",
"UserName": "ENC[AES256_GCM,data:8fWApCCT7IL+9E6t0FkRS3XTaHDL+XA=,iv:/rsHbqCvzulMvT6Jzj20zqfOb39ojUWprFbn8359ozA=,tag:Nc1L5ufStyZMOUxI8xVrzA==,type:str]"
},
"sops": {
"kms": null,
"gcp_kms": [
{
"resource_id": "projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs",
"created_at": "2023-04-07T13:38:22Z",
"enc": "CiUA4OM7eGDmmlUnGoSPNr9unRpxJ7GEcQ5/pXY2SrvhODPp9JWFEkkALQgViOWaFqYsRFv2FP6kqShPvabRqOC6KTPai4WGjiuK10rHIgiBbGNAfwQdenfi/vBU3h0rslaKojCN2qO4H+TAb4LG7eyO"
}
],
"azure_kv": null,
"hc_vault": null,
"age": null,
"lastmodified": "2023-04-07T13:38:23Z",
"mac": "ENC[AES256_GCM,data:HD/8swJpKnpElskOZXFjkJW6SjTIKChIZtHTqqlYexrj1x/HqrkLaGdHAuWIijZ91SOjxWlQxY67RzbpiJgdxG7XUcokrHqs+mEaWV65XVS087jucZo2tVC86wBFwNe4smlAEj6AF8n2gq/UAQbWoBE4fo3Vm/ojzhStqlLL0aQ=,iv:rrI6EO+c1LONQAHbsG7/TfEGlrrlKfzuriO+g29DFno=,tag:ZJqRJHVKlXOI+5S6cpsFtg==,type:str]",
"pgp": null,
"unencrypted_suffix": "_unencrypted",
"version": "3.7.3"
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
basehub:
jupyterhub:
hub:
config:
CILogonOAuthenticator:
client_id: ENC[AES256_GCM,data:pSlKv7EOrxXkt8Rhr2g1AmzK8r5chvTQNF9fcQclAiFJ4D4zrsgrKIAIMfBzK1qZaPqR,iv:bhyj+ytwpxHcpG782gwhxZ4T9qBYRuLzXF4kIczoM8w=,tag:QW7BUb87yxBLgQL2LaBelw==,type:str]
client_secret: ENC[AES256_GCM,data:DSoHfbfBHNroZ+c7+7BJIpUYfe5/RyuAj/uAgwCf51Q340WBow6X7nBSAKMTEgmzX2cOzYpjoWzQWXaM6IY21hmJJD9SjEJ1IY5kxmFqjahKGYvm9LM=,iv:Ze1SGQGcGw0mDQuoj7EVXnQTQAT/R/6T4/KH1n86orY=,tag:UfQQFNgUyyjcirl6ln52Sw==,type:str]
sops:
kms: []
gcp_kms:
- resource_id: projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs
created_at: "2023-04-10T11:12:30Z"
enc: CiUA4OM7eITj8Go7yW8sabmF/ng6BAsGlrokT4eIYmff/PGx3HhhEkkALQgViBPoYsyjX82iMTE9MRaMY5Cp+4YwuGD9beVZCCRDZfyU9xkj+qHWH/6cr23FQ5iWmlmONLTeigXAWuV4nrGiQgqVAZuM
azure_kv: []
hc_vault: []
age: []
lastmodified: "2023-04-10T11:12:31Z"
mac: ENC[AES256_GCM,data:M/Q9+8zZjf4oX6WsAsXF/56WUrbce9QDEwYt6zWQ1yVZEujeIhmZ1Dw4OgEGeDGA8UdhGAlBH9hmpUtauQeMgXF3ajnO8S+5OscXsOueEAU8syNkGWpaI3r2U4ipm6ud76sf1juTPg7ia0wzLEb8kTx6qQvoNCyUYjB8Qoar42k=,iv:FiXQdIxf4sF06CWs5BctO47WvIe6bDy7rnmlbc1AlDc=,tag:NTAkSySc7Lsy4QhVVmPhMQ==,type:str]
pgp: []
unencrypted_suffix: _unencrypted
version: 3.7.3
Loading