diff --git a/.github/workflows/quarto-docs.yml b/.github/workflows/quarto-docs.yml new file mode 100644 index 000000000..a4dfb9d76 --- /dev/null +++ b/.github/workflows/quarto-docs.yml @@ -0,0 +1,27 @@ +on: + workflow_dispatch: + push: + paths: + - 'docs/**' + +name: Quarto Publish + +jobs: + build-deploy: + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Quarto + uses: quarto-dev/quarto-actions/setup@v2 + + - name: Render and Publish + uses: quarto-dev/quarto-actions/publish@v2 + with: + path: ./docs + target: gh-pages + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/CNAME b/CNAME new file mode 100644 index 000000000..392499b2c --- /dev/null +++ b/CNAME @@ -0,0 +1 @@ +docs.datahub.berkeley.edu diff --git a/deployments/ugr01/config/common.yaml b/deployments/ugr01/config/common.yaml index f0343e413..9a897abec 100644 --- a/deployments/ugr01/config/common.yaml +++ b/deployments/ugr01/config/common.yaml @@ -9,9 +9,9 @@ jupyterhub: nodeSelector: hub.jupyter.org/pool-name: core-pool-2024-07-07 proxy: - chp: - nodeSelector: - hub.jupyter.org/pool-name: core-pool-2024-07-07 +# chp: +# nodeSelector: +# hub.jupyter.org/pool-name: core-pool-2024-07-07 service: type: LoadBalancer @@ -48,7 +48,7 @@ jupyterhub: # Unset NotebookApp from hub/values. Necessary for recent lab versions. JUPYTERHUB_SINGLEUSER_APP: "jupyter_server.serverapp.ServerApp" nodeSelector: - hub.jupyter.org/pool-name: user-ugr01 + hub.jupyter.org/pool-name: ugr01-pool storage: type: static static: diff --git a/deployments/ugr01/config/staging.yaml b/deployments/ugr01/config/staging.yaml index 20f4cb24a..3f446e5ab 100644 --- a/deployments/ugr01/config/staging.yaml +++ b/deployments/ugr01/config/staging.yaml @@ -12,11 +12,18 @@ jupyterhub: proxy: service: loadBalancerIP: 34.172.42.174 - ingress: - enabled: true - hosts: - - ugr01-staging.datahub.berkeley.edu - tls: - - secretName: tls-cert - hosts: - - ugr01-staging.datahub.berkeley.edu + traefik: + extraInitContainers: + # This startup delay can help the k8s container network interface setup + # network routing of traffic to the pod, which is essential for the ACME + # challenge submitted by Traefik on startup to acquire a TLS certificate. + # + # Sleeping for 7 seconds has been consistently sufficient to avoid issues + # in GKE when this workaround was explored initially for GKE. + # + - name: startup-delay + image: busybox:stable + command: ["sh", "-c", "sleep 10"] + https: + hosts: + - ugr01-staging.datahub.berkeley.edu diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index 298ea9e21..000000000 --- a/docs/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -SOURCEDIR = . -BUILDDIR = _build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
-%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/_quarto.yml b/docs/_quarto.yml new file mode 100644 index 000000000..3fa1bfaed --- /dev/null +++ b/docs/_quarto.yml @@ -0,0 +1,89 @@ +project: + type: website + +website: + navbar: + title: "DataHub" + search: true + logo: datahub.svg + background: light + tools: + - icon: github + href: https://github.com/berkeley-dsep-infra/datahub + left: + - text: "Using DataHub" + href: users/features.qmd + - text: "Contributing" + href: admins/pre-reqs.qmd + - text: "Admin Tasks" + href: admins/howto/preview-local.qmd + - text: "Policy" + href: policy/create_policy.qmd + page-navigation: true + sidebar: + style: docked + collapse-level: 1 + contents: + - href: index.qmd + text: Home + - section: "Using DataHub" + contents: + - users/services.qmd + - users/private-repo.qmd + - users/hubs.qmd + - users/authentication.qmd + - section: "Contributing to DataHub" + contents: + - admins/pre-reqs.qmd + - admins/structure.qmd + - admins/storage.qmd + - admins/cluster-config.qmd + - admins/credentials.qmd + - section: "Common Administrator Tasks" + contents: + - admins/howto/preview-local.qmd + - admins/howto/dns.qmd + - admins/howto/core-pool.qmd + - admins/howto/new-hub.qmd + - admins/howto/rebuild-hub-image.qmd + - admins/howto/rebuild-postgres-image.qmd + - admins/howto/new-image.qmd + - admins/howto/new-packages.qmd + - admins/howto/course-config.qmd + - admins/howto/calendar-scaler.qmd + - admins/howto/prometheus-grafana.qmd + - admins/howto/remove-users-orm.qmd + - admins/howto/delete-hub.qmd + - admins/howto/clusterswitch.qmd + - admins/howto/github-token.qmd + - admins/howto/google-sheets.qmd + - section: "Hub Deployments" + contents: + - admins/deployments/datahub.qmd + - admins/deployments/stat159.qmd + - section: "Policy" + contents: + - policy/create_policy.qmd + - policy/policy_create_hubs.qmd + - policy/policy_deploy_mainhubs.qmd + - policy/principles.qmd + - href: incidents/index.qmd + text: "Incident Reports" + +format: + html: + theme: book + #title-block-banner: datahub.svg + navbar: + left: + - "UC Berkeley DataHub Documentation" + right: + - icon: github + href: https://github.com/berkeley-dsep-infra/datahub + footer: + copyright: "2024, Division of Data Sciences Technical Staff" + nav: + - title: UC Berkeley DataHub + url: https://cdss.berkeley.edu/datahub + - title: Project Jupyter + url: https://jupyter.org diff --git a/docs/admins/cluster-config.qmd b/docs/admins/cluster-config.qmd new file mode 100644 index 000000000..c2d665fc1 --- /dev/null +++ b/docs/admins/cluster-config.qmd @@ -0,0 +1,204 @@ +--- +title: Kubernetes Cluster Configuration +--- + +We use [kubernetes](http://kubernetes.io/) to run our JupyterHubs. It +has a healthy open source community, managed offerings from multiple +vendors & a fast pace of development. We can run easily on many +different cloud providers with similar config by running on top of +Kubernetes, so it is also our cloud agnostic abstraction layer. + +We prefer using a managed Kubernetes service (such as [Google Kubernetes +Engine](https://cloud.google.com/kubernetes-engine/)). This document +lays out our preferred cluster configuration on various cloud providers. + +## Google Kubernetes Engine + +In our experience, Google Kubernetes Engine (GKE) has been the most +stable, performant, and reliable managed kubernetes service. We prefer +running on this when possible. 
+ +A `gcloud container clusters create` command can succintly express the +configuration of our kubernetes cluster. The following command +represents the currently favored configuration. + +This creates the GKE cluster. It may host one or more node pools: + +``` bash +gcloud container clusters create \ + --enable-ip-alias \ + --enable-autoscaling \ + --max-nodes=20 --min-nodes=1 \ + --region=us-central1 --node-locations=us-central1-b \ + --image-type=cos_containerd \ + --disk-size=100 --disk-type=pd-balanced \ + --machine-type=n2-highmem-8 \ + --cluster-version latest \ + --no-enable-autoupgrade \ + --enable-network-policy \ + --create-subnetwork="" \ + --tags=hub-cluster \ + +``` + +Here\'s how we add a node pool to the cluster, beyond the default pool: + +``` bash +gcloud container node-pools create \ + --machine-type n2-highmem-8 \ + --num-nodes 1 \ + --enable-autoscaling \ + --min-nodes 1 --max-nodes 20 \ + --node-labels hub.jupyter.org/pool-name=-pool \ + --node-taints hub.jupyter.org_dedicated=user:NoSchedule \ + --region=us-central1 \ + --image-type=cos_containerd \ + --disk-size=200 --disk-type=pd-balanced \ + --no-enable-autoupgrade \ + --tags=hub-cluster \ + --cluster= \ + user----
+``` + +### IP Aliasing + +`--enable-ip-alias` creates [VPC Native +Clusters](https://cloud.google.com/kubernetes-engine/docs/how-to/alias-ips). + +This becomes the default soon, and can be removed once it is the +default. + +### Autoscaling + +We use the [kubernetes cluster +autoscaler](https://cloud.google.com/kubernetes-engine/docs/concepts/cluster-autoscaler) +to scale our node count up and down based on demand. It waits until the +cluster is completely full before triggering creation of a new node - +but that\'s ok, since new node creation time on GKE is pretty quick. + +`--enable-autoscaling` turns the cluster autoscaler on. + +`--min-nodes` sets the minimum number of nodes that will be maintained +regardless of demand. This should ideally be 2, to give us some headroom +for quick starts without requiring scale ups when the cluster is +completely empty. + +`--max-nodes` sets the maximum number of nodes that the cluster +autoscaler will use - this sets the maximum number of concurrent users +we can support. This should be set to a reasonably high number, but not +too high - to protect against runaway creation of hundreds of VMs that +might drain all our credits due to accident or security breach. + +### Highly available master + +The kubernetes cluster\'s master nodes are managed by Google Cloud +automatically. By default, it is deployed in a non-highly-available +configuration - only one node. This means that upgrades and master +configuration changes cause a few minutes of downtime for the kubernetes +API, causing new user server starts / stops to fail. + +We request our cluster masters to have [highly available +masters](https://cloud.google.com/kubernetes-engine/docs/concepts/regional-clusters) +with `--region` parameter. This specifies the region where our 3 master +nodes will be spread across in different zones. It costs us extra, but +it is totally worth it. + +By default, asking for highly available masters also asks for 3x the +node count, spread across multiple zones. We don\'t want that, since all +our user pods have in-memory state & can\'t be relocated. Specifying +`--node-locations` explicitly lets us control how many and which zones +the nodes are located in. + +### Region / Zone selection + +We generally use the `us-central1` region and a zone in it for our +clusters -simply because that is where we have asked for +[quota](https://cloud.google.com/compute/quotas). + +There are regions closer to us, but latency hasn\'t really mattered so +we are currently still in us-central1. There are also unsubstantiated +rumors that us-central1 is their biggest data center and hence less +likely to run out of quota. + +### Disk Size + +`--disk-size` sets the size of the root disk on all the kubernetes +nodes. This isn\'t used for any persistent storage such as user home +directories. It is only used ephemerally for the operations of the +cluster - primarily storing docker images and other temporary storage. +We can make this larger if we use a large number of big images, or if we +want our image pulls to be faster (since disk performance [increases +with disk size](https://cloud.google.com/compute/docs/disks/performance) +). + +`--disk-type=pd-standard` gives us standard spinning disks, which are +cheaper. We can also request SSDs instead with `--disk-type=pd-ssd` - it +is much faster, but also much more expensive. We compromise with +`--disk-type=pd-balanced`, faster than spinning disks but not as fast as +ssds all the time. 
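For reference, these settings can be inspected on a live cluster after creation. A minimal sketch, assuming the `us-central1` region used above; the cluster and pool names are placeholders, and the `--format` projection follows the GKE API field names:

```bash
# List the node pools attached to a cluster
gcloud container node-pools list \
  --cluster=<cluster-name> --region=us-central1

# Confirm machine type, disk settings and autoscaling bounds for one pool
gcloud container node-pools describe <pool-name> \
  --cluster=<cluster-name> --region=us-central1 \
  --format="yaml(config.machineType,config.diskType,config.diskSizeGb,autoscaling)"
```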
+ +### Node size + +`--machine-type` lets us select how much [RAM and +CPU](https://cloud.google.com/compute/docs/machine-types) each of our +nodes have. For non-trivial hubs, we generally pick `n2-highmem-8`, with +64G of RAM and 8 cores. This is based on the following heuristics: + +1. Students generally are memory limited than CPU limited. In fact, + while we have a hard limit on memory use per-user pod, we do not + have a CPU limit -it hasn\'t proven necessary. +2. We try overprovision clusters by about 2x - so we try to fit about + 100G of total RAM use in a node with about 50G of RAM. This is + accomplished by setting the memory request to be about half of the + memory limit on user pods. This leads to massive cost savings, and + works out ok. +3. There is a kubernetes limit on 100 pods per node. + +Based on these heuristics, `n2-highmem-8` seems to be most bang for the +buck currently. We should revisit this for every cluster creation. + +### Cluster version + +GKE automatically upgrades cluster masters, so there is generally no +harm in being on the latest version available. + +### Node autoupgrades + +When node autoupgrades are enabled, GKE will automatically try to +upgrade our nodes whenever needed (our GKE version falling off the +support window, security issues, etc). However, since we run stateful +workloads, we *disable* this right now so we can do the upgrades +manually. + +### Network Policy + +Kubernetes [Network +Policy](https://kubernetes.io/docs/concepts/services-networking/network-policies/) +lets you firewall internal access inside a kubernetes cluster, +whitelisting only the flows you want. The JupyterHub chart we use +supports setting up appropriate NetworkPolicy objects it needs, so we +should turn it on for additional security depth. Note that any extra +in-cluster services we run *must* have a NetworkPolicy set up for them +to work reliabliy. + +### Subnetwork + +We put each cluster in its own subnetwork, since *seems* to be a limit +on how many clusters you can create in the same network with IP aliasing +on - you just run out of addresses. This also gives us some isolation - +subnetworks are isolated by default and can\'t reach other resources. +You must add [firewall +rules](https://cloud.google.com/vpc/docs/using-firewalls) to provide +access, including access to any manually run NFS servers. We add tags +for this. + +### Tags + +To help with firewalling, we add [network +tags](https://cloud.google.com/vpc/docs/add-remove-network-tags) to all +our cluster nodes. This lets us add firewall rules to control traffic +between subnetworks. + +### Cluster name + +We try use a descriptive name as much as possible. diff --git a/docs/admins/cluster-config.rst b/docs/admins/cluster-config.rst deleted file mode 100644 index 0c06b57d5..000000000 --- a/docs/admins/cluster-config.rst +++ /dev/null @@ -1,206 +0,0 @@ -.. _topic/cluster-config: - -================================ -Kubernetes Cluster Configuration -================================ - -We use `kubernetes `_ to run our JupyterHubs. It has -a healthy open source community, managed offerings from multiple vendors & -a fast pace of development. We can run easily on many different cloud -providers with similar config by running on top of Kubernetes, so it is also -our cloud agnostic abstraction layer. - -We prefer using a managed Kubernetes service (such as `Google Kubernetes Engine -`_). This document lays out our -preferred cluster configuration on various cloud providers. 
- -Google Kubernetes Engine -======================== - -In our experience, Google Kubernetes Engine (GKE) has been the most stable, -performant, and reliable managed kubernetes service. We prefer running on this -when possible. - -A ``gcloud container clusters create`` command can succintly express the -configuration of our kubernetes cluster. The following command represents -the currently favored configuration. - -This creates the GKE cluster. It may host one or more node pools: - -.. code:: bash - - gcloud container clusters create \ - --enable-ip-alias \ - --enable-autoscaling \ - --max-nodes=20 --min-nodes=1 \ - --region=us-central1 --node-locations=us-central1-b \ - --image-type=cos_containerd \ - --disk-size=100 --disk-type=pd-balanced \ - --machine-type=n2-highmem-8 \ - --cluster-version latest \ - --no-enable-autoupgrade \ - --enable-network-policy \ - --create-subnetwork="" \ - --tags=hub-cluster \ - - -Here's how we add a node pool to the cluster, beyond the default pool: - -.. code:: bash - - gcloud container node-pools create \ - --machine-type n2-highmem-8 \ - --num-nodes 1 \ - --enable-autoscaling \ - --min-nodes 1 --max-nodes 20 \ - --node-labels hub.jupyter.org/pool-name=-pool \ - --node-taints hub.jupyter.org_dedicated=user:NoSchedule \ - --region=us-central1 \ - --image-type=cos_containerd \ - --disk-size=200 --disk-type=pd-balanced \ - --no-enable-autoupgrade \ - --tags=hub-cluster \ - --cluster= \ - user----
- - -IP Aliasing ------------ - -``--enable-ip-alias`` creates `VPC Native Clusters `_. - -This becomes the default soon, and can be removed once it is the default. - -Autoscaling ------------ - -We use the `kubernetes cluster autoscaler `_ -to scale our node count up and down based on demand. It waits until the cluster is completely full -before triggering creation of a new node - but that's ok, since new node creation time on GKE is -pretty quick. - -``--enable-autoscaling`` turns the cluster autoscaler on. - -``--min-nodes`` sets the minimum number of nodes that will be maintained -regardless of demand. This should ideally be 2, to give us some headroom for -quick starts without requiring scale ups when the cluster is completely empty. - -``--max-nodes`` sets the maximum number of nodes that the cluster autoscaler -will use - this sets the maximum number of concurrent users we can support. -This should be set to a reasonably high number, but not too high - to protect -against runaway creation of hundreds of VMs that might drain all our credits -due to accident or security breach. - -Highly available master ------------------------ - -The kubernetes cluster's master nodes are managed by Google Cloud automatically. -By default, it is deployed in a non-highly-available configuration - only one -node. This means that upgrades and master configuration changes cause a few minutes -of downtime for the kubernetes API, causing new user server starts / stops to fail. - -We request our cluster masters to have `highly available masters `_ -with ``--region`` parameter. This specifies the region where our 3 master nodes -will be spread across in different zones. It costs us extra, but it is totally -worth it. - -By default, asking for highly available masters also asks for 3x the node count, -spread across multiple zones. We don't want that, since all our user pods have -in-memory state & can't be relocated. Specifying ``--node-locations`` explicitly -lets us control how many and which zones the nodes are located in. - -Region / Zone selection ------------------------ - -We generally use the ``us-central1`` region and a zone in it for our clusters - -simply because that is where we have asked for `quota `_. - -There are regions closer to us, but latency hasn't really mattered so we are -currently still in us-central1. There are also unsubstantiated rumors that us-central1 is their -biggest data center and hence less likely to run out of quota. - - -Disk Size ---------- - -``--disk-size`` sets the size of the root disk on all the kubernetes nodes. This -isn't used for any persistent storage such as user home directories. It is only -used ephemerally for the operations of the cluster - primarily storing docker -images and other temporary storage. We can make this larger if we use a large number -of big images, or if we want our image pulls to be faster (since disk performance -`increases with disk size `_ -). - -``--disk-type=pd-standard`` gives us standard spinning disks, which are cheaper. We -can also request SSDs instead with ``--disk-type=pd-ssd`` - it is much faster, -but also much more expensive. We compromise with ``--disk-type=pd-balanced``, faster -than spinning disks but not as fast as ssds all the time. - -Node size ---------- - -``--machine-type`` lets us select how much `RAM and CPU `_ -each of our nodes have. For non-trivial hubs, we generally pick ``n2-highmem-8``, with 64G -of RAM and 8 cores. This is based on the following heuristics: - -#. 
Students generally are memory limited than CPU limited. In fact, while we - have a hard limit on memory use per-user pod, we do not have a CPU limit - - it hasn't proven necessary. - -#. We try overprovision clusters by about 2x - so we try to fit about 100G of total RAM - use in a node with about 50G of RAM. This is accomplished by setting the memory - request to be about half of the memory limit on user pods. This leads to massive - cost savings, and works out ok. - -#. There is a kubernetes limit on 100 pods per node. - -Based on these heuristics, ``n2-highmem-8`` seems to be most bang for the buck -currently. We should revisit this for every cluster creation. - -Cluster version ---------------- - -GKE automatically upgrades cluster masters, so there is generally no harm in being -on the latest version available. - -Node autoupgrades ------------------ - -When node autoupgrades are enabled, GKE will automatically try to -upgrade our nodes whenever needed (our GKE version falling off the -support window, security issues, etc). However, since we run stateful -workloads, we *disable* this right now so we can do the upgrades -manually. - -Network Policy --------------- - -Kubernetes `Network Policy `_ -lets you firewall internal access inside a kubernetes cluster, whitelisting -only the flows you want. The JupyterHub chart we use supports setting up -appropriate NetworkPolicy objects it needs, so we should turn it on for -additional security depth. Note that any extra in-cluster services we run -*must* have a NetworkPolicy set up for them to work reliabliy. - -Subnetwork ----------- - -We put each cluster in its own subnetwork, since *seems* to be a limit on how -many clusters you can create in the same network with IP aliasing on - you -just run out of addresses. This also gives us some isolation - subnetworks -are isolated by default and can't reach other resources. You must add -`firewall rules `_ to -provide access, including access to any manually run NFS servers. -We add tags for this. - -Tags ----- - -To help with firewalling, we add `network tags `_ -to all our cluster nodes. This lets us add firewall rules to control traffic -between subnetworks. - -Cluster name ------------- - -We try use a descriptive name as much as possible. diff --git a/docs/admins/credentials.qmd b/docs/admins/credentials.qmd new file mode 100644 index 000000000..d16e39744 --- /dev/null +++ b/docs/admins/credentials.qmd @@ -0,0 +1,28 @@ +--- +title: Cloud Credentials +--- + +## Google Cloud + +### Service Accounts + +Service accounts are identified by a *service key*, and help us grant +specific access to an automated process. Our CI process needs two +service accounts to operate: + +1. A `gcr-readwrite` key. This is used to build and push the user + images. Based on the + [docs](https://cloud.google.com/container-registry/docs/access-control), + this is assigned the role `roles/storage.admin`. +2. A `gke` key. This is used to interact with the Google Kubernetes + cluster. Roles [roles/container.clusterViewer]{.title-ref} and + [roles/container.developer]{.title-ref} are granted to it. + +These are currently copied into the `secrets/` dir of every deployment, +and explicitly referenced from `hubploy.yaml` in each deployment. They +should be rotated every few months. + +You can [create service +accounts](https://nextjournal.com/schmudde/how-to-version-control-jupyter) +through the web console or the commandline. Remember to not leave around +copies of the private key elsewhere on your local computer! 
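As a rough sketch of how a key like the `gke` one above can be created or rotated from the command line (the project ID and account name below are placeholders, not the real deployment values):

```bash
PROJECT=<gcp-project-id>
SA=hub-deploy-gke

# Create the service account
gcloud iam service-accounts create "${SA}" --project="${PROJECT}"

# Grant the roles mentioned above
for role in roles/container.clusterViewer roles/container.developer; do
  gcloud projects add-iam-policy-binding "${PROJECT}" \
    --member="serviceAccount:${SA}@${PROJECT}.iam.gserviceaccount.com" \
    --role="${role}"
done

# Generate a key file to place (encrypted) in the deployment's secrets/ dir,
# then delete the local plaintext copy
gcloud iam service-accounts keys create gke.json \
  --iam-account="${SA}@${PROJECT}.iam.gserviceaccount.com"
```

When rotating, the superseded key can be listed and removed with `gcloud iam service-accounts keys list` and `gcloud iam service-accounts keys delete`.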
diff --git a/docs/admins/credentials.rst b/docs/admins/credentials.rst deleted file mode 100644 index 6263b693c..000000000 --- a/docs/admins/credentials.rst +++ /dev/null @@ -1,30 +0,0 @@ -.. _topic/credentials: - -================= -Cloud Credentials -================= - -Google Cloud -============ - -Service Accounts ----------------- - -Service accounts are identified by a *service key*, and help us grant -specific access to an automated process. Our CI process needs two service accounts to operate: - -#. A ``gcr-readwrite`` key. This is used to build and push the user images. - Based on the `docs `_, - this is assigned the role ``roles/storage.admin``. - -#. A ``gke`` key. This is used to interact with the Google Kubernetes cluster. - Roles `roles/container.clusterViewer` and `roles/container.developer` are - granted to it. - -These are currently copied into the ``secrets/`` dir of every deployment, and -explicitly referenced from ``hubploy.yaml`` in each deployment. They should -be rotated every few months. - -You can `create service accounts `_ -through the web console or the commandline. Remember to not leave around copies -of the private key elsewhere on your local computer! diff --git a/docs/admins/deployments/datahub.qmd b/docs/admins/deployments/datahub.qmd new file mode 100644 index 000000000..f0f4e3aea --- /dev/null +++ b/docs/admins/deployments/datahub.qmd @@ -0,0 +1,22 @@ +--- +title: DataHub +--- + +datahub.berkeley.edu provides standard computing environment to many +foundational courses across diverse disciplines. + +## Image + +The datahub image contains both Python and R environments. A user can +create jupyter notebooks utilizing either Python or R, or can run +RStudio using R or Python. + +The image is currently not based on repo2docker. + +## Resources + +A handful of courses have been granted elevated memory limits within the +hub configuration. + +CDSS staff and a small number of instructors have been given +administrative privileges. diff --git a/docs/admins/deployments/datahub.rst b/docs/admins/deployments/datahub.rst deleted file mode 100644 index ca80c239d..000000000 --- a/docs/admins/deployments/datahub.rst +++ /dev/null @@ -1,23 +0,0 @@ - -.. _deployments/datahub: - -======= -DataHub -======= - - -datahub.berkeley.edu provides standard computing environment to many foundational courses across diverse disciplines. - -Image -===== - -The datahub image contains both Python and R environments. A user can create jupyter notebooks utilizing either Python or R, or can run RStudio using R or Python. - -The image is currently not based on repo2docker. - -Resources -========= - -A handful of courses have been granted elevated memory limits within the hub configuration. - -CDSS staff and a small number of instructors have been given administrative privileges. diff --git a/docs/admins/deployments/index.qmd b/docs/admins/deployments/index.qmd new file mode 100644 index 000000000..d0bc7e663 --- /dev/null +++ b/docs/admins/deployments/index.qmd @@ -0,0 +1,3 @@ +--- +title: Hub Deployments +--- diff --git a/docs/admins/deployments/index.rst b/docs/admins/deployments/index.rst deleted file mode 100644 index 0385321e0..000000000 --- a/docs/admins/deployments/index.rst +++ /dev/null @@ -1,9 +0,0 @@ -=============== -Hub Deployments -=============== - -.. 
toctree:: - :maxdepth: 1 - - datahub - stat159 diff --git a/docs/admins/deployments/stat159.qmd b/docs/admins/deployments/stat159.qmd new file mode 100644 index 000000000..f49ac96ac --- /dev/null +++ b/docs/admins/deployments/stat159.qmd @@ -0,0 +1,80 @@ +--- +title: Stat 159 +--- + +stat159.datahub.berkeley.edu is a course-specific hub for Stat 159 as +taught by Fernando Perez. It tends to include a lot of applications so +that students can shift their local development workflows to the cloud. + +## Image + +Notably the image contains support for RTC. As of March 2023, this +requires: + +```yaml +- altair==4.2.2 +- boken==2.4.3 +- dask==2023.1.1 +- jupyter_server==2.2.1 +- jupyterlab==3.6.1 +- jupyterlab_server==2.19.0 +- tornado==6.2.0 +- git+https:// +``` + +Some of these are hard requirements and others were necessary to make +conda happy. + +## Configuration + +Along with the dependencies, the singleuser server is modified to launch +as + +```yaml +singleuser: + cmd: + - jupyterhub-singleuser + - --LabApp.collaborative=true + # https://jupyterlab-realtime-collaboration.readthedocs.io/en/latest/configuration.html#configuration + - --YDocExtension.ystore_class=tmpystore.TmpYStore +``` + +This turns on collaboration and moves some sqlite storage from home directories to /tmp/. + +In addition to RTC, the hub also has configuration to enable [shared +accounts with +impersonation](https://github.com/jupyterhub/jupyterhub/blob/main/docs/source/tutorial/collaboration-users.md). +There are a handful of fabricated user accounts, e.g. collab-shared-1, +collab-shared-2, etc. not affiliated with any real person in bCourses. +There are also corresponding JupyterHub groups, shared-1, shared-2, etc. +The instructors add real students to the hub groups, and some roles and +scopes logic in the hub configuration gives students access to launch +jupyter servers for the collaborative user accounts. The logic is in +config/common.yaml while the current group affiliations are kept private +in secrets. + +This configuration is to encourage use of RTC, and to prevent one +student from having too much access to another student\'s home +directory. The fabricated (essentially service) accounts have initally +empty home directories and exist solely to provide workspaces for the +group. There is currently no archive or restore procedure in mind for +these shared accounts. + +For now, groups are defined in either the hub configuration or in the +administrative /hub/admin user interface. In order to enable group assignment +in this manner, we must set `Authenticator.managed_groups` to False. Ordinarily +groups are provided by CanvasAuthenticator where this setting is +True. + +Eventually instructors will be able to define groups in bCourses so that +CanvasAuthenticator can remain in charge of managing groups. This will +be important for the extremely large courses. It will also be beneficial +in that resource allocation can be performed more easily through group +affiliations and group properties. + +## Historical Information + +The image has been periodically shared with data100 for when Fernando +has taught both. Going forward, it is probably best to keep them +separate and optionally kept in sync. We don\'t want changes in one +course to come as a surprise to the other. diff --git a/docs/admins/deployments/stat159.rst b/docs/admins/deployments/stat159.rst deleted file mode 100644 index 3d0ed6723..000000000 --- a/docs/admins/deployments/stat159.rst +++ /dev/null @@ -1,56 +0,0 @@ - -.. 
_deployments/stat159: - -======== -Stat 159 -======== - - -stat159.datahub.berkeley.edu is a course-specific hub for Stat 159 as taught by Fernando Perez. It tends to include a lot of applications so that students can shift their local development workflows to the cloud. - -Image -===== - -Notably the image contains support for RTC. As of March 2023, this requires: - - - altair==4.2.2 - - boken==2.4.3 - - dask==2023.1.1 - - jupyter_server==2.2.1 - - jupyterlab==3.6.1 - - jupyterlab_server==2.19.0 - - tornado==6.2.0 - - git+https://github.com/berkeley-dsep-infra/tmpystore.git@84765e1 - -Some of these are hard requirements and others were necessary to make conda happy. - -Configuration -============= - -Along with the dependencies, the singleuser server is modified to launch as - -``` - singleuser: - cmd: - - jupyterhub-singleuser - - --LabApp.collaborative=true - # https://jupyterlab-realtime-collaboration.readthedocs.io/en/latest/configuration.html#configuration - - --YDocExtension.ystore_class=tmpystore.TmpYStore -``` - -This: - -1. Turns on collaboration. -2. Moves some sqlite storage from home directories to /tmp/. - -In addition to RTC, the hub also has configuration to enable `shared accounts with impersonation `_. There are a handful of fabricated user accounts, e.g. collab-shared-1, collab-shared-2, etc. not affiliated with any real person in bCourses. There are also corresponding JupyterHub groups, shared-1, shared-2, etc. The instructors add real students to the hub groups, and some roles and scopes logic in the hub configuration gives students access to launch jupyter servers for the collaborative user accounts. The logic is in config/common.yaml while the current group affiliations are kept private in secrets. - -This configuration is to encourage use of RTC, and to prevent one student from having too much access to another student's home directory. The fabricated (essentially service) accounts have initally empty home directories and exist solely to provide workspaces for the group. There is currently no archive or restore procedure in mind for these shared accounts. - -For now, groups are defined in either the hub configuration or in the administrative /hub/admin user interface. In order to enable group assignment in this manner, we must set `Authenticator.managed_groups` to False. Ordinarily groups are provided by CanvasAuthenticator where this setting is True. - -Eventually instructors will be able to define groups in bCourses so that CanvasAuthenticator can remain in charge of managing groups. This will be important for the extremely large courses. It will also be beneficial in that resource allocation can be performed more easily through group affiliations and group properties. - -Historical Information -====================== -The image has been periodically shared with data100 for when Fernando has taught both. Going forward, it is probably best to keep them separate and optionally kept in sync. We don't want changes in one course to come as a surprise to the other. diff --git a/docs/admins/howto/admin.rst b/docs/admins/howto/admin.rst deleted file mode 100644 index 5e617ad29..000000000 --- a/docs/admins/howto/admin.rst +++ /dev/null @@ -1,31 +0,0 @@ -.. _howto/admin: - -================= -Add an admin user -================= - -What can admin users do? -======================== - -JupyterHub has `admin users `_ -who have the following capabilities: - -#. Access & modify **all** other users' home directories (where all their work is kept) -#. 
Mark other users as admin users -#. Start / Stop other users' servers - -These are all powerful & disruptive capabilities, so be careful who gets admin access! - - -Adding / removing an admin user -=============================== - -#. Pick the :ref:`hub ` you want to make a user an admin of. -#. Find the :ref:`config directory ` for the hub, and - open ``common.yaml`` in there. -#. Add / remove the admin user name from the list ``jupyterhub.auth.admin.users``. - Make sure there is an explanatory comment nearby that lists *why* this user - is an admin. This helps us remove admins when they no longer need admin - access. -#. Follow the steps to make a deployment - \ No newline at end of file diff --git a/docs/admins/howto/calendar-scaler.qmd b/docs/admins/howto/calendar-scaler.qmd new file mode 100644 index 000000000..9ffcc6d46 --- /dev/null +++ b/docs/admins/howto/calendar-scaler.qmd @@ -0,0 +1,188 @@ +--- +title: Calendar Node Pool Autoscaler +--- + +## Why scale node pools with Google Calendar? + +The scheduler isn't perfect for us, especially when large classes have +assignments due and a hub is flooded with students. This "hack" was +introduced to improve cluster scaling prior to known events. + +These 'placeholder' nodes are used to minimize the delay that occurs +when GCP creates new node pools during mass user logins. This common, +especially for larger classes. + +## Structure + +There is a Google Calendar calendar, [DataHub Scaling +Events](https://calendar.google.com/calendar/embed?src=c_s47m3m1nuj3s81187k3b2b5s5o%40group.calendar.google.com&ctz=America%2FLos_Angeles) +shared with all infrastructure staff. The event descriptions should +contain a YAML fragment, and are of the form `pool_name: +count`, where the name is the corresponding hub name +(data100, stat20) and the count is the number of extra nodes you want. +There can be several pools defined, one per line. + +By default, we usually have one spare node ready to go, so if the count +in the calendar event is set to 0 or 1, there will be no change to the +cluster. If the value is set to >=2, additional hot spares will be +created. If a value is set more than once, the entry with the greater +value will be used. + +You can determine how many placeholder nodes to have up based on how +many people you expect to log in at once. Some of the bigger courses may +require 2 or more placeholder nodes, but during "regular" hours, 1 is +usually sufficient. + +The scaling mechanism is implemented as the +`node-placeholder-node-placeholder-scaler` deployment within +the `node-placeholder` namespace. The source code is within +. + +## Calendar Autoscaler + +The code for the calendar autoscaler is a python 3.11 script, located +here: + + +### How the scaler works + +There is a k8s pod running in the `node-placeholder` +namespace, which simply [runs python3 -m +scaler](https://github.com/berkeley-dsep-infra/datahub/blob/staging/images/node-placeholder-scaler/Dockerfile). +This script runs in an infinite loop, and every 60 seconds checks the +scaler config and calendar for entries. It then uses the highest value +provided as the number of placeholder replicas for any given hub. This +means that if there's a daily evening event to 'cool down' the number +of replicas for all hubs to 0, and a simultaneous event to set one or +more hubs to a higher number, the scaler will see this and keep however +many node placeholders specified up and ready to go. 
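As an illustration of that "highest value wins" behavior (the hub name and counts below are made up):

```yaml
# Description of event A (nightly "cool down"):
data100: 0
# Description of a simultaneous event B (assignment due for Data 100):
data100: 3
# The scaler takes the maximum per pool, so data100 keeps 3 placeholder
# replicas until event B ends.
```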
+ +After determining the number of replicas needed for each hub, the scaler +will create a k8s template and run `kubectl` in the pod. + +### Updating the scaler config + +The [scaler +config](https://github.com/berkeley-dsep-infra/datahub/blob/staging/node-placeholder/values.yaml) +sets the default number of node-placeholders that are running at any +given time. These values can be overridden by creating events in the +[DataHub Scaling +Events](https://calendar.google.com/calendar/embed?src=c_s47m3m1nuj3s81187k3b2b5s5o%40group.calendar.google.com&ctz=America%2FLos_Angeles) +calendar. + +When classes are in session, these defaults are all typically set to +`1`, and during breaks (or when a hub is not expected to be +in use) they can be set to `0`. + +After making changes to `values.yaml`, create a PR normally +and our CI will push the new config out to the node-placeholder pod. +There is no need to manually restart the node-placeholder pod as the +changes will be picked up +[automatically](https://github.com/berkeley-dsep-infra/datahub/blob/3fb2d9412cbf87e0583774c8a7dc6c12ef58e715/images/node-placeholder-scaler/scaler/scaler.py#L93). + +### Working on, testing and deploying the calendar scaler + +All file locations in this section will assume that you are in the +`datahub/images/node-placeholder-scaler/` directory. + +It is strongly recommended that you create a new python 3.11 environment +before doing any dev work on the scaler. With `conda`, you +can run the following commands to create one: + +``` bash +conda create -ny scalertest python=3.11 +pip install -r requirements.txt +``` + +Any changes to the scaler code will require you to run +`chartpress` to redeploy the scaler to GCP. + +Here is an example of how you can test any changes to +`scaler/calendar.py` locally in the python interpreter: + +``` python +# these tests will use somes dates culled from the calendar with varying numbers of events. +import scaler.calendar +import datetime +import zoneinfo + +tz = zoneinfo.ZoneInfo(key='America/Los_Angeles') +zero_events_noon_june = datetime.datetime(2023, 6, 14, 12, 0, 0, tzinfo=tz) +one_event_five_pm_april = datetime.datetime(2023, 4, 27, 17, 0, 0, tzinfo=tz) +three_events_eight_thirty_pm_march = datetime.datetime(2023, 3, 6, 20, 30, 0, tzinfo=tz) +calendar = scaler.calendar.get_calendar('https://calendar.google.com/calendar/ical/c_s47m3m1nuj3s81187k3b2b5s5o%40group.calendar.google.com/public/basic.ics') +zero_events = scaler.calendar.get_events(calendar, time=zero_events_noon_june) +one_event = scaler.calendar.get_events(calendar, time=one_event_five_pm_april) +three_events = scaler.calendar.get_events(calendar, time=three_events_eight_thirty_pm_march) + +assert len(zero_events) == 0 +assert len(one_event) == 1 +assert len(three_events) == 3 +``` + +`get_events` returns a list of ical +`ical.event.Event` class objects. + +The method for testing `scaler/scaler.py` is similar to +above, but the only things you'll be able test locally are the +`make_deployment()` and `get_replica_counts()` +functions. + +When you're ready, create a PR. The deployment workflow is as follows: + +1. Get all authed-up for `chartpress` by performing the + steps listed + [here](https://docs.datahub.berkeley.edu/en/latest/admins/howto/rebuild-hub-image.html#). +2. Run `chartpress --push` from the root + `datahub/` directory. If this succeeds, check your `git + status` and add + `datahub/node-placeholder/Chart.yaml` and + `datahub/node-placeholder/values.yml` to your PR. +3. Merge to `staging` and then `prod`. 
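In practice, steps 1-3 above look roughly like the following. The auth commands are an assumption based on the linked chartpress setup docs, and paths assume you are at the repo root; check `git status` for the exact files to add:

```bash
# On a feature branch, from the root of your datahub/ checkout
gcloud auth login              # assumption: standard GCP auth so chartpress can push images
gcloud auth configure-docker   # assumption: lets docker push to our registry

chartpress --push              # rebuild/push the scaler image and bump the chart tag

git status                     # the chart files should now reference the new image tag
git add node-placeholder/Chart.yaml node-placeholder/values.yaml
git commit -m "node-placeholder: bump scaler image"
# open a PR, then merge to staging followed by prod
```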
+ +### Changing python imports + +The python requirements file is generated using +`requirements.in` and `pip-compile`. If you need +to change/add/update any packages, you'll need to do the following: + +1. Ensure you have the correct python environment activated (see + above). +2. Pip install `pip-tools` +3. Edit `requirements.in` and save your changes. +4. Execute `pip-compile requirements.in`, which will update + the `requirements.txt`. +5. Check your git status and diffs, and create a pull request if + necessary. +6. Get all authed-up for `chartpress` by performing the + steps listed + [here](https://docs.datahub.berkeley.edu/en/latest/admins/howto/rebuild-hub-image.html#). +7. Run `chartpress --push` from the root + `datahub/` directory. If this succeeds, check your `git + status` and add + `datahub/node-placeholder/Chart.yaml` and + `datahub/node-placeholder/values.yml` to your PR. +8. Merge to `staging` and then `prod`. + +## Monitoring + +You can monitor the scaling by watching for events: + +``` bash +kubectl -n node-placeholder get events -w +``` + +And by tailing the logs of the pod with the scalar process: + +``` bash +kubectl -n node-placeholder logs -l app.kubernetes.io/name=node-placeholder-scaler -f +``` + +For example if you set `epsilon: 2`, you might see in the +pod logs: + +``` bash +2022-10-17 21:36:45,440 Found event Stat20/Epsilon test 2 2022-10-17 14:21 PDT to 15:00 PDT +2022-10-17 21:36:45,441 Overrides: {'epsilon': 2} +2022-10-17 21:36:46,475 Setting epsilon to have 2 replicas +``` diff --git a/docs/admins/howto/calendar-scaler.rst b/docs/admins/howto/calendar-scaler.rst deleted file mode 100644 index f56659397..000000000 --- a/docs/admins/howto/calendar-scaler.rst +++ /dev/null @@ -1,146 +0,0 @@ -.. _howto/calendar-scheduler: - -============================= -Calendar Node Pool Autoscaler -============================= - - -Why scale node pools with Google Calendar? -========================================== - -The scheduler isn't perfect for us, especially when large classes have assignments due and a hub is flooded with students. This "hack" was introduced to improve cluster scaling prior to known events. - -These 'placeholder' nodes are used to minimize the delay that occurs when GCP -creates new node pools during mass user logins. This common, especially for -larger classes. - -Structure -========= -There is a Google Calendar calendar, `DataHub Scaling Events `_ shared with all infrastructure staff. The event descriptions should contain a YAML fragment, and are of the form `pool_name: count`, where the name is the corresponding hub name (data100, stat20) and the count is the number of extra nodes you want. There can be several pools defined, one per line. - -By default, we usually have one spare node ready to go, so if the count in the calendar event is set to 0 or 1, there will be no change to the cluster. If the value is set to >=2, additional hot spares will be created. If a value is set more than once, the entry with the greater value will be used. - -You can determine how many placeholder nodes to have up based on how many people -you expect to log in at once. Some of the bigger courses may require 2 or more -placeholder nodes, but during 'regular' hours, 1 is usually sufficient. - -The scaling mechanism is implemented as the `node-placeholder-node-placeholder-scaler` deployment within the `node-placeholder` namespace. The source code is within https://github.com/berkeley-dsep-infra/datahub/tree/staging/images/node-placeholder-scaler. 
- -Calendar Autoscaler -=================== -The code for the calendar autoscaler is a python 3.11 script, located here: https://github.com/berkeley-dsep-infra/datahub/tree/staging/images/node-placeholder-scaler/scaler - -How the scaler works -******************** -There is a k8s pod running in the `node-placeholder` namespace, which simply -`runs python3 -m scaler `_. -This script runs in an infinite loop, and every -60 seconds checks the scaler config and calendar for entries. It then uses -the highest value provided as the number of placeholder replicas for any given -hub. This means that if there's a daily evening event to 'cool down' the number -of replicas for all hubs to 0, and a simultaneous event to set one or more hubs -to a higher number, the scaler will see this and keep however many node -placeholders specified up and ready to go. - -After determining the number of replicas needed for each hub, the scaler will -create a k8s template and run `kubectl` in the pod. - -Updating the scaler config -************************** -The `scaler config `_ -sets the default number of node-placeholders that are running at any given time. -These values can be overridden by creating events in the `DataHub Scaling Events `_ -calendar. - -When classes are in session, these defaults are all typically set to `1`, and -during breaks (or when a hub is not expected to be in use) they can be set to -`0`. - -After making changes to `values.yaml`, create a PR normally and our CI will -push the new config out to the node-placeholder pod. There is no need to -manually restart the node-placeholder pod as the changes will be picked up -`automatically `_. - -Working on, testing and deploying the calendar scaler -***************************************************** -All file locations in this section will assume that you are in the -`datahub/images/node-placeholder-scaler/` directory. - -It is strongly recommended that you create a new python 3.11 environment before -doing any dev work on the scaler. With `conda`, you can run the following -commands to create one: - - .. code:: bash - - conda create -ny scalertest python=3.11 - pip install -r requirements.txt - -Any changes to the scaler code will require you to run `chartpress` to redeploy -the scaler to GCP. - -Here is an example of how you can test any changes to `scaler/calendar.py` -locally in the python interpreter: - - .. code:: python - - # these tests will use somes dates culled from the calendar with varying numbers of events. - import scaler.calendar - import datetime - import zoneinfo - tz = zoneinfo.ZoneInfo(key='America/Los_Angeles') - zero_events_noon_june = datetime.datetime(2023, 6, 14, 12, 0, 0, tzinfo=tz) - one_event_five_pm_april = datetime.datetime(2023, 4, 27, 17, 0, 0, tzinfo=tz) - three_events_eight_thirty_pm_march = datetime.datetime(2023, 3, 6, 20, 30, 0, tzinfo=tz) - calendar = scaler.calendar.get_calendar('https://calendar.google.com/calendar/ical/c_s47m3m1nuj3s81187k3b2b5s5o%40group.calendar.google.com/public/basic.ics') - zero_events = scaler.calendar.get_events(calendar, time=zero_events_noon_june) - one_event = scaler.calendar.get_events(calendar, time=one_event_five_pm_april) - three_events = scaler.calendar.get_events(calendar, time=three_events_eight_thirty_pm_march) - assert len(zero_events) == 0 - assert len(one_event) == 1 - assert len(three_events) == 3 - -`get_events` returns a list of ical `ical.event.Event` class objects. 
- -The method for testing `scaler/scaler.py` is similar to above, but the only -things you'll be able test locally are the `make_deployment()` and `get_replica_counts()` functions. - -When you're ready, create a PR. The deployment workflow is as follows: - -#. Get all authed-up for `chartpress` by performing the steps listed `here `_. -#. Run `chartpress --push` from the root `datahub/` directory. If this succeeds, check your `git status` and add `datahub/node-placeholder/Chart.yaml` and `datahub/node-placeholder/values.yml` to your PR. -#. Merge to `staging` and then `prod`. - -Changing python imports -*********************** -The python requirements file is generated using `requirements.in` and `pip-compile`. If you need to change/add/update any packages, you'll need to do the following: - -#. Ensure you have the correct python environment activated (see above). -#. Pip install `pip-tools` -#. Edit `requirements.in` and save your changes. -#. Execute `pip-compile requirements.in`, which will update the `requirements.txt`. -#. Check your git status and diffs, and create a pull request if necessary. -#. Get all authed-up for `chartpress` by performing the steps listed `here `_. -#. Run `chartpress --push` from the root `datahub/` directory. If this succeeds, check your `git status` and add `datahub/node-placeholder/Chart.yaml` and `datahub/node-placeholder/values.yml` to your PR. -#. Merge to `staging` and then `prod`. - -Monitoring -========== -You can monitor the scaling by watching for events: - - .. code:: bash - - kubectl -n node-placeholder get events -w - -And by tailing the logs of the pod with the scalar process: - - .. code:: bash - - kubectl -n node-placeholder logs -l app.kubernetes.io/name=node-placeholder-scaler -f - -For example if you set `epsilon: 2`, you might see in the pod logs: - - .. code:: bash - - 2022-10-17 21:36:45,440 Found event Stat20/Epsilon test 2 2022-10-17 14:21 PDT to 15:00 PDT - 2022-10-17 21:36:45,441 Overrides: {'epsilon': 2} - 2022-10-17 21:36:46,475 Setting epsilon to have 2 replicas diff --git a/docs/admins/howto/clusterswitch.md b/docs/admins/howto/clusterswitch.qmd similarity index 87% rename from docs/admins/howto/clusterswitch.md rename to docs/admins/howto/clusterswitch.qmd index de1d16b8d..1a814051d 100644 --- a/docs/admins/howto/clusterswitch.md +++ b/docs/admins/howto/clusterswitch.qmd @@ -1,4 +1,6 @@ -# Switching over a hub to a new cluster +--- +title: Switching over a hub to a new cluster +--- This document describes how to switch an existing hub to a new cluster. The example used here refers to moving all UC Berkeley Datahubs. @@ -32,18 +34,24 @@ configured on the new cluster. Until this is done, `hubploy` and `helm` will fa 2. At this point, it's usually wise to upgrade `cert-manager` to the latest version found in the chart repo. You can find this by running the following command: - cert-manager-version=$(helm show all -n cert-manager jetstack/cert-manager | grep ^appVersion | awk '{print $2}') + ```bash + cert-manager-version=$(helm show all -n cert-manager jetstack/cert-manager | grep ^appVersion | awk '{print $2}') + ``` 3. Then, you can install the latest version of `cert-manager`: - kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/${cert-manager-version}/cert-manager.yaml + ```bash + kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/${cert-manager-version}/cert-manager.yaml + ``` 4. 
Change the corresponding entry in `support/requirements.yaml` to `$cert-manager-version` and commit the changes (do not push). ## Create the node-placeholder k8s namespace The [calendar autoscaler](https://docs.datahub.berkeley.edu/en/latest/admins/howto/calendar-scaler.html) requires the `node-placeholder` namespace. Run the following command to create it: - kubectl create namespace node-placeholder +```bash +kubectl create namespace node-placeholder +``` ## Create a new static IP and switch DNS to point our new deployment at it. 1. Create a new static IP in the [GCP console](https://console.cloud.google.com/networking/addresses/add?project=ucb-datahub-2018). @@ -58,17 +66,23 @@ First, update any node pools in the configs to point to the new cluster. Typica Now we will manually deploy the `support` helm chart: - sops -d support/secrets.yaml > /tmp/secrets.yaml - helm install -f support/values.yaml -f /tmp/secrets.yaml -n support support support/ --set installCRDs=true --debug --create-namespace +```bash +sops -d support/secrets.yaml > /tmp/secrets.yaml +helm install -f support/values.yaml -f /tmp/secrets.yaml \ + -n support support support/ \ + --set installCRDs=true --debug --create-namespace +``` Before continuing, confirm via the GCP console that the IP that was defined in step 1 is now [bound to a forwarding rule](https://console.cloud.google.com/networking/addresses/list?project=ucb-datahub-2018). You can further confirm by listing the services in the [support chart](https://github.com/berkeley-dsep-infra/datahub/blob/staging/support/requirements.yaml) and making sure the ingress-controller is using the newly defined IP. One special thing to note: our `prometheus` instance uses a persistent volume that contains historical monitoring data. This is specified in `support/values.yaml`, under the `prometheus:` block: - persistentVolume: - size: 1000Gi - storageClass: ssd - existingClaim: prometheus-data-2024-05-15 +```yaml +persistentVolume: + size: 1000Gi + storageClass: ssd + existingClaim: prometheus-data-2024-05-15 +``` ## Manually deploy a hub to staging Finally, we can attempt to deploy a hub to the new cluster! Any hub will do, but we should start with a low-traffic hub (eg: https://dev.datahub.berkeley.edu). @@ -79,7 +93,9 @@ Second, update `hubploy.yaml` for this hub and point it to the new cluster you'v After this is done, add the changes to your feature branch (but don't push). After that, deploy a hub manually: - hubploy deploy dev hub staging +```bash +hubploy deploy dev hub staging +``` When the deploy is done, visit that hub and confirm that things are working. @@ -88,8 +104,10 @@ Now, update the remaining hubs' configs to point to the new node pools and `hubp Then use `hubploy` to deploy them to staging as with the previous step. The easiest way to do this is to have a list of hubs in a text file, and iterate over it with a `for` loop: - for x in $(cat hubs.txt); do hubploy deploy ${x} hub staging; done - for x in $(cat hubs.txt); do hubploy deploy ${x} hub prod; done +```bash +for x in $(cat hubs.txt); do hubploy deploy ${x} hub staging; done +for x in $(cat hubs.txt); do hubploy deploy ${x} hub prod; done +``` When done, add the modified configs to your feature branch (and again, don't push yet). @@ -112,4 +130,6 @@ FIN! 
After waiting a reasonable period of time (a day or two just to be cautious) and after fetching the usage logs, you may delete the old cluster: - gcloud container clusters delete ${OLDCLUSTER} --region=us-central1 +```bash +gcloud container clusters delete ${OLDCLUSTER} --region=us-central1 +``` diff --git a/docs/admins/howto/core-pool.qmd b/docs/admins/howto/core-pool.qmd new file mode 100644 index 000000000..a91f5eaed --- /dev/null +++ b/docs/admins/howto/core-pool.qmd @@ -0,0 +1,38 @@ +--- +title: Core Node Pool Management +--- + +## What is the core node pool? + +The core node pool is the primary entrypoint for all hubs we host. It +manages all incoming traffic, and redirects said traffic (via the nginx +ingress controller) to the proper hub. + +It also does other stuff. + +## Deploy a New Core Node Pool + +Run the following command from the root directory of your local datahub +repo to create the node pool: + +```bash +gcloud container node-pools create "core-" \ + --labels=hub=core,nodepool-deployment=core \ + --node-labels hub.jupyter.org/pool-name=core-pool- \ + --machine-type "n2-standard-8" \ + --num-nodes "1" \ + --enable-autoscaling --min-nodes "1" --max-nodes "3" \ + --project "ucb-datahub-2018" --cluster "spring-2024" \ + --region "us-central1" --node-locations "us-central1-b" \ + --tags hub-cluster \ + --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" \ + --metadata disable-legacy-endpoints=true \ + --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" \ + --no-enable-autoupgrade --enable-autorepair \ + --max-surge-upgrade 1 --max-unavailable-upgrade 0 --max-pods-per-node "110" \ + --system-config-from-file=vendor/google/gke/node-pool/config/core-pool-sysctl.yaml +``` + +The `system-config-from-file` argument is important, as we need to tune +the kernel TCP settings to handle large numbers of concurrent users and +keep nginx from using up all of the TCP ram. diff --git a/docs/admins/howto/core-pool.rst b/docs/admins/howto/core-pool.rst deleted file mode 100644 index a0daf007d..000000000 --- a/docs/admins/howto/core-pool.rst +++ /dev/null @@ -1,43 +0,0 @@ - -.. _howto/core-pool: - -================ -Creating and managing the core node pool -================ - - -What is the core node pool? ---------------------------- - -The core node pool is the primary entrypoint for all hubs we host. It manages -all incoming traffic, and redirects said traffic (via the nginx ingress -controller) to the proper hub. - -It also does other stuff. - - -Deploying a new core node pool ------------------------------- - -Run the following command from the root directory of your local datahub repo to create the node pool: - -.. 
code:: bash - - gcloud container node-pools create "core-" \ - --labels=hub=core,nodepool-deployment=core \ - --node-labels hub.jupyter.org/pool-name=core-pool- \ - --machine-type "n2-standard-8" \ - --num-nodes "1" \ - --enable-autoscaling --min-nodes "1" --max-nodes "3" \ - --project "ucb-datahub-2018" --cluster "spring-2024" --region "us-central1" --node-locations "us-central1-b" \ - --tags hub-cluster \ - --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "100" \ - --metadata disable-legacy-endpoints=true \ - --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" \ - --no-enable-autoupgrade --enable-autorepair --max-surge-upgrade 1 --max-unavailable-upgrade 0 --max-pods-per-node "110" \ - --system-config-from-file=vendor/google/gke/node-pool/config/core-pool-sysctl.yaml - - -The ``system-config-from-file`` argument is important, as we need to tune the -kernel TCP settings to handle large numbers of concurrent users and keep nginx -from using up all of the TCP ram. diff --git a/docs/admins/howto/course-config.qmd b/docs/admins/howto/course-config.qmd new file mode 100644 index 000000000..939e641d1 --- /dev/null +++ b/docs/admins/howto/course-config.qmd @@ -0,0 +1,230 @@ +--- +title: Course Configuration +--- + +## Allocating Resources + +It is possible to alter administrative priviliges or resources +allocations (such as memory or extra volumes) of user servers from +within the deployment configuration. This is mostly useful for when +resources need to be increased based on users\' class enrollments. The +hub must be configured to use the +[CanvasOAuthenticator](https://github.com/berkeley-dsep-infra/canvasauthenticator) +which is our default. Hubs that use dummy, Google, Generic OAuth, or +other authenticators are not configured to allocate additional resources +in this way. + +Additionally, it is also possible to allocate resources based on the +students membership of Canvas groups. This is useful if the instructor +wants to dynamically grant additional resources without CI round-trips. +Group management can be performed by the course staff directly from +bCourses. + +## Implementation + +The authenticator reads users Canvas enrollments when they login, and +then assigns them to JupyterHub groups based on those affiliations. +Groups are named with the format +\"course::{canvas_id}::enrollment_type::{canvas_role}\", e.g. +\"course::123456::enrollment_type::teacher\" or +\"course::234567::enrollment_type::student\". Our custom kubespawner, +which we define in `hub/values.yaml`, reads users\' group +memberships prior to spawning. It then overrides various KubeSpawner +paramters based on configuration we define, using the canvas ID as the +key. (see below) + +Note that if a user is assigned to a new Canvas group (e.g. by the +instructor manually, or by an automated Canvas/SIS system) while their +server is already running, they will need to logout and then log back in +in order for the authenticator to see the new affiliations. Restarting +the user server is not sufficient. + +The canvas ID is somewhat opaque to infrastructure staff \-- we cannot +look it up ourselves nor predict what it would be based on the name of +the course. This is why we must request it from the instructor. 
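That said, once users have logged in, the hub's own group list already reveals which canvas IDs are in play, since each ID is embedded in the `course::<id>...` group names. A hedged sketch, assuming an API token with permission to list groups is exported as `JUPYTERHUB_API_TOKEN`, that `jq` is installed, and with the hub URL used purely as an example:

```bash
# Hypothetical example: list the course groups a hub already knows about.
# The canvas ID is the number embedded in each "course::<id>..." group name.
curl -s -H "Authorization: token ${JUPYTERHUB_API_TOKEN}" \
  "https://data100.datahub.berkeley.edu/hub/api/groups" \
  | jq -r '.[].name' | sort -u
```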
There are a number of other Canvas course attributes we could have substituted for the ID, but all had various drawbacks. An SIS ID attribute uses a consistent format that is relatively easy to predict, however it is only exposed to instructor accounts on hub login. In testing, when the Canvas admin configured student accounts to be able to read the SIS ID, we discovered that other protected SIS attributes would have been visible to all members of the course in the Canvas UI. Various friendly name attributes (e.g. \"Statistics 123, Spring \'24\") were inconsistent in structure or were modifiable by the instructor. So while the Canvas ID is not predictable or easily discoverable by hub staff, it is immutable and the instructor can find it in the URL for their course.

## Assigning Scopes to Roles

When JupyterHub only had two roles, admin and user, we would grant admin rights to course staff. This enabled course staff to start, access, and stop user servers, but it wasn\'t scoped to just the students in their own course. It would give them access to the accounts of everyone on the hub. They even had access to stop the hub process itself. JupyterHub now lets us create our own roles and assign [scopes](https://jupyterhub.readthedocs.io/en/stable/rbac/scopes.html) to them. As a result, we can grant course staff the ability to do what they need for members of their own course, and nothing more.

Add the following configuration for course staff who need elevated access:

``` yaml
jupyterhub:
  hub:
    loadRoles:
      # Data 123, Summer 2024, #9876
      course-staff-1234567:
        description: Enable course staff to view and access servers.
        # this role provides permissions to...
        scopes:
          - admin-ui
          - list:users!group=course::1234567
          - admin:servers!group=course::1234567
          - access:servers!group=course::1234567
        # this role will be assigned to...
        groups:
          - course::1234567::enrollment_type::teacher
          - course::1234567::enrollment_type::ta
```

This configuration is headed by a comment which describes the course and term and links to the github issue where the staff made the request. It defines a new role, `course-staff-1234567`, for a course with bCourse ID `1234567`. It assigns scopes for accessing and administering the servers for users in group `course::1234567`. Members of that group include all students and course staff. It also assigns scopes for viewing lists of users at /hub/admin. It assigns these scopes to members of the affiliated course staff groups.

This stanza is more verbose than inserting lists of users under `admin_users`, but the privileges are more granular. We don\'t need to know who the individual course staff are, and they won\'t have more permissions than they need.

The configuration causes JupyterHub to update information in its `jupyterhub.sqlite` database file. When this configuration is removed, the hub does not automatically flush out the roles and scopes from the database. So after the semester is over, it is advisable to remove this configuration and also to flush out the information in the database. There is no formal process for this, although we should develop one. We can delete the database, or we can manually remove entries from the sqlite file.

## Defining group profiles

1. Require course staff to request additional resources through a
   [github issue](https://github.com/berkeley-dsep-infra/datahub/issues/new/choose).

2. Obtain the bCourses course ID from the github issue.
This ID is + found in the course's URL, e.g. + https://bcourses.berkeley.edu/courses/123456. It + should be a large integer. If the instructor requested resources for + a specific group within the course, obtain the group name. + +3. Edit `deployments/{deployment}/config/common.yaml`. + +4. Duplicate an existing stanza, or create a new one under + `jupyterhub.custom.group_profiles` by inserting yaml of + the form: + + ``` yaml + jupyterhub: + custom: + group_profiles: + + # Example: increase memory for everyone affiliated with a course. + # Name of Class 100, Fall '22; requested in #98765 + + course::123456: + mem_limit: 4096M + mem_guarantee: 2048M + + + # Example: increase memory just for course staff. + # Enrollment types returned by the Canvas API are `teacher`, + # `student`, `ta`, `observer`, and `designer`. (non-plural) + # https://canvas.instructure.com/doc/api/enrollments.html + + # Some other class 200, Spring '23; requested in #98776 + course::234567::enrollment_type::teacher: + mem_limit: 2096M + mem_guarantee: 2048M + course::234567::enrollment_type::ta: + mem_limit: 2096M + mem_guarantee: 2048M + + + # Example: a fully specified CanvasOAuthenticator group name where + # the resource request happens to be an additional mount path. + # Creating groups for temporary resource bumps could be useful + # where the instructor could add people to groups in the bCourses + # UI. This would benefit from the ability to read resource bumps + # from jupyterhub's properties. (attributes in the ORM) + + # Name of Class 100, Fall '22; requested in #98770 + course::123456::group::lab4-bigdata: + - mountPath: /home/rstudio/.ssh + name: home + subPath: _some_directory/_ssh + readOnly: true + ``` + + Our custom KubeSpawner knows to look for these values under + [jupyterhub.custom](https://z2jh.jupyter.org/en/stable/resources/reference.html#custom). + + `123456` and `234567` are bCourse course + identifiers from the first step. Memory limits and extra volume + mounts are specified as in the examples above. + +5. Add a comment associating the profile identifier with a friendly + name of the course. Also link to the github issue where the + instructor requested the resources. This helps us to cull old + configuration during maintenance windows. + +6. Commit the change, then ask course staff to verify the increased + allocation on staging. It is recommended that they simulate + completing a notebook or run through the assignment which requires + extra resources. + +## Defining user profiles + +It may be necessary to assign additional resources to specific users, if +it is too difficult to assign them to a bCourses group. + +1. Edit `deployments/{deployment}/config/common.yaml`. + +2. Duplicate an existing stanza, or create a new one under + `jupyterhub.custom.profiles` by inserting yaml of the + form: + + ``` yaml + jupyterhub: + custom: + profiles: + + # Example: increase memory for these specific users. + special_people: + # Requested in #87654. Remove after YYYY-MM-DD. + mem_limit: 2048M + mem_guarantee: 2048M + users: + - user1 + - user2 + ``` + +3. Add a comment which links to the github issue where the resources + were requested. This helps us to cull old configuration during + maintenance windows. + +## Housekeeping + +Group profiles should be removed at the end of every term because course +affiliations are not necessarily removed from each person\'s Canvas +account. 
So even if a user\'s class ended, the hub will grant additional +resources for as long as the config persisted in both Canvas and the +hub. + +User profiles should also be evaluated at the end of every term. diff --git a/docs/admins/howto/course-config.rst b/docs/admins/howto/course-config.rst deleted file mode 100644 index f3df14801..000000000 --- a/docs/admins/howto/course-config.rst +++ /dev/null @@ -1,147 +0,0 @@ -.. _howto/course-config: - -==================== -Course Configuration -==================== - -Allocating Resources -==================== -It is possible to alter administrative priviliges or resources allocations (such as memory or extra volumes) of user servers from within the deployment configuration. This is mostly useful for when resources need to be increased based on users' class enrollments. The hub must be configured to use the `CanvasOAuthenticator `_ which is our default. Hubs that use dummy, Google, Generic OAuth, or other authenticators are not configured to allocate additional resources in this way. - -Additionally, it is also possible to allocate resources based on the students membership of Canvas groups. This is useful if the instructor wants to dynamically grant additional resources without CI round-trips. Group management can be performed by the course staff directly from bCourses. - -Implementation -============== -The authenticator reads users Canvas enrollments when they login, and then assigns them to JupyterHub groups based on those affiliations. Groups are named with the format "course::{canvas_id}::enrollment_type::{canvas_role}", e.g. "course::123456::enrollment_type::teacher" or "course::234567::enrollment_type::student". Our custom kubespawner, which we define in `hub/values.yaml`, reads users' group memberships prior to spawning. It then overrides various KubeSpawner paramters based on configuration we define, using the canvas ID as the key. (see below) - -Note that if a user is assigned to a new Canvas group (e.g. by the instructor manually, or by an automated Canvas/SIS system) while their server is already running, they will need to logout and then log back in in order for the authenticator to see the new affiliations. Restarting the user server is not sufficient. - -The canvas ID is somewhat opaque to infrastructure staff -- we cannot look it up ourselves nor predict what it would be based on the name of the course. This is why we must request it from the instructor. - -There are a number of other Canvas course attributes we could have substituted for the ID, but all had various drawbacks. An SIS ID attribute uses a consistent format that is relatively easy to predict, however it is only exposed to instructor accounts on hub login. In testing, when the Canvas admin configured student accounts to be able to read the SIS ID, we discovered that other protected SIS attributes would have been visible to all members of the course in the Canvas UI. Various friendly name attributes (e.g. "Statistics 123, Spring '24") were inconsistent in structure or were modifiable by the instructor. So while the Canvas ID is not predictable or easily discoverable by hub staff, it is immutable and the instructor can find it in the URL for their course. - -Assigning Scopes to Roles -========================= -When JupyterHub only had two roles, admin and user, we would grant admin rights to course staff. This enabled course staff to start, access, and stop user servers, but it wasn't scoped to just the students in their own course. 
It would give them access to the accounts of everyone on the hub. They even had access to stop the hub process itself. JupyterHub now lets us create our own roles and assign `scopes `_ to them. As a result, we can grant course staff the ability to do what they need for members of their own course, and nothing more. - -Add the following configuration for course staff who need elevated access: - - .. code:: yaml - - jupyterhub: - hub: - loadRoles: - # Data 123, Summer 2024, #9876 - course-staff-1234567: - description: Enable course staff to view and access servers. - # this role provides permissions to... - scopes: - - admin-ui - - list:users!group=course::1234567 - - admin:servers!group=course::1234567 - - access:servers!group=course::1234567 - # this role will be assigned to... - groups: - - course::1234567::enrollment_type::teacher - - course::1234567::enrollment_type::ta - -This configuration is headed by a comment which describes the course and term and links to the github issue where the staff made the request. It defines a new role, `course-staff-1234567`, for a course with bCourse ID `1234567`. It assigns scopes for accessing and administering the servers for users in group `course::1234567`. Members of that group include all students and course staff. It also assigns scopes for viewing lists of users at /hub/admin. It assignes these scopes to members of the affiliated course staff groups. - -This stanza is more verbose than inserting lists of users under `admin_users`, but it the privileges are more granular. We don't need to know who the individual course staff and they won't have more permissions than they need. - -The configuration causes JupyterHub to update information in its `jupyterhub.sqlite` database file. When this configuraition is removed, the hub does not automatically flush out the roles and scopes from the database. So after the semester is over, it is advisable to remove this configuration and also to flush out the information in the database. There is no formal process for this, although we should develop one. We can delete the database, or we can manually remove entries from the sqlite file. - -Defining group profiles -======================= - -#. Require course staff to request additional resources through a `github issue _`. - -#. Obtain the bCourses course ID from the github issue. This ID is found in the course's URL, e.g. `https://bcourses.berkeley.edu/courses/123456`. It should be a large integer. If the instructor requested resources for a specific group within the course, obtain the group name. - -#. Edit `deployments/{deployment}/config/common.yaml`. - -#. Duplicate an existing stanza, or create a new one under `jupyterhub.custom.group_profiles` by inserting yaml of the form: - - .. code:: yaml - - jupyterhub: - custom: - group_profiles: - - # Example: increase memory for everyone affiliated with a course. - # Name of Class 100, Fall '22; requested in #98765 - - course::123456: - mem_limit: 4096M - mem_guarantee: 2048M - - - # Example: increase memory just for course staff. - # Enrollment types returned by the Canvas API are `teacher`, - # `student`, `ta`, `observer`, and `designer`. 
(non-plural) - # https://canvas.instructure.com/doc/api/enrollments.html - - # Some other class 200, Spring '23; requested in #98776 - course::234567::enrollment_type::teacher: - mem_limit: 2096M - mem_guarantee: 2048M - course::234567::enrollment_type::ta: - mem_limit: 2096M - mem_guarantee: 2048M - - - # Example: a fully specified CanvasOAuthenticator group name where - # the resource request happens to be an additional mount path. - # Creating groups for temporary resource bumps could be useful - # where the instructor could add people to groups in the bCourses - # UI. This would benefit from the ability to read resource bumps - # from jupyterhub's properties. (attributes in the ORM) - - # Name of Class 100, Fall '22; requested in #98770 - course::123456::group::lab4-bigdata: - - mountPath: /home/rstudio/.ssh - name: home - subPath: _some_directory/_ssh - readOnly: true - - - Our custom KubeSpawner knows to look for these values under `jupyterhub.custom `_. - - `123456` and `234567` are bCourse course identifiers from the first step. Memory limits and extra volume mounts are specified as in the examples above. - -#. Add a comment associating the profile identifier with a friendly name of the course. Also link to the github issue where the instructor requested the resources. This helps us to cull old configuration during maintenance windows. - -#. Commit the change, then ask course staff to verify the increased allocation on staging. It is recommended that they simulate completing a notebook or run through the assignment which requires extra resources. - -Defining user profiles -====================== - -It may be necessary to assign additional resources to specific users, if it is too difficult to assign them to a bCourses group. - -#. Edit `deployments/{deployment}/config/common.yaml`. - -#. Duplicate an existing stanza, or create a new one under `jupyterhub.custom.profiles` by inserting yaml of the form: - - .. code:: yaml - - jupyterhub: - custom: - profiles: - - # Example: increase memory for these specific users. - special_people: - # Requested in #87654. Remove after YYYY-MM-DD. - mem_limit: 2048M - mem_guarantee: 2048M - users: - - user1 - - user2 - -#. Add a comment which links to the github issue where the resources were requested. This helps us to cull old configuration during maintenance windows. - -Housekeeping -============ - -Group profiles should be removed at the end of every term because course affiliations are not necessarily removed from each person's Canvas account. So even if a user's class ended, the hub will grant additional resources for as long as the config persisted in both Canvas and the hub. - -User profiles should also be evaluated at the end of every term. diff --git a/docs/admins/howto/delete-hub.qmd b/docs/admins/howto/delete-hub.qmd new file mode 100644 index 000000000..d61fbcb0f --- /dev/null +++ b/docs/admins/howto/delete-hub.qmd @@ -0,0 +1,50 @@ +--- +title: Delete or spin down a Hub +--- + +## Why delete or spin down a hub? + +Sometimes we want to spin down or delete a hub: + +1. A course or department won't be needing their hub for a while +2. The hub will be re-deployed in to a new or shared node pool. + +## Steps to spin down a hub + +If the hub is using a shared filestore, skip all filestore steps. + +If the hub is using a shared node pool, skip all namespace and node pool +steps. + +1. Scale the node pool to zero: + `kubectl -n scale --replicas=0 deployment/hub` +2. Kill any remaining users' servers. 
Find any running servers with + `kubectl -n get pods | grep jupyter` and then + `kubectl -n delete pod ` to stop + them. +3. Create filestore backup: +```bash +gcloud filestore backups create -backup-YYYY-MM-DD --file-share=shares --instance= --region "us-central1" --labels=filestore-backup=,hub= +``` +4. Log in to `nfsserver-01` and unmount filestore from nfsserver: + `sudo umount /export/-filestore` +5. Comment out the hub build steps out in `.circleci/config.yaml` + (deploy and build steps) +6. Comment out GitHub label action for this hub in + `.github/labeler.yml` +7. Comment hub entries out of `datahub/node-placeholder/values.yaml` +8. Delete k8s namespace: +```bash +kubectl delete namespace -staging -prod +``` +9. Delete k8s node pool: +```bash +gcloud container node-pools delete --project "ucb-datahub-2018" --cluster "spring-2024" --region "us-central1" +``` +10. Delete filestore +```bash +gcloud filestore instances delete -filestore --zone "us-central1-b" +``` +11. Delete PV: `kubectl get pv --all-namespaces|grep ` to get + the PV names, and then `kubectl delete pv ` +12. All done. diff --git a/docs/admins/howto/delete-hub.rst b/docs/admins/howto/delete-hub.rst deleted file mode 100644 index 4e814426a..000000000 --- a/docs/admins/howto/delete-hub.rst +++ /dev/null @@ -1,36 +0,0 @@ - -.. _howto/delete-hub: - -================ -Delete or spin down a Hub -================ - - -Why delete or spin down a hub? -===================== - -Sometimes we want to spin down or delete a hub: - -#. A course or department won't be needing their hub for a while -#. The hub will be re-deployed in to a new or shared node pool. - - - -Steps to spin down a hub: ------------- -If the hub is using a shared filestore, skip all filestore steps. - -If the hub is using a shared node pool, skip all namespace and node pool steps. - -#. Scale the node pool to zero: ``kubectl -n scale --replicas=0 deployment/hub`` -#. Kill any remaining users' servers. Find any running servers with ``kubectl -n get pods | grep jupyter`` and then ``kubectl -n delete pod `` to stop them. -#. Create filestore backup: ``gcloud filestore backups create -backup-YYYY-MM-DD --file-share=shares --instance= --region "us-central1" --labels=filestore-backup=,hub=`` -#. Log in to ``nfsserver-01`` and unmount filestore from nfsserver: ``sudo umount /export/-filestore`` -#. Comment out the hub build steps out in ``.circleci/config.yaml`` (deploy and build steps) -#. Comment out GitHub label action for this hub in ``.github/labeler.yml`` -#. Comment hub entries out of ``datahub/node-placeholder/values.yaml`` -#. Delete k8s namespace: ``kubectl delete namespace -staging -prod`` -#. Delete k8s node pool: ``gcloud container node-pools delete --project "ucb-datahub-2018" --cluster "spring-2024" --region "us-central1"`` -#. Delete filestore: ``gcloud filestore instances delete -filestore --zone "us-central1-b"`` -#. Delete PV: ``kubectl get pv --all-namespaces|grep `` to get the PV names, and then ``kubectl delete pv `` -#. All done. diff --git a/docs/admins/howto/dns.qmd b/docs/admins/howto/dns.qmd new file mode 100644 index 000000000..6919291ee --- /dev/null +++ b/docs/admins/howto/dns.qmd @@ -0,0 +1,53 @@ +--- +title: Update DNS +--- + +Some staff have access to make and update DNS entries in the +.datahub.berkeley.edu and .data8x.berkeley.edu subdomains. + +## Authorization + +Request access to make changes by creating an issue in this repository. + +Authorization is granted via membership in the +edu:berkeley:org:nos:DDI:datahub CalGroup. 
\@yuvipanda and \@ryanlovett +are group admins and can update membership. + +## Making Changes + +1. Log into [Infoblox](https://infoblox.net.berkeley.edu) from a campus + network or through the [campus + VPN](https://software.berkeley.edu/cisco-vpn). Use your CalNet + credentials. +2. Navigate to Data Management \> DNS \> Zones and click + `berkeley.edu`. +3. Navigate to Subzones and choose either data8x or datahub, then click + `Records`. + +:::{tip} +For quicker access, click the star next to the zone name to make a bookmark in the Finder pane on the left side. +::: + +### Create a new record + +1. Click the down arrow next to `+ Add` in the right-side Toolbar. Then + choose Record \> A Record. +2. Enter the name and IP of the A record, and uncheck + `Create associated PTR record`. +3. Consider adding a comment with a timestamp, your ID, and the nature + of the change. +4. Click `Save & Close`. + +### Edit an existing record + +1. Click the gear icon to the left of the record\'s name and choose + `Edit`. +2. Make a change. +3. Consider adding a comment with a timestamp, your ID, and the nature + of the change. +4. Click `Save & Close`. + +### Delete a record + +1. Click the gear icon to the left of the record\'s name and choose + `Delete`. diff --git a/docs/admins/howto/dns.rst b/docs/admins/howto/dns.rst deleted file mode 100644 index 393cdaa75..000000000 --- a/docs/admins/howto/dns.rst +++ /dev/null @@ -1,48 +0,0 @@ -.. _howto/dns: - -========== -Update DNS -========== - -Some staff have access to make and update DNS entries in the .datahub.berkeley.edu and .data8x.berkeley.edu subdomains. - - -Authorization -============= - -Request access to make changes by creating an issue in this repository. - -Authorization is granted via membership in the edu:berkeley:org:nos:DDI:datahub CalGroup. @yuvipanda and @ryanlovett are group admins and can update membership. - -Making Changes -============== - -#. Log into `Infoblox `_ from a campus network - or through the `campus VPN `_. Use - your CalNet credentials. -#. Navigate to Data Management > DNS > Zones and click ``berkeley.edu``. -#. Navigate to Subzones and choose either data8x or datahub, then click - ``Records``. - -.. tip:: - - For quicker access, click the star next to the zone name to make a bookmark - in the ``Finder`` pane on the left side. - -Create a new record -------------------- -#. Click the down arrow next to ``+ Add`` in the right-side Toolbar. Then choose Record > A Record. -#. Enter the name and IP of the A record, and uncheck ``Create associated PTR record``. -#. Consider adding a comment with a timestamp, your ID, and the nature of the change. -#. Click ``Save & Close``. - -Edit an existing record ------------------------ -#. Click the gear icon to the left of the record's name and choose ``Edit``. -#. Make a change. -#. Consider adding a comment with a timestamp, your ID, and the nature of the change. -#. Click ``Save & Close``. - -Delete a record ----------------- -#. Click the gear icon to the left of the record's name and choose ``Delete``. diff --git a/docs/admins/howto/github-token.qmd b/docs/admins/howto/github-token.qmd new file mode 100644 index 000000000..b4ad1800c --- /dev/null +++ b/docs/admins/howto/github-token.qmd @@ -0,0 +1,13 @@ +--- +title: Create Finely Grained Access Token +--- + +At : + +1. Token name: set something descriptive. +2. Expiration: set the token to expire no earlier or later than + necessary. +3. Description: elaborate on the function of the token. +4. 
Resource owner: *berkeley-dsep-infra* +5. Repository access: Only selected repositories > *datahub* +6. Permissions: Contents > Access: Read and write diff --git a/docs/admins/howto/google-sheets.qmd b/docs/admins/howto/google-sheets.qmd new file mode 100644 index 000000000..45b8f8bdd --- /dev/null +++ b/docs/admins/howto/google-sheets.qmd @@ -0,0 +1,69 @@ +--- +title: Reading Google Sheets from DataHub +--- + +Available in: DataHub + +We provision and make available credentials for a [service +account](https://cloud.google.com/iam/docs/understanding-service-accounts) +that can be used to provide readonly access to Google Sheets. This is +useful in pedagogical situations where data is read from Google Sheets, +particularly with the [gspread](https://gspread.readthedocs.io/) +library. + +The entire contents of the JSON formatted service account key is +available as an environment variable `GOOGLE_SHEETS_READONLY_KEY`. You +can use this to read publicly available Google Sheet documents. + +The service account has no implicit permissions, and can be found under +`singleuser.extraEnv.GOOGLE_SHEETS_READONLY_KEY` in +`datahub/secrets/staging.yaml` and `datahub/secrets/prod.yaml`. + +## `gspread` sample code + +The following sample code reads a sheet from a URL given to it, and +prints the contents. + +``` python +import gspread +import os +import json +from oauth2client.service_account import ServiceAccountCredentials + +# Authenticate to Google +scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'] +creds = ServiceAccountCredentials.from_json_keyfile_dict(json.loads(os.environ['GOOGLE_SHEETS_READONLY_KEY']), scope) +gc = gspread.authorize(creds) + +# Pick URL of Google Sheet to open +url = 'https://docs.google.com/spreadsheets/d/1SVRsQZWlzw9lV0MT3pWlha_VCVxWovqvu-7cb3feb4k/edit#gid=0' + +# Open the Google Sheet, and print contents of sheet 1 +sheet = gc.open_by_url(url) +print(sheet.sheet1.get_all_records()) +``` + +## `gspread-pandas` sample code + +The [gspread-pandas](https://gspread-pandas.readthedocs.io/) library +helps get data from Google Sheets into a +[pandas](https://pandas.pydata.org/) dataframe. + +``` python +from gspread_pandas.client import Spread +import os +import json +from oauth2client.service_account import ServiceAccountCredentials + +# Authenticate to Google +scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'] +creds = ServiceAccountCredentials.from_json_keyfile_dict(json.loads(os.environ['GOOGLE_SHEETS_READONLY_KEY']), scope) + +# Pick URL of Google Sheet to open +url = 'https://docs.google.com/spreadsheets/d/1SVRsQZWlzw9lV0MT3pWlha_VCVxWovqvu-7cb3feb4k/edit#gid=0' + +# Open the Google Sheet, and print contents of sheet 1 as a dataframe +spread = Spread(url, creds=creds) +sheet_df = spread.sheet_to_df(sheet='sheet1') +print(sheet_df) +``` diff --git a/docs/admins/howto/index.qmd b/docs/admins/howto/index.qmd new file mode 100644 index 000000000..f6a765d36 --- /dev/null +++ b/docs/admins/howto/index.qmd @@ -0,0 +1,3 @@ +--- +title: Common Administrator Tasks +--- diff --git a/docs/admins/howto/index.rst b/docs/admins/howto/index.rst deleted file mode 100644 index c6a0dd6ea..000000000 --- a/docs/admins/howto/index.rst +++ /dev/null @@ -1,21 +0,0 @@ -========================== -Common Administrator Tasks -========================== - -.. 
toctree:: - :maxdepth: 1 - - preview-local - admin - dns - core-pool - new-hub - rebuild-hub-image - new-image - new-packages - course-config - calendar-scaler - prometheus-grafana - remove-users-orm - delete-hub - clusterswitch.md diff --git a/docs/admins/howto/new-hub.qmd b/docs/admins/howto/new-hub.qmd new file mode 100644 index 000000000..e2781f338 --- /dev/null +++ b/docs/admins/howto/new-hub.qmd @@ -0,0 +1,414 @@ +--- +title: Create a New Hub +--- + +## Why create a new hub? + +The major reasons for making a new hub are: + +1. A new course wants to join the Berkeley DataHub community. +2. One of your *students* are course staff in another course and have *elevated access*, enabling them to see other students' work. +3. You want to use a different kind of authenticator. +4. You are running in a different cloud, or using a different billing + account. +5. Your environment is different enough and specialized enough that a + different hub is a good idea. By default, everyone uses the same + image as datahub.berkeley.edu. +6. You want a different URL (X.datahub.berkeley.edu vs just + datahub.berkeley.edu) + +Please let us know if you have some other justification for creating a new hub. + +## Prerequisites + +Working installs of the following utilities: + + - [sops](https://github.com/mozilla/sops/releases) + - [hubploy](https://hubploy.readthedocs.io/en/latest/index.html) + - [gcloud](https://cloud.google.com/sdk/docs/install) + - [kubectl](https://kubernetes.io/docs/tasks/tools/) + - [cookiecutter](https://github.com/audreyr/cookiecutter) + +Proper access to the following systems: + + - Google Cloud IAM: *owner* + - Write access to the [datahub repo](https://github.com/berkeley-dsep-infra/datahub) + - CircleCI account linked to our GitHub organization. + +## Configuring a New Hub + +### Name the hub + +Choose the hub name, e.g. *data8*, *stat20*, *biology*, *julia*, which is typically the name of the course or department. This is permanent. + +### Determine deployment needs + +Before creating a new hub, have a discussion with the instructor about +the system requirements, frequency of assignments and how much storage +will be required for the course. Typically, there are three general +"types" of hub: Heavy usage, general and small courses. + +Small courses will usually have one or two assignments per semester, and +may only have 20 or fewer users. + +General courses have up to \~500 users, but don't have large amount of +data or require upgraded compute resources. + +Heavy usage courses can potentially have thousands of users, require +upgraded node specs and/or have Terabytes of data each semester. + +Both general and heavy usage courses typically have weekly assignments. + +Small courses (and some general usage courses) can use either or both of +a shared node pool and filestore to save money (Basic HDD filestore +instances start at 1T). + +This is also a good time to determine if there are any specific software +packages/libraries that need to be installed, as well as what +language(s) the course will be using. This will determine which image to +use, and if we will need to add additional packages to the image build. + +If you're going to use an existing node pool and/or filestore instance, +you can skip either or both of the following steps and pick back up at +the `cookiecutter`. + +When creating a new hub, we also make sure to label the filestore and +GKE/node pool resouces with both `hub` and +`-deployment`. 
99.999% of the time, the values for +all three of these labels will be ``. + +### Creating a new node pool + +Create the node pool: + +``` bash +gcloud container node-pools create "user--" \ + --labels=hub=,nodepool-deployment= \ + --node-labels hub.jupyter.org/pool-name=-pool \ + --machine-type "n2-highmem-8" \ + --enable-autoscaling --min-nodes "0" --max-nodes "20" \ + --project "ucb-datahub-2018" --cluster "spring-2024" \ + --region "us-central1" --node-locations "us-central1-b" \ + --node-taints hub.jupyter.org_dedicated=user:NoSchedule --tags hub-cluster \ + --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "200" \ + --metadata disable-legacy-endpoints=true \ + --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" \ + --no-enable-autoupgrade --enable-autorepair \ + --max-surge-upgrade 1 --max-unavailable-upgrade 0 --max-pods-per-node "110" +``` + +### Creating a new filestore instance + +Before you create a new filestore instance, be sure you know the +capacity required. The smallest amount you can allocate is 1T, but +larger hubs may require more. Confer with the admins and people +instructing the course and determine how much they think they will need. + +We can easily scale capacity up, but not down. + +From the command line, first fill in the instance name +(`-`) and ``, and then execute the +following command: + +``` bash +gcloud filestore instances create - \ + --zone "us-central1-b" --tier="BASIC_HDD" \ + --file-share=capacity=1TiB,name=shares \ + --network=name=default,connect-mode=DIRECT_PEERING +``` + +Or, from the web console, click on the horizontal bar icon at the top +left corner. + +1. Access "Filestore" > "Instances" and click on "Create Instance". +2. Name the instance `-` +3. Instance Type is `Basic`, Storage Type is `HDD`. +4. Allocate capacity. +5. Set the region to `us-central1` and Zone to `us-central1-b`. +6. Set the VPC network to `default`. +7. Set the File share name to `shares`. +8. Click "Create" and wait for it to be deployed. +9. Once it's deployed, select the instance and copy the "NFS mount + point". + +Your new (but empty) NFS filestore must be seeded with a pair of +directories. We run a utility VM for NFS filestore management; follow +the steps below to connect to this utility VM, mount your new filestore, +and create & configure the required directories. + +You can run the following command in gcloud terminal to log in to the +NFS utility VM: + +```bash +gcloud compute ssh nfsserver-01 --zone=us-central1-b +``` + +Alternatively, launch console.cloud.google.com > Select *ucb-datahub-2018* as +the project name. + +1. Click on the three horizontal bar icon at the top left corner. +2. Access "Compute Engine" > "VM instances" > and search for + "nfs-server-01". +3. Select "Open in browser window" option to access NFS server via + GUI. + +Back in the NFS utility VM shell, mount the new share: + +``` bash +mkdir /export/-filestore +mount :/shares /export/-filestore +``` + +Create `staging` and `prod` directories owned by `1000:1000` under +`/export/-filestore/`. The path *might* differ if your +hub has special home directory storage needs. Consult admins if that's +the case. 
Here is the command to create the directory with appropriate +permissions: + +``` bash +install -d -o 1000 -g 1000 \ + /export/-filestore//staging \ + /export/-filestore//prod +``` + +Check whether the directories have permissions similar to the below +directories: + +``` bash +drwxr-xr-x 4 ubuntu ubuntu 45 Nov 3 20:33 a11y-filestore +drwxr-xr-x 4 ubuntu ubuntu 33 Jan 4 2022 astro-filestore +drwxr-xr-x 4 ubuntu ubuntu 16384 Aug 16 18:45 biology-filestore +``` + +### Create the hub deployment locally + +In the `datahub/deployments` directory, run `cookiecutter`. This sets up +the hub's configuration directory: + +``` bash +cookiecutter template/ +``` + +The cookiecutter template will prompt you to provide the following information: + +: - ``: Enter the chosen name of the hub. + - ``: Default is `ucb-datahub-2018`, do not change. + - ``: Default is `spring-2024`, do not change. + - ``: Name of the node pool (shared or individual) to + deploy on. + - `hub_filestore_share`: Default is `shares`, do not change. + - `hub_filestore_ip`: Enter the IP address of the filestore + instance. This is available from the web console. + - `hub_filestore_capacity`: Enter the allocated storage capacity. + This is available from the web console. + +This will generate a directory with the name of the hub you provided +with a skeleton configuration and all the necessary secrets. + +### Configure filestore security settings and GCP billing labels + +If you have created a new filestore instance, you will now need to apply +the `ROOT_SQUASH` settings. Please ensure that you've already created +the hub's root directory and both `staging` and `prod` directories, +otherwise you will lose write access to the share. We also attach labels +to a new filestore instance for tracking individual and full hub costs. + +Skip this step if you are using an existing/shared filestore. + +``` bash +gcloud filestore instances update --zone=us-central1-b \ + --update-labels=hub=,filestore-deployment= \ + --flags-file=/config/filestore/squash-flags.json +``` + +### Authentication + +Set up authentication via [bcourses](https://bcourses.berkeley.edu). We +have two canvas OAuth2 clients setup in bcourses for us - one for all +production hubs and one for all staging hubs. The configuration and +secrets for these are provided by the cookiecutter template, however the +new hubs need to be added to the authorized callback list maintained in +bcourses. + +1. Use `sops` to edit `secrets/dev.yaml` and `secrets/prod.yaml`, replacing the + cookiecutter hub_name. `cookiecutter` can't do this for you since + the values are encrypted. + +2. Add `-staging.datahub.berkeley.edu/hub/oauth_callback` to the + staging hub client (id 10720000000000594) + +3. Add `.datahub.berkeley.edu/hub/oauth_callback` to the + production hub client (id 10720000000000472) + +Please reach out to Jonathan Felder to set this up, or + if he is not available. + +### CircleCI + +The CircleCI configuration file `.circleci/config.yml` will need to +include directives for building and deploying your new hub at several +phases of the CircleCI process. Generally speaking, an adequate manual +strategy for this is to pick the name of an existing hub, find each +occurrence of that name, and add analogous entries for your new hub +alongside your example existing hub. Please order new entries for your +new hub in alphabetical order amongst the entries for existing hubs. 
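One way to enumerate the spots that need a new entry is to grep for the hub you are using as a model. This is only an illustrative helper; `a11y` below is an example, so substitute whichever existing hub you picked:

```bash
# Hypothetical helper: show each line that mentions the model hub so you can
# add matching, alphabetized entries for the new hub next to them.
grep -n "a11y" .circleci/config.yml
```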
+ +Here is a partial (but incomplete) sampling of some of the relevant +sections of the CircleCI configuration file: + +``` yaml +- run: + name: Deploy + command: | + hubploy deploy hub ${CIRCLE_BRANCH} + +- hubploy/build-image: + deployment: + name: image build + filters: + branches: + ignore: + - staging + - prod + + + - hubploy/build-image: + deployment: + name: image build + push: true + filters: + branches: + only: + - staging + + + - image build +``` + +Review hubploy.yaml file inside your project directory and update the +image name to the latest image. Something like this, + +``` yaml +image_name: us-central1-docker.pkg.dev/ucb-datahub-2018/user-images/a11y-user-image +``` + +### Add hub to the github labeler workflow + +The new hub will now need to be added to the github labeler workflow. + +Edit the file `.github/labeler.yml` and add an entry for this hub +(alphabetically) in the +`# add hub-specific labels for deployment changes` block: + +``` yaml +"hub: ": + - "deployments//**" +``` + +### Create placeholder node pool + +Node pools have a configured minimum size, but our cluster has the +ability to set aside additional placeholder nodes. These are nodes that +get spun up in anticipation of the pool needing to suddenly grow in +size, for example when large classes begin. + +If you are deploying to a shared node pool, there is no need to perform +this step. + +Otherwise, you'll need to add the placeholder settings in +`node-placeholder/values.yaml`. + +The node placeholder pod should have enough RAM allocated to it that it +needs to be kicked out to get even a single user pod on the node - but +not so big that it can't run on a node where other system pods are +running! To do this, we'll find out how much memory is allocatable to +pods on that node, then subtract the sum of all non-user pod memory +requests and an additional 256Mi of "wiggle room". This final number +will be used to allocate RAM for the node placeholder. + +1. Launch a server on https://*hubname*.datahub.berkeley.edu +2. Get the node name (it will look something like + `gke-spring-2024-user-datahub-2023-01-04-fc70ea5b-67zs`): + `kubectl get nodes | grep *hubname* | awk '{print $1}'` +3. Get the total amount of memory allocatable to pods on this node and + convert to bytes: + ```bash + kubectl get node -o jsonpath='{.status.allocatable.memory}' + ``` +4. Get the total memory used by non-user pods/containers on this node. + We explicitly ignore `notebook` and `pause`. Convert to bytes and + get the sum: + ```bash + kubectl get -A pod -l 'component!=user-placeholder' \ + --field-selector spec.nodeName= \ + -o jsonpath='{range .items[*].spec.containers[*]}{.name}{"\t"}{.resources.requests.memory}{"\n"}{end}' \ + | egrep -v 'pause|notebook' + ``` + +1. Subract the second number from the first, and then subtract another + 277872640 bytes (256Mi) for "wiggle room". +2. 
Add an entry for the new placeholder node config in `values.yaml`: + +```yaml +data102: + nodeSelector: + hub.jupyter.org/pool-name: data102-pool + resources: + requests: + # Some value slightly lower than allocatable RAM on the node pool + memory: 60929654784 + replicas: 1 +``` + +For reference, here's example output from collecting and calculating +the values for `data102`: + +``` bash +(gcpdev) ➜ ~ kubectl get nodes | grep data102 | awk '{print$1}' +gke-spring-2024-user-data102-2023-01-05-e02d4850-t478 +(gcpdev) ➜ ~ kubectl get node gke-spring-2024-user-data102-2023-01-05-e02d4850-t478 -o jsonpath='{.status.allocatable.memory}' # convert to bytes +60055600Ki% +(gcpdev) ➜ ~ kubectl get -A pod -l 'component!=user-placeholder' \ +--field-selector spec.nodeName=gke-spring-2024-user-data102-2023-01-05-e02d4850-t478 \ +-o jsonpath='{range .items[*].spec.containers[*]}{.name}{"\t"}{.resources.requests.memory}{"\n"}{end}' \ +| egrep -v 'pause|notebook' # convert all values to bytes, sum them +calico-node +fluentbit 100Mi +fluentbit-gke 100Mi +gke-metrics-agent 60Mi +ip-masq-agent 16Mi +kube-proxy +prometheus-node-exporter +(gcpdev) ➜ ~ # subtract the sum of the second command's values from the first value, then subtract another 277872640 bytes for wiggle room +(gcpdev) ➜ ~ # in this case: (60055600Ki - (100Mi + 100Mi + 60Mi + 16Mi)) - 256Mi +(gcpdev) ➜ ~ # (61496934400 - (104857600 + 104857600 + 16777216 + 62914560)) - 277872640 == 60929654784 +``` + +Besides setting defaults, we can dynamically change the placeholder +counts by either adding new, or editing existing, [calendar +events](https://docs.datahub.berkeley.edu/en/latest/admins/howto/calendar-scaler.html). +This is useful for large courses which can have placeholder nodes set +aside for predicatable periods of heavy ramp up. + +### Commit and deploy staging + +Commit the hub directory, and make a PR to the the `staging` branch in +the GitHub repo. Once tests pass, merge the PR to get a working staging +hub! It might take a few minutes for HTTPS to work, but after that you +can log into it at \-staging.datahub.berkeley.edu. +Test it out and make sure things work as you think they should. + +1. Make a PR from the `staging` branch to the `prod` branch. When this + PR is merged, it'll deploy the production hub. It might take a few + minutes for HTTPS to work, but after that you can log into it at + \.datahub.berkeley.edu. Test it out and make + sure things work as you think they should. +2. You may want to customize the docker image for the hub based on your + unique requirements. Navigate to deployments/'Project Name'/image + and review environment.yml file and identify packages that you want + to add from the `conda repository` \<\>. You + can copy the image manifest files from another deployment. It is + recommended to use a repo2docker-style image build, without a + Dockerfile, if possible. That format will probably serve as the + basis for self-service user-created images in the future. +3. All done. diff --git a/docs/admins/howto/new-hub.rst b/docs/admins/howto/new-hub.rst deleted file mode 100644 index 564f9e0e7..000000000 --- a/docs/admins/howto/new-hub.rst +++ /dev/null @@ -1,371 +0,0 @@ - -.. _howto/new-hub: - -================ -Create a new Hub -================ - - -Why create a new hub? -===================== - -The major reasons for making a new hub are: - -#. A new course wants to join the Berkeley Datahub community! -#. Some of your *students* are *admins* on another hub, - so they can see other students' work there. -#. 
You want to use a different kind of authenticator. -#. You are running in a different cloud, or using a different - billing account. -#. Your environment is different enough and specialized enough - that a different hub is a good idea. By default, everyone uses the - same image as datahub.berkeley.edu. -#. You want a different URL (X.datahub.berkeley.edu vs just - datahub.berkeley.edu) - -If your reason is something else, it probably needs some justification :) - -Prereqs -======= -Working installs of the following utilities: - - `sops `_ - - `hubploy `_ - - `hubploy docs `_ - - ``pip install hubploy`` - - `gcloud `_ - - `kubectl `_ - - `cookiecutter `_ - -Proper access to the following systems: - - Google Cloud IAM: owner - - Write access to the `datahub repo `_ - - CircleCI account linked to our org - -Setting up a new hub -==================== - -Name the hub ------------- -Choose the ```` (typically the course or department). This is permanent. - -Determine deployment needs --------------------------- -Before creating a new hub, have a discussion with the instructor about the system requirements, -frequency of assignments and how much storage will be required for the course. Typically, there -are three general "types" of hub: Heavy usage, general and small courses. - -Small courses will usually have one or two assignments per semester, and may only have 20 or -fewer users. - -General courses have up to ~500 users, but don't have large amount of data or require upgraded -compute resources. - -Heavy usage courses can potentially have thousands of users, require upgraded node specs and/or -have Terabytes of data each semester. - -Both general and heavy usage courses typically have weekly assignments. - -Small courses (and some general usage courses) can use either or both of a shared node pool and -filestore to save money (Basic HDD filestore instances start at 1T). - -This is also a good time to determine if there are any specific software packages/libraries that -need to be installed, as well as what language(s) the course will be using. This will determine -which image to use, and if we will need to add additional packages to the image build. - -If you're going to use an existing node pool and/or filestore instance, you can skip either or both of -the following steps and pick back up at the ``cookiecutter``. - -When creating a new hub, we also make sure to label the filestore and -GKE/node pool resouces with both ``hub`` and -``-deployment``. 99.999% of the time, the values for all -three of these labels will be ````. - -Creating a new node pool ------------------------- -Create the node pool: - -.. 
code:: bash - - gcloud container node-pools create "user--" \ - --labels=hub=,nodepool-deployment= \ - --node-labels hub.jupyter.org/pool-name=-pool \ - --machine-type "n2-highmem-8" \ - --enable-autoscaling --min-nodes "0" --max-nodes "20" \ - --project "ucb-datahub-2018" --cluster "spring-2024" \ - --region "us-central1" --node-locations "us-central1-b" \ - --node-taints hub.jupyter.org_dedicated=user:NoSchedule --tags hub-cluster \ - --image-type "COS_CONTAINERD" --disk-type "pd-balanced" --disk-size "200" \ - --metadata disable-legacy-endpoints=true \ - --scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" \ - --no-enable-autoupgrade --enable-autorepair \ - --max-surge-upgrade 1 --max-unavailable-upgrade 0 --max-pods-per-node "110" - - -Creating a new filestore instance ---------------------------------- -Before you create a new filestore instance, be sure you know the capacity -required. The smallest amount you can allocate is 1T, but larger hubs may -require more. Confer with the admins and people instructing the course and -determine how much they think they will need. - -We can easily scale capacity up, but not down. - -From the command line, first fill in the instance name (``-``) -and ````, and then execute the following command: - -.. code:: bash - - gcloud filestore instances create - \ - --zone "us-central1-b" --tier="BASIC_HDD" \ - --file-share=capacity=1TiB,name=shares \ - --network=name=default,connect-mode=DIRECT_PEERING - -Or, from the web console, click on the horizontal bar icon at the top left -corner. - -#. Access "Filestore" -> "Instances" and click on "Create Instance". -#. Name the instance ``-`` -#. Instance Type is ``Basic``, Storage Type is ``HDD``. -#. Allocate capacity. -#. Set the region to ``us-central1`` and Zone to ``us-central1-b``. -#. Set the VPC network to ``default``. -#. Set the File share name to ``shares``. -#. Click "Create" and wait for it to be deployed. -#. Once it's deployed, select the instance and copy the "NFS mount point". - -Your new (but empty) NFS filestore must be seeded with a pair of directories. We run a utility VM for -NFS filestore management; follow the steps below to connect to this utility VM, mount your new filestore, -and create & configure the required directories. - -You can run the following command in gcloud terminal to log in to the NFS utility VM: - -``gcloud compute ssh nfsserver-01 --zone=us-central1-b`` - -Alternatively, launch console.cloud.google.com -> Select "ucb-datahub-2018" as the project name. - -#. Click on the three horizontal bar icon at the top left corner. -#. Access "Compute Engine" -> "VM instances" -> and search for "nfs-server-01". -#. Select "Open in browser window" option to access NFS server via GUI. - -Back in the NFS utility VM shell, mount the new share: - -.. code:: bash - - mkdir /export/-filestore - mount :/shares /export/-filestore - -Create ``staging`` and ``prod`` directories owned by ``1000:1000`` under -``/export/-filestore/``. The path *might* differ if -your hub has special home directory storage needs. Consult admins if that's -the case. Here is the command to create the directory with appropriate permissions: - -.. 
code:: bash - - install -d -o 1000 -g 1000 \ - /export/-filestore//staging \ - /export/-filestore//prod - -Check whether the directories have permissions similar to the below directories: - -.. code:: bash - - drwxr-xr-x 4 ubuntu ubuntu 45 Nov 3 20:33 a11y-filestore - drwxr-xr-x 4 ubuntu ubuntu 33 Jan 4 2022 astro-filestore - drwxr-xr-x 4 ubuntu ubuntu 16384 Aug 16 18:45 biology-filestore - -Create the hub deployment locally ---------------------------------- -In the ``datahub/deployments`` directory, run ``cookiecutter``. This sets up the hub's configuration directory: - -.. code:: bash - - cookiecutter template/ - -The cookiecutter template will prompt you to provide the following information: - - ````: Enter the chosen name of the hub. - - ````: Default is ``ucb-datahub-2018``, do not change. - - ````: Default is ``spring-2024``, do not change. - - ````: Name of the node pool (shared or individual) to deploy on. - - ``hub_filestore_share``: Default is ``shares``, do not change. - - ``hub_filestore_ip``: Enter the IP address of the filestore instance. This is available from the web console. - - ``hub_filestore_capacity``: Enter the allocated storage capacity. This is available from the web console. - -This will generate a directory with the name of the hub you provided with a skeleton configuration and all the necessary secrets. - -Configure filestore security settings and GCP billing labels ------------------------------------------------------------- -If you have created a new filestore instance, you will now need to apply the -``ROOT_SQUASH`` settings. Please ensure that you've already created the hub's -root directory and both ``staging`` and ``prod`` directories, otherwise you will -lose write access to the share. We also attach labels to a new filestore -instance for tracking individual and full hub costs. - -Skip this step if you are using an existing/shared filestore. - -.. code:: bash - - gcloud filestore instances update --zone=us-central1-b \ - --update-labels=hub=,filestore-deployment= \ - --flags-file=/config/filestore/squash-flags.json - -Authentication --------------- -Set up authentication via `bcourses `_. -We have two canvas OAuth2 clients setup in bcourses for us - one for all -production hubs and one for all staging hubs. The configuration and secrets -for these are provided by the cookiecutter template, however the new hubs -need to be added to the authorized callback list maintained in bcourses. - -#. Use `sops` to edit `secrets/dev.yaml` and `secrets/prod.yaml`, replacing the cookiecutter hub_name. `cookiecutter` can't do this for you since the values are encrypted. -#. Add ``-staging.datahub.berkeley.edu/hub/oauth_callback`` to the - staging hub client (id 10720000000000594) -#. Add ``.datahub.berkeley.edu/hub/oauth_callback`` to the - production hub client (id 10720000000000472) - -Please reach out to Jonathan Felder to set this up, or -bcourseshelp@berkeley.edu if he is not available. - -CircleCI --------- -The CircleCI configuration file ``.circleci/config.yml`` will need to include directives for building -and deploying your new hub at several phases of the CircleCI process. -Generally speaking, an adequate manual strategy for this is to pick the name of an existing hub, -find each occurrence of that name, and add analogous entries for your new hub alongside your example existing hub. -Please order new entries for your new hub in alphabetical order amongst the entries for existing hubs. 
- -Here is a partial (but incomplete) sampling of some of the relevant sections of the CircleCI configuration file: - -.. code:: yaml - - - run: - name: Deploy - command: | - hubploy deploy hub ${CIRCLE_BRANCH} - -.. code:: yaml - - - hubploy/build-image: - deployment: - name: image build - filters: - branches: - ignore: - - staging - - prod - - - - hubploy/build-image: - deployment: - name: image build - push: true - filters: - branches: - only: - - staging - - - - image build - -Review hubploy.yaml file inside your project directory and update the image name to the latest image. Something like this, - -.. code:: yaml - - image_name: us-central1-docker.pkg.dev/ucb-datahub-2018/user-images/a11y-user-image - -Add hub to the github labeler workflow --------------------------------------- -The new hub will now need to be added to the github labeler workflow. - -Edit the file ``.github/labeler.yml`` and add an entry for this hub (alphabetically) in the -``# add hub-specific labels for deployment changes`` block: - -.. code:: yaml - - "hub: ": - - "deployments//**" - -Create placeholder node pool ----------------------------- -Node pools have a configured minimum size, but our cluster has the ability to set aside additional placeholder nodes. These are nodes that get spun up in anticipation of the pool needing to suddenly grow in size, for example when large classes begin. - -If you are deploying to a shared node pool, there is no need to perform this step. - -Otherwise, you'll need to add the placeholder settings in ``node-placeholder/values.yaml``. - -The node placeholder pod should have enough RAM allocated to it that it needs to be kicked out to get even a single user pod on the node - but not so big that it can't run on a node where other system pods are running! To do this, we'll find out how much memory is allocatable to pods on that node, then subtract the sum of all non-user pod memory requests and an additional 256Mi of "wiggle room". This final number will be used to allocate RAM for the node placeholder. - -#. Launch a server on https://.datahub.berkeley.edu -#. Get the node name (it will look something like ``gke-spring-2024-user-datahub-2023-01-04-fc70ea5b-67zs``): ``kubectl get nodes | grep | awk '{print$1}'`` -#. Get the total amount of memory allocatable to pods on this node and convert to bytes: ``kubectl get node -o jsonpath='{.status.allocatable.memory}'`` -#. Get the total memory used by non-user pods/containers on this node. We explicitly ignore ``notebook`` and ``pause``. Convert to bytes and get the sum: - -.. code:: bash - - kubectl get -A pod -l 'component!=user-placeholder' \ - --field-selector spec.nodeName= \ - -o jsonpath='{range .items[*].spec.containers[*]}{.name}{"\t"}{.resources.requests.memory}{"\n"}{end}' \ - | egrep -v 'pause|notebook' - -#. Subract the second number from the first, and then subtract another 277872640 bytes (256Mi) for "wiggle room". -#. Add an entry for the new placeholder node config in ``values.yaml``: - -.. code:: yaml - - data102: - nodeSelector: - hub.jupyter.org/pool-name: data102-pool - resources: - requests: - # Some value slightly lower than allocatable RAM on the node pool - memory: 60929654784 - replicas: 1 - -For reference, here's example output from collecting and calculating the values for ``data102``: - -.. 
code:: bash - - (gcpdev) ➜ ~ kubectl get nodes | grep data102 | awk '{print$1}' - gke-spring-2024-user-data102-2023-01-05-e02d4850-t478 - (gcpdev) ➜ ~ kubectl get node gke-spring-2024-user-data102-2023-01-05-e02d4850-t478 -o jsonpath='{.status.allocatable.memory}' # convert to bytes - 60055600Ki% - (gcpdev) ➜ ~ kubectl get -A pod -l 'component!=user-placeholder' \ - --field-selector spec.nodeName=gke-spring-2024-user-data102-2023-01-05-e02d4850-t478 \ - -o jsonpath='{range .items[*].spec.containers[*]}{.name}{"\t"}{.resources.requests.memory}{"\n"}{end}' \ - | egrep -v 'pause|notebook' # convert all values to bytes, sum them - calico-node - fluentbit 100Mi - fluentbit-gke 100Mi - gke-metrics-agent 60Mi - ip-masq-agent 16Mi - kube-proxy - prometheus-node-exporter - (gcpdev) ➜ ~ # subtract the sum of the second command's values from the first value, then subtract another 277872640 bytes for wiggle room - (gcpdev) ➜ ~ # in this case: (60055600Ki - (100Mi + 100Mi + 60Mi + 16Mi)) - 256Mi - (gcpdev) ➜ ~ # (61496934400 - (104857600 + 104857600 + 16777216 + 62914560)) - 277872640 == 60929654784 - - -Besides setting defaults, we can dynamically change the placeholder counts by either adding new, or editing existing, `calendar events `_. This is useful for large courses which can have placeholder nodes set aside for predicatable periods of heavy ramp up. - -Commit and deploy staging -------------------------- -Commit the hub directory, and make a PR to the the ``staging`` branch in the -GitHub repo. Once tests pass, merge the PR to get a working staging hub! It -might take a few minutes for HTTPS to work, but after that you can log into -it at https://-staging.datahub.berkeley.edu. Test it out and make -sure things work as you think they should. - -#. Make a PR from the ``staging`` branch to the ``prod`` branch. When this PR is - merged, it'll deploy the production hub. It might take a few minutes for HTTPS - to work, but after that you can log into it at - https://.datahub.berkeley.edu. Test it out and make sure things - work as you think they should. - -#. You may want to customize the docker image for the hub based on your unique - requirements. Navigate to deployments/'Project Name'/image and review - environment.yml file and identify packages that you want to add from - the ``conda repository`` . You can copy the image manifest - files from another deployment. It is recommended to use a repo2docker-style image - build, without a Dockerfile, if possible. That format will probably serve as the ' - basis for self-service user-created images in the future. - -#. All done. diff --git a/docs/admins/howto/new-image.qmd b/docs/admins/howto/new-image.qmd new file mode 100644 index 000000000..2085d12dc --- /dev/null +++ b/docs/admins/howto/new-image.qmd @@ -0,0 +1,116 @@ +--- +title: Creating a new single user image +--- + +When deploying a new hub, or moving from a shared single user server +image, you might need to create a new image for users. We use +[repo2docker](https://github.com/jupyterhub/repo2docker) to do this. + +There are two approaches to creating a repo2docker image: 1. Use a +repo2docker-style image +[template](https://github.com/berkeley-dsep-infra/datahub/tree/staging/deployments/data100/image) +(environment.yaml, etc) 2. 
Use a +[Dockerfile](https://github.com/berkeley-dsep-infra/datahub/tree/staging/deployments/datahub/images/default) +(useful for larger/more complex images) + +Generally, we prefer to use the former approach, unless we need to +install specific packages or utilities outside of python/apt as `root`. +If that is the case, only a `Dockerfile` format will work. + +Of course, as always, create a feature branch for your changes, and +submit a PR when done. + +## Find a hub to use as a template + +Browse through our `deployments/` directory to find a hub that is similar to +the one you are trying to create. This will give you a good starting point. + +## Create the `image/` directory for your new hub + +Create a new directory under `deployments/` with the name of your hub. This +directory will contain the files that will be used to create the image. + +Then, copy the contents (and any subdirectories) of the source +`image/` directory into the new directory. + +## Modify `hubploy.yaml` for the hub + +In the deployment's `hubploy.yaml` file, +add or modify the `name`, `path` and `base_image` fields to configure +the image build and where it's stored in the Google Artifact Registry. + +`name` should contain the path to the image in the Google Artifact +Registry and the name of the image. `path` points to the directory +containing the image configuration (typically `image/`). `base_image` is +the base Docker image to use for the image build. + +For example, `hubploy.yaml` for the data100 image looks like this: + +``` yaml +images: + images: + - name: us-central1-docker.pkg.dev/ucb-datahub-2018/user-images/data100-user-image + path: image/ + repo2docker: + base_image: docker.io/library/buildpack-deps:jammy + registry: + provider: gcloud + gcloud: + project: ucb-datahub-2018 + service_key: gcr-key.json + +cluster: +provider: gcloud +gcloud: + project: ucb-datahub-2018 + service_key: gke-key.json + cluster: spring-2024 + zone: us-central1 +``` + +## Modify the image configuration as necessary + +This step is straightforward: edit/modify/delete/add any files in the +`image/` directory to configure the image +as needed. + +## Update CI/CD configuration + +Next, ensure that this image will be built and deployed by updating the +`.circleci/config.yml` file in the root +of the repository. Add new steps under the `jobs/deploy:`, +`workflows/test-build-images:` and `workflows/deploy:` stanzas. + +## Submitting a pull request + +Familiarize yourself with [pull +requests](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests) +and [repo2docker](https://github.com/jupyter/repo2docker), and create a +fork of the [datahub staging +branch](https://github.com/berkeley-dsep-infra/datahub). + +1. Set up your git/dev environment by [following the instructions + here](https://github.com/berkeley-dsep-infra/datahub/#setting-up-your-fork-and-clones). + +2. Create a new branch for this PR. + +3. Test the changes locally using `repo2docker`, then submit a PR to `staging`. + + - To use `repo2docker`, you have to point it at the correct + image directory. For example, to build the data100 image, + you would run `repo2docker deployments/data100/image` from + the base datahub directory. + +4. Commit and push your changes to your fork of the datahub repo, and + create a new pull request at + . + +5. Once the PR is merged to staging and the new image is built and + pushed to Artifact Registry, you can test it out on + `-staging.datahub.berkeley.edu`. + +6. 
Changes are only deployed to prod once the relevant CI job is + completed. See + to view CircleCI job statuses. diff --git a/docs/admins/howto/new-image.rst b/docs/admins/howto/new-image.rst deleted file mode 100644 index 22b5cb549..000000000 --- a/docs/admins/howto/new-image.rst +++ /dev/null @@ -1,97 +0,0 @@ -.. _howto/new-packages: - -================================ -Creating a new single user image -================================ - -When deploying a new hub, or moving from a shared single user server image, -you might need to create a new image for users. We use -`repo2docker `_ to do this. - -There are two approaches to creating a repo2docker image: -1. Use a repo2docker-style image `template `_ (environment.yaml, etc) -2. Use a `Dockerfile `_ (useful for larger/more complex images) - -Generally, we prefer to use the former approach, unless we need to install -specific packages or utilities outside of python/apt as ``root``. If that is -the case, only a :file:`Dockerfile` format will work. - -Of course, as always create a feature branch for your changes, and submit a -PR when done. - -Find a hub to use as a template -=============================== - -Browse through our :file:`deployments/` directory to find a hub that is similar to -the one you are trying to create. This will give you a good starting point. - -Create the :file:`image/` directory for your new hub -==================================================== - -Create a new directory under :file:`deployments/` with the name of your hub. This -directory will contain the files that will be used to create the image. - -Then, copy the contents (and any subdirectories) of the source :file:`image/` -directory in to the new directory. - -Modify :file:`hubploy.yaml` for the hub -======================================= - -In the deployment's :file:`hubploy.yaml` file, add or modify the ``name``, ``path`` and -``base_image`` fields to configure the image build and where it's stored in the -Google Artifcat Registry. - -``name`` should contain the path to the image in the Google Artifact Registry and the name of the image. -``path`` points to the directory containing the image configuration (typically :file::`image/`). -``base_image`` is the base Docker image to use for the image build. - -For example, :file:`hubploy.yaml` for the data100 image looks like this: - -.. code:: yaml - - images: - images: - - name: us-central1-docker.pkg.dev/ucb-datahub-2018/user-images/data100-user-image - path: image/ - repo2docker: - base_image: docker.io/library/buildpack-deps:jammy - registry: - provider: gcloud - gcloud: - project: ucb-datahub-2018 - service_key: gcr-key.json - - cluster: - provider: gcloud - gcloud: - project: ucb-datahub-2018 - service_key: gke-key.json - cluster: spring-2024 - zone: us-central1 - -Modify the image configuration as necessary -=========================================== - -This step is straightforward: edit/modify/delete/add any files in the :file:`image/` -directory to configure the image as needed. - -Update CI/CD configuration -========================== - -Next, ensure that this image will be built and deployed by updating the -:file:`.circleci/config.yml` file in the root of the repository. Add new steps -under the ``jobs/deploy:``, ``workflows/test-build-images:`` and ``workflows/deploy:`` -stanzas. - -Submitting a pull request -========================= - -Familiarize yourself with `pull requests `_ and `repo2docker `_ , and create a fork of the `datahub staging branch `_. - -#. 
Set up your git/dev environment by `following the instructions here `_. -#. Create a new branch for this PR. -#. Test the changes locally using ``repo2docker``, then submit a PR to ``staging``. - * To use ``repo2docker``, you have to point it at the correct image directory. For example, to build the data100 image, you would run ``repo2docker deployments/data100/image`` from the base datahub directory. -#. Commit and push your changes to your fork of the datahub repo, and create a new pull request at ``__. -#. Once the PR is merged to staging and the new image is built and pushed to Artifact Registry, you can test it out on :code:`-staging.datahub.berkeley.edu`. -#. Changes are only deployed to prod once the relevant CI job is completed. See ``__ to view CircleCI job statuses. diff --git a/docs/admins/howto/new-packages.qmd new file mode 100644 index 000000000..782380d31 --- /dev/null +++ b/docs/admins/howto/new-packages.qmd @@ -0,0 +1,126 @@ +--- +title: Testing and Upgrading New Packages +--- + +It is helpful to test package additions and upgrades for yourself before +they are installed for all users. You can make sure the change behaves +as you think it should, and does not break anything else. Once tested, +request that the change be installed for all users by [creating a new +issue in +github](https://github.com/berkeley-dsep-infra/datahub/issues), contacting +curriculum support staff, or creating a new pull request. Ultimately, +thoroughly testing changes locally and submitting a pull request will +result in the software being rolled out to everyone much faster. + +## Install a Python package in your notebook + +When testing a notebook with a new version of a package, add the +following line to a cell at the beginning of your notebook. + +``` bash +!pip install --upgrade packagename==version +``` + +You can then execute this cell every time you run the notebook. This +will ensure you have the version you think you have when running your +code. + +To avoid complicated errors, make sure you always specify a version. You +can find the latest version by searching on +[pypi.org](https://pypi.org). + +## Find the current version of a Python package + +To find the current version of a particular installed package, you can +run the following in a notebook. + +``` bash +!pip list | grep +``` + +This should show you the particular package you are interested in and +its current version. + +## Install or update an R package in RStudio + +When the required version of a package is missing in RStudio, try the +following command to check whether the default installation repo +contains the package (and the version) required. + +``` R +install.packages("packagename") +``` + +This should install the particular package you are interested in at its +latest version. You can find the latest version of an R package by +searching on [CRAN](https://cran.r-project.org/). + +## Find the current version of an R package + +To find the current version of a particular installed package, you can +run the following in RStudio. + +``` R +packageVersion("") +``` + +This should show you the particular package you are interested in and +its current version. 
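+You can also check versions from a terminal on the hub instead of a notebook
+or RStudio. A minimal sketch, where `numpy` and `ggplot2` are just example
+package names:
+
+``` bash
+# Python: show the installed version of a single package
+pip show numpy | grep -i '^version'
+
+# R: print the installed version of a single package from the shell
+Rscript -e 'packageVersion("ggplot2")'
+```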
+ +## Submitting a pull request + +Familiarize yourself with [pull +requests](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests) +and [repo2docker](https://github.com/jupyter/repo2docker), and create a +fork of the [datahub staging +branch](https://github.com/berkeley-dsep-infra/datahub). + +1. Set up your git/dev environment by [following the instructions + here](https://github.com/berkeley-dsep-infra/datahub/#setting-up-your-fork-and-clones). + +2. Create a new branch for this PR. + +3. Find the correct `environment.yml` + file for your class. This should be under + `datahub/deployments//image`. + +4. In `environment.yml`, packages listed + under `dependencies` are installed using `conda`, while packages + under `pip` are installed using `pip`. Any packages that need to be + installed via `apt` must be added to either + `datahub/deployments//image/apt.txt` or + `datahub/deployments//image/Dockerfile`. + +5. Add any packages necessary. We typically prefer using `conda` packages, and `pip` only if necessary. Please pin to a specific version (no wildcards, etc). + + - Note that package versions for `conda` are specified using + `=`, while in `pip` they are specified using `==` + +6. Test the changes locally using `repo2docker`, then submit a PR to `staging`. + + - To use `repo2docker`, you have to point it at the right + image directory for your class. For example, to test the data100 + image, you would run `repo2docker deployments/data100/image` from the + base datahub directory. + +7. Commit and push your changes to your fork of the datahub repo, and + create a new pull request at + . + +8. Once the PR is merged to staging, you can test it out on + `class-staging.datahub.berkeley.edu`. + +9. Changes are only deployed to datahub once the relevant CircleCI job + is completed. See + to view CircleCI + job statuses. + +## Tips for Upgrading Packages + +- Conda can take an extremely long time to resolve version dependency + conflicts, if they are resolvable at all. When upgrading Python + versions or a core package that is used by many other packages, such + as `requests`, clean out or upgrade old packages to + minimize the number of dependency conflicts. diff --git a/docs/admins/howto/new-packages.rst deleted file mode 100644 index 8479701f2..000000000 --- a/docs/admins/howto/new-packages.rst +++ /dev/null @@ -1,89 +0,0 @@ -.. _howto/new-packages: - -================================== -Testing and Upgrading New Packages -================================== - -It is helpful to test package additions and upgrades for yourself before they -are installed for all users. You can make sure the change behaves as you think -it should, and does not break anything else. Once tested, request that the -change by installed for all users by by `creating a new issue in github -`_,contacting -cirriculum support staff, or creating a new pull request. Ultimately, -thouroughly testing changes locally and submitting a pull request will -result in the software being rolled out to everyone much faster. - -Install a python package in your notebook -================================== - -When testing a notebook with new version of the package, add the following line -to a cell at the beginning of your notebook. - - .. code:: bash - - !pip install --upgrade packagename==version - -You can then execute this cell every time you run the notebook. 
This will -ensure you have the version you think you have when running your code. - -To avoid complicated errors, make sure you always specify a version. You -can find the latest version by searching on `pypi.org `_. - -Find current version of a python package -=============================== - -To find the current version of a particular installed package, you can -run the following in a notebook. - - .. code:: bash - - !pip list | grep - -This should show you the particular package you are interested in and its -current version. - -Install/Update a R package in your RStudio -================================== - -When the required version of package is missing in the R Studio, Try the following command to check whether the default installation repo contains the package (and the version) required. - - .. code:: bash - - install.packages("packagename") - -This should install the particular package you are interested in and its latest version. You can find the latest version of a R package by searching on `CRAN `_. - -Find current version of a R package -=============================== - -To find the current version of a particular installed package, you can -run the following in RStudio. - - .. code:: bash - - packageVersion("") - -This should show you the particular package you are interested in and its -current version. - - -Submitting a pull request -========================= - -Familiarize yourself with `pull requests `_ and `repo2docker `_ , and create a fork of the `datahub staging branch `_. - -#. Set up your git/dev environment by `following the instructions here `_. -#. Create a new branch for this PR. -#. Find the correct :file:`environment.yml` file for your class. This should be under ``datahub/deployments//image`` -#. In :file:`environment.yml`, packages listed under :code:`dependencies` are installed using :code:`conda`, while packages under :code:`pip` are installed using :code:`pip`. Any packages that need to be installed via :code:`apt` must be added to either ``datahub/deployments//image/apt.txt`` or ``datahub/deployments//image/Dockerfile``. -#. Add any packages necessary. We typically prefer using :code:`conda` packages, and :code:`pip` only if necessary. Please pin to a specific version (no wildards, etc). - * Note that package versions for :code:`conda` are specified using :code:`=`, while in :code:`pip` they are specified using :code:`==` -#. Test the changes locally using :code:`repo2docker`, then submit a PR to ``staging``. - * To use ``repo2docker``, you have to point it at the right Dockerfile for your class. For example, to test the data100 datahub, you would run ``repo2docker deployments/data100/image`` from the base datahub directory. -#. Commit and push your changes to your fork of the datahub repo, and create a new pull request at ``__. -#. Once the PR is merged to staging, you can test it out on :code:`class-staging.datahub.berkeley.edu`. -#. Changes are only deployed to datahub once the relevant Travis CI job is completed. See ``__ to view Travis CI job statuses. - -Tips for Upgrading Package -========================== -* Conda can take an extremely long time to resolve version dependency conflicts, if they are resolvable at all. When upgrading Python versions or a core package that is used by many other packages, such as `requests`, clean out or upgrade old packages to minimize the number of dependency conflicts. 
diff --git a/docs/admins/howto/preview-local.qmd b/docs/admins/howto/preview-local.qmd new file mode 100644 index 000000000..0ed0a3a68 --- /dev/null +++ b/docs/admins/howto/preview-local.qmd @@ -0,0 +1,18 @@ +--- +title: Develop Documentation +--- + +## Live Preview + +Navigate to the `docs` directory and run `quarto preview`. You can view the +documentation in a browser while you make changes. + +## Render Static HTML + +Navigate to the `docs` directory and run `quarto render`. This will build the +entire website in the `_site` directory. You can then open files in your web +browser. + +You can also render individual files, which saves time if you do not want to +render the whole site. Run `quarto render ./path/to/filename.qmd`, and then open +the corresponding HTML file in the _site directory. diff --git a/docs/admins/howto/preview-local.rst b/docs/admins/howto/preview-local.rst deleted file mode 100644 index ff3d0ff35..000000000 --- a/docs/admins/howto/preview-local.rst +++ /dev/null @@ -1,48 +0,0 @@ -.. _howto/preview-local: - -====================================== -Preview documentation changes locally -====================================== - - -Strategy 1: convert to html (method 1) -====================================== - -#. Create a virtual environment (Recommendation is to create conda environment) -#. Navigate to `datahub/` directory and run - - .. code:: bash - - pip install -r docs/requirements.txt - -#. Navigate to `docs/` directory. Run the following command, - - .. code:: bash - - make html - -#. Navigate to `docs/_build/html` directory and open index.html file in the browser. - -#. Have fun making changes to the documentation based on the HTML preview. - -Strategy 2: convert to html (method 2) -====================================== - - - .. code:: bash - - $ conda install sphinx - $ rst2html.py .rst > .html - -Strategy 3: free online previewer -====================================== - -Use a free, online, in-line converter such as: https://www.tutorialspoint.com/online_restructure_editor.php - -Strategy 4: Visual Studio code extension -======================================== - -The VSCode extension `RSTPreview -`_ provides a live preview of -your reStructuredText file as you edit it within VSCode. You may have to check the extension’s settings -to set a non-conflicting keystroke sequence for opening the preview pane. \ No newline at end of file diff --git a/docs/admins/howto/prometheus-grafana.qmd b/docs/admins/howto/prometheus-grafana.qmd new file mode 100644 index 000000000..50249223d --- /dev/null +++ b/docs/admins/howto/prometheus-grafana.qmd @@ -0,0 +1,35 @@ +--- +title: Prometheus and Grafana +--- + +# Accessing the Prometheus Server + +It can be useful to interact with the cluster's prometheus server while +developing dashboards in grafana. You will need to forward a local port +to the prometheus server's pod. + +## Using the standard port + +Listen on port 9090 locally, forwarding to the prometheus server's port +`9090`. + +``` bash +kubectl -n support port-forward deployment/support-prometheus-server 9090 +``` + +then visit http://localhost:9090. + +## Using an alternative port + +Listen on port 8000 locally, forwarding to the prometheus server's port `9090`. + +``` bash +kubectl -n support port-forward deployment/support-prometheus-server 8000:9090 +``` + +then visit http://localhost:8000. + +# Grafana + +Our Grafana dashboards are at https://grafana.datahub.berkeley.edu. 
+Upstream documentation is at https://jupyterhub-grafana.readthedocs.io/en/latest/index.html. diff --git a/docs/admins/howto/prometheus-grafana.rst deleted file mode 100644 index 65f870bcf..000000000 --- a/docs/admins/howto/prometheus-grafana.rst +++ /dev/null @@ -1,36 +0,0 @@ -.. _howto/prometheus-grafana: - -================ -Prometheus and Grafana -================ - -Accessing the Prometheus Server -=============================== -It can be useful to interact with the cluster's prometheus server while developing dashboards in grafana. You will need to forward a local port to the prometheus server's pod. - -Using the standard port ------------------------ - -Listen on port 9090 locally, forwarding to the prometheus server's port `9090`: - - .. code:: bash - - kubectl -n support port-forward deployment/support-prometheus-server 9090 - -then visit ``__. - -Using an alternative port ------------------------- - -Listen on port 8000 locally, forwarding to the prometheus server's port `9090`: - - .. code:: bash - - kubectl -n support port-forward deployment/support-prometheus-server 8000:9090 - -then visit ``__. - - -Grafana -======= -Our Grafana dashboards are at https://grafana.datahub.berkeley.edu. Upstream documentation is at https://jupyterhub-grafana.readthedocs.io/en/latest/index.html. diff --git a/docs/admins/howto/rebuild-hub-image.qmd new file mode 100644 index 000000000..7c385a0d5 --- /dev/null +++ b/docs/admins/howto/rebuild-hub-image.qmd @@ -0,0 +1,31 @@ +--- +title: "Customize the Hub Docker Image" +--- + +We use a customized JupyterHub docker image so we can install extra packages +such as authenticators. The image is located in `images/hub`. It *must* inherit +from the JupyterHub image used in the [Zero to +JupyterHub](https://z2jh.jupyter.org). + +The image is built with [chartpress](https://github.com/jupyterhub/chartpress), +which also updates `hub/values.yaml` with the new image version. +`chartpress` may be installed locally with `pip install chartpress`. + +1. Run `gcloud auth configure-docker us-central1-docker.pkg.dev` *once + per machine* to set up docker for authentication with the [gcloud + credential + helper](https://cloud.google.com/artifact-registry/docs/docker/authentication). +2. Modify the image in `images/hub` and make a git commit. +3. Run `chartpress --push`. This will build and push the hub image, and + modify `hub/values.yaml` appropriately. +4. Make a commit with the `hub/values.yaml` file, so the new hub image + name and tag are committed. +5. Proceed to deployment as normal. + +Some of the following commands may be required to configure your +environment to run the above chartpress workflow successfully: + + - `gcloud auth login`. + - `gcloud auth configure-docker us-central1-docker.pkg.dev` + - `gcloud auth application-default login` + - `gcloud auth configure-docker` diff --git a/docs/admins/howto/rebuild-hub-image.rst deleted file mode 100644 index 24d134805..000000000 --- a/docs/admins/howto/rebuild-hub-image.rst +++ /dev/null @@ -1,66 +0,0 @@ -.. _howto/rebuild-hub-image: - -============================ -Rebuild a custom hub image -============================ - -We use a customized JupyterHub image so we can use versions of -hub packages (such as authenticators) and install additional -software required by custom config we might have. - -The image is located in ``images/hub``. 
It *must* inherit from -the JupyterHub image used in the `Zero to JupyterHub `_. - -`chartpress `_ is used to -build the image and update ``hub/values.yaml`` with the new image -version. ``chartpress`` may be installed locally with ``pip install chartpress``. - -#. Run ``gcloud auth configure-docker us-central1-docker.pkg.dev`` - *once per machine* to setup docker for authentication with - the `gcloud credential helper `_. - -#. Modify the image in ``images/hub`` and make a git commit. - -#. Run ``chartpress --push``. This will build and push the hub image, - and modify ``hub/values.yaml`` appropriately. - -#. Make a commit with the ``hub/values.yaml`` file, so the new hub image - name and tag are comitted. - -#. Proceed to deployment as normal. - -Some of the following commands may be required to configure your environment to run -the above chartpress workflow successfully: - -* ``gcloud auth login`` -* ``gcloud auth configure-docker us-central1-docker.pkg.dev`` -* ``gcloud auth application-default login`` -* sometimes running ``gcloud auth login`` additional time(s) may fix issues -* ``sudo usermod -a -G docker ${USER}`` -* ``gcloud auth configure-docker`` - -================================= -Rebuild the custom postgres image -================================= - -For data100, we provide a postgresql server per user. We want the -`python extension `_ -installed. So we inherit from the `upstream postgresql docker image -`_, and add the appropriate package. - -This image is in ``images/postgres``. If you update it, you need to -rebuild and push it. - -#. Modify the image in ``images/postgres`` and make a git commit. - -#. Run ``chartpress --push``. This will build and push the image, - *but not put anything in YAML*. There is no place we can put thi - in ``values.yaml``, since this is only used for data100. - -#. Notice the image name + tag from the ``chartpress --push`` command, - and put it in the appropriate place (under ``extraContainers``) in - ``data100/config/common.yaml``. - -#. Make a commit with the new tag in ``data100/config/common.yaml``. - -#. Proceed to deploy as normal. \ No newline at end of file diff --git a/docs/admins/howto/rebuild-postgres-image.qmd new file mode 100644 index 000000000..bd5ec8c65 --- /dev/null +++ b/docs/admins/howto/rebuild-postgres-image.qmd @@ -0,0 +1,22 @@ +--- +title: "Customize the Per-User Postgres Docker Image" +--- + +We provide each student on `data100` with a postgresql server. We want the +[python extension](https://www.postgresql.org/docs/current/plpython.html) +installed. So we inherit from the [upstream postgresql docker +image](https://hub.docker.com/_/postgres), and add the appropriate +package. + +This image is in `images/postgres`. If you update it, you need to +rebuild and push it. + +1. Modify the image in `images/postgres` and make a git commit. +2. Run `chartpress --push`. This will build and push the image, *but + not put anything in YAML*. There is no place we can put this in + `values.yaml`, since this is only used for data100. +3. Notice the image name + tag from the `chartpress --push` command, + and put it in the appropriate place (under `extraContainers`) in + `data100/config/common.yaml`. +4. Make a commit with the new tag in `data100/config/common.yaml`. +5. Proceed to deploy as normal. 
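+Put together, the steps above look roughly like the following shell session.
+This is only a sketch: the commit messages are examples, and the path to
+`common.yaml` assumes the usual `deployments/<hub>/config/` layout described in
+the repository structure docs.
+
+``` bash
+# Edit the image, then commit the change
+git add images/postgres
+git commit -m "Update the data100 postgres image"
+
+# Build and push the image; note the image name and tag chartpress prints
+chartpress --push
+
+# Record the new tag under extraContainers for data100, then commit
+git add deployments/data100/config/common.yaml
+git commit -m "data100: bump postgres image tag"
+```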
diff --git a/docs/admins/howto/remove-users-orm.rst b/docs/admins/howto/remove-users-orm.qmd similarity index 59% rename from docs/admins/howto/remove-users-orm.rst rename to docs/admins/howto/remove-users-orm.qmd index ab11565dc..5c797dfe3 100644 --- a/docs/admins/howto/remove-users-orm.rst +++ b/docs/admins/howto/remove-users-orm.qmd @@ -1,28 +1,27 @@ -.. _howto/remove-users-orm: - -============================ -Remove inactive users from hub ORM -============================ +--- +title: Remove inactive users from hub ORM +--- JupyterHub performance sometimes scales with the *total* number of users in its ORM database, rather than the number of running users. Reducing the user count enables the hub to restart much faster. While this issue should be addressed, we can work around it by deleting inactive users from the hub database once in a while. Note that this does not delete the user's storage. - The script `scripts/delete-unused-users.py` will delete anyone who hasn't registered any activity in a given period of time, double checking to make sure they aren't active right now. This will require users to log in again the next time they use the hub, but that is probably fine. - This should be done before the start of each semester, particularly on hubs -with a lot of users. +with a lot of users. -Run the script -============== +## Run the script -You can run the script on your own device. The script depends on the `jhub_client` python library. This can be installed with `pip install jhub_client`. +You can run the script on your own device. The script depends on the +`jhub_client` python library. This can be installed with +`pip install jhub_client`. -#. You will need to acquire a JupyterHub API token with administrative rights. A hub admin can go to {hub_url}/hub/token to create a new one. -#. Set the environment variable `JUPYTERHUB_API_TOKEN` to the token. -#. Run `python scripts/delete-unused-users.py --hub_url {hub_url}` +1. You will need to acquire a JupyterHub API token with administrative + rights. A hub admin can go to `{hub_url}/hub/token` to create a new + one. +2. Set the environment variable `JUPYTERHUB_API_TOKEN` to the token. +3. Run `python scripts/delete-unused-users.py --hub_url {hub_url}` diff --git a/docs/admins/incidents/index.rst b/docs/admins/incidents/index.rst deleted file mode 100644 index d54b97640..000000000 --- a/docs/admins/incidents/index.rst +++ /dev/null @@ -1,28 +0,0 @@ -================ -Incident reports -================ - -Blameless incident reports are very important for long term sustainability -of resilient infrastructure. We publish them here for transparency, and -so we may learn from them for future incidents. - -.. 
toctree:: - :maxdepth: 1 - - 2017-02-09-datahub-db-outage - 2017-02-24-autoscaler-incident - 2017-02-24-proxy-death-incident - 2017-03-06-helm-config-image-mismatch - 2017-03-20-too-many-volumes - 2017-03-23-kernel-deaths-incident - 2017-04-03-cluster-full-incident - 2017-05-09-gce-billing - 2017-10-10-hung-nodes - 2017-10-19-course-subscription-canceled - 2018-01-25-helm-chart-upgrade - 2018-01-26-hub-slow-startup - 2018-02-06-hub-db-dir - 2018-02-28-hung-node - 2018-06-11-course-subscription-canceled - 2019-02-25-k8s-api-server-down - 2019-05-01-service-account-leak \ No newline at end of file diff --git a/docs/admins/index.rst b/docs/admins/index.qmd similarity index 100% rename from docs/admins/index.rst rename to docs/admins/index.qmd diff --git a/docs/admins/pre-reqs.qmd b/docs/admins/pre-reqs.qmd new file mode 100644 index 000000000..2a7822f2d --- /dev/null +++ b/docs/admins/pre-reqs.qmd @@ -0,0 +1,48 @@ +--- +title: Pre-requisites +--- + +Smoothly working with the JupyterHubs maintained in this repository has +a number of pre-requisite skills you must possess. The rest of the +documentation assumes you have at least a basic level of these skills, +and know how to get help related to these technologies when necessary. + +## Basic + +These skills let you interact with the repository in a basic manner. +This lets you do most \'self-service\' tasks - such as adding admin +users, libraries, making changes to resource allocation, etc. This +doesn\'t give you any skills to debug things when they break, however. + +1. Basic [git](https://git-scm.com/) & [GitHub](https://github.com) + skills. + + The [Git Book](https://git-scm.com/book/en/v2) & [GitHub + Help](https://help.github.com/) are good resources for this. + +2. Familiarity with [YAML](https://en.wikipedia.org/wiki/YAML) syntax. + +3. Understanding of how packages are installed in the languages we + support. + +4. Rights to merge changes into this repository on GitHub. + +## Full + +In addition to the basic skills, you\'ll need the following skills to +\'fully\' work with this repository. Primarily, you need this to **debug +issues when things break** -since we strive to never have things break +in the same way more than twice. + +1. Knowledge of our tech stack: + 1. [Kubernetes](https://kubernetes.io/) + 2. [Google Cloud](https://cloud.google.com) + 3. [Helm](https://helm.sh) + 4. [Docker](http://docker.com/) + 5. [repo2docker](https://repo2docker.readthedocs.io) + 6. [Jupyter](https://jupyter.org/) + 7. Languages we support: [Python](python.org) & + [R](https://www.r-project.org/) +2. Understanding of our JupyterHub distribution, [Zero to + JupyterHub](http://z2jh.jupyter.org). +3. Full access to the various cloud providers we use. diff --git a/docs/admins/pre-reqs.rst b/docs/admins/pre-reqs.rst deleted file mode 100644 index ca21ba3aa..000000000 --- a/docs/admins/pre-reqs.rst +++ /dev/null @@ -1,50 +0,0 @@ -.. _pre-reqs: - -============== -Pre-requisites -============== - -Smoothly working with the JupyterHubs maintained in this repository has a number -of pre-requisite skills you must possess. The rest of the documentation assumes -you have at least a basic level of these skills, and know how to get help related -to these technologies when necessary. - -Basic -===== - -These skills let you interact with the repository in a basic manner. This lets you -do most 'self-service' tasks - such as adding admin users, libraries, making changes -to resource allocation, etc. 
This doesn't give you any skills to debug things when -they break, however. - -#. Basic `git `_ & `GitHub `_ skills. - - The `Git Book `_ & `GitHub Help `_ - are good resources for this. - -#. Familiarity with `YAML `_ syntax. - -#. Understanding of how packages are installed in the languages we support. - -#. Rights to merge changes into this repository on GitHub. - -Full -==== - -In addition to the basic skills, you'll need the following skills to 'fully' work -with this repository. Primarily, you need this to **debug issues when things break** - -since we strive to never have things break in the same way more than twice. - -#. Knowledge of our tech stack: - - #. `Kubernetes `_ - #. `Google Cloud `_ - #. `Helm `_ - #. `Docker `_ - #. `repo2docker `_ - #. `Jupyter `_ - #. Languages we support: `Python `_ & `R `_ - -#. Understanding of our JupyterHub distribution, `Zero to JupyterHub `_. - -#. Full access to the various cloud providers we use. \ No newline at end of file diff --git a/docs/admins/storage.qmd b/docs/admins/storage.qmd new file mode 100644 index 000000000..6063706ee --- /dev/null +++ b/docs/admins/storage.qmd @@ -0,0 +1,85 @@ +--- +title: User home directory storage +--- + +All users on all the hubs get a home directory with persistent storage. + +## Why NFS? + +NFS isn\'t a particularly cloud-native technology. It isn\'t highly +available nor fault tolerant by default, and is a single point of +failure. However, it is currently the best of the alternatives available +for user home directories, and so we use it. + +1. Home directories need to be fully POSIX compliant file systems that + work with minimal edge cases, since this is what most instructional + code assumes. This rules out object-store backed filesystems such as + [s3fs](https://github.com/s3fs-fuse/s3fs-fuse). + +2. Users don\'t usually need guaranteed space or IOPS, so providing + them each a [persistent cloud + disk](https://cloud.google.com/persistent-disk/) gets unnecessarily + expensive - since we are paying for it whether it is used or not. + + When we did use one persistent disk per user, the storage cost + dwarfed everything else by an order of magnitude for no apparent + benefit. + + Attaching cloud disks to user pods also takes on average about 30s + on Google Cloud, and much longer on Azure. NFS mounts pretty + quickly, getting this down to a second or less. + +## NFS Server + +We currently have two approaches to running NFS Servers. + +1. Run a hand-maintained NFS Server with + [ZFS](https://en.wikipedia.org/wiki/ZFS) SSD disks. + + This gives us control over performance, size and most importantly, + server options. We use `anonuid=1000`, so all reads / writes from + the cluster are treated as if they have uid `1000`, which is the uid + all user processes run as. This prevents us from having to muck + about permissions & chowns - particularly since Kubernetes creates + new directories on volumes as root with strict permissions (see + [issue](https://github.com/kubernetes/kubernetes/issues/2630)). + +2. Use a hosted NFS service like [Google Cloud + Filestore](https://cloud.google.com/filestore/). + + We do not have to perform any maintenance if we use this - but we + have no control over the host machine either. + +After running our own NFS server from 2020 through the end of 2022, we +decided to move wholesale to [Google Cloud +Filestore](https://cloud.google.com/filestore/). This was mostly due to +NFS daemon stability issues, which caused many outages and impacted +thousands of our users and courses. 
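+When storage problems are reported, it can help to first confirm that the
+hub's Filestore instance is up and exporting its share. A minimal sketch (the
+IP address is a placeholder; the project follows the conventions used
+elsewhere in these docs):
+
+``` bash
+# List Filestore instances along with their zones and IP addresses
+gcloud filestore instances list --project ucb-datahub-2018
+
+# From a VM or node with NFS utilities installed, check that the share is exported
+showmount -e <filestore-ip>
+```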
+ +Currently each hub has it\'s own filestore instance, except for a few +small courses that share one. This has proven to be much more stable and +able to handle the load. + +## Home directory paths + +Each user on each hub gets their own directory on the server that gets +treated as their home directory. The staging & prod servers share home +directory paths, so users get the same home directories on both. + +For most hubs, the user\'s home directory path relative to the exported +filestore share is +`-filestore///home/`. + +## NFS Client + +We currently have two approaches for mounting the user\'s home directory +into each user\'s pod. + +1. Mount the NFS Share once per node to a well known location, and use + [hostpath](https://kubernetes.io/docs/concepts/storage/volumes/#hostpath) + volumes with a + [subpath](https://kubernetes.io/docs/concepts/storage/volumes/#using-subpath) + on the user pod to mount the correct directory on the user pod. + + This lets us get away with one NFS mount per node, rather than one + per pod. diff --git a/docs/admins/storage.rst b/docs/admins/storage.rst deleted file mode 100644 index 582757e5e..000000000 --- a/docs/admins/storage.rst +++ /dev/null @@ -1,84 +0,0 @@ -.. _topic/storage: - -=========================== -User home directory storage -=========================== - -All users on all the hubs get a home directory with persistent storage. - -Why NFS? -======== - -NFS isn't a particularly cloud-native technology. It isn't highly available -nor fault tolerant by default, and is a single point of failure. However, -it is currently the best of the alternatives available for user home directories, -and so we use it. - -#. Home directories need to be fully POSIX compliant file systems that work - with minimal edge cases, since this is what most instructional code assumes. - This rules out object-store backed filesystems such as `s3fs `_. - -#. Users don't usually need guaranteed space or IOPS, so providing them each - a `persistent cloud disk `_ gets - unnecessarily expensive - since we are paying for it whether it is used or - not. - - When we did use one persistent disk per user, the storage cost - dwarfed everything else by an order of magnitude for no apparent benefit. - - Attaching cloud disks to user pods also takes on average about 30s on - Google Cloud, and much longer on Azure. NFS mounts pretty quickly, getting - this down to a second or less. - - -NFS Server -========== - -We currently have two approaches to running NFS Servers. - -#. Run a hand-maintained NFS Server with `ZFS `_ - SSD disks. - - This gives us control over performance, size and most importantly, server options. - We use ``anonuid=1000``, so all reads / writes from the cluster are treated as if - they have uid ``1000``, which is the uid all user processes run as. This prevents - us from having to muck about permissions & chowns - particularly since Kubernetes - creates new directories on volumes as root with strict permissions (see - `issue `_). - -#. Use a hosted NFS service like `Google Cloud Filestore `_. - - We do not have to perform any maintenance if we use this - but we have no control - over the host machine either. - -After running our own NFS server from 2020 through the end of 2022, we decided to move -wholesale to `Google Cloud Filestore `_. This was -mostly due to NFS daemon stability issues, which caused many outages and impacted thousands -of our users and courses. 
- -Currently each hub has it's own filestore instance, except for a few small courses that -share one. This has proven to be much more stable and able to handle the load. - -Home directory paths -==================== - -Each user on each hub gets their own directory on the server that gets treated -as their home directory. The staging & prod servers share home directory paths, so -users get the same home directories on both. - -For most hubs, the user's home directory path relative to the exported filestore share -is ``-filestore///home/``. - -NFS Client -========== - -We currently have two approaches for mounting the user's home directory -into each user's pod. - -#. Mount the NFS Share once per node to a well known location, and use - `hostpath `_ - volumes with a `subpath `_ - on the user pod to mount the correct directory on the user pod. - - This lets us get away with one NFS mount per node, rather than one per - pod. diff --git a/docs/admins/structure.qmd b/docs/admins/structure.qmd new file mode 100644 index 000000000..3e3a9cafb --- /dev/null +++ b/docs/admins/structure.qmd @@ -0,0 +1,74 @@ +--- +title: Repository Structure +--- + +## Hub Configuration + +Each hub has a directory under `deployments/` where all configuration +for that particular hub is stored in a standard format. For example, all +the configuration for the primary hub used on campus (*datahub*) is +stored under `deployments/datahub/`. + +### User Image (`image/`) + +The contents of the `image/` directory determine the environment +provided to the user. For example, it controls: + +1. Versions of Python / R / Julia available +2. Libraries installed, and which versions of those are installed +3. Specific config for Jupyter Notebook or IPython + +[repo2docker](https://repo2docker.readthedocs.io/en/latest/) is used to +build the actual user image, so you can use any of the [supported config +files](https://repo2docker.readthedocs.io/en/latest/config_files.html) +to customize the image as you wish. + +### Hub Config (`config/` and `secrets/`) + +All our JupyterHubs are based on [Zero to JupyterHub +(z2jh)](http://z2jh.jupyter.org/). z2jh uses configuration files in +[YAML](https://en.wikipedia.org/wiki/YAML) format to specify exactly how +the hub is configured. For example, it controls: + +1. RAM available per user +2. Admin user lists +3. User storage information +4. Per-class & Per-user RAM overrides (when classes or individuals need + more RAM) +5. Authentication secret keys + +These files are split between files that are visible to everyone +(`config/`) and files that are visible only to a select few illuminati +(`secrets/`). To get access to the secret files, please consult the +illuminati. + +Files are further split into: + +1. `common.yaml` - Configuration common to staging and production + instances of this hub. Most config should be here. +2. `staging.yaml` - Configuration specific to the staging instance of + the hub. +3. `prod.yaml` - Configuration specific to the production instance of + the hub. + +### `hubploy.yaml` + +We use [hubploy](https://github.com/yuvipanda/hubploy) to deploy our +hubs in a repeatable fashion. `hubploy.yaml` contains information +required for hubploy to work - such as cluster name, region, provider, +etc. + +Various secret keys used to authenticate to cloud providers are kept +under `secrets/` and referred to from `hubploy.yaml`. + +## Documentation + +Documentation is under the `docs/` folder, and is generated with the +[sphinx](http://www.sphinx-doc.org/) project. 
It is written with the +[reStructuredText +(rst)](http://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html) +format. Documentation is automatically published to + and +. This is performed via a +[webhook](https://github.com/berkeley-dsep-infra/datahub/settings/hooks) +in the github repo. diff --git a/docs/admins/structure.rst b/docs/admins/structure.rst deleted file mode 100644 index 3f4191702..000000000 --- a/docs/admins/structure.rst +++ /dev/null @@ -1,72 +0,0 @@ -.. _structure: - -==================== -Repository Structure -==================== - -Hub Configuration -================= - -Each hub has a directory under ``deployments/`` where all configuration -for that particular hub is stored in a standard format. For example, all -the configuration for the primary hub used on campus (*datahub*) is stored -under ``deployments/datahub/``. - -User Image (``image/``) ------------------------ - -The contents of the ``image/`` directory determine the environment provided -to the user. For example, it controls: - -#. Versions of Python / R / Julia available -#. Libraries installed, and which versions of those are installed -#. Specific config for Jupyter Notebook or IPython - -`repo2docker `_ is used to -build the actual user image, so you can use any of the `supported config files -`_ to customize -the image as you wish. - -.. _structure/config: - -Hub Config (``config/`` and ``secrets/``) ------------------------------------------ - -All our JupyterHubs are based on `Zero to JupyterHub (z2jh) `_. -z2jh uses configuration files in `YAML `_ format -to specify exactly how the hub is configured. For example, it controls: - -#. RAM available per user -#. Admin user lists -#. User storage information -#. Per-class & Per-user RAM overrides (when classes or individuals need more RAM) -#. Authentication secret keys - -These files are split between files that are visible to everyone (``config/``) and -files that are visible only to a select few illuminati (``secrets/``). To get access -to the secret files, please consult the illuminati. - -Files are further split into: - -#. ``common.yaml`` - Configuration common to staging and production instances of this - hub. Most config should be here. -#. ``staging.yaml`` - Configuration specific to the staging instance of the hub. -#. ``prod.yaml`` - Configuration specific to the production instance of the hub. - -``hubploy.yaml`` ----------------- - -We use `hubploy `_ to deploy our hubs in a -repeatable fashion. ``hubploy.yaml`` contains information required for hubploy to -work - such as cluster name, region, provider, etc. - -Various secret keys used to authenticate to cloud providers are kept under ``secrets/`` -and referred to from ``hubploy.yaml``. - -Documentation -============= - -Documentation is under the ``docs/`` folder, and is generated with the `sphinx -`_ project. It is written with the `reStructuredText (rst) -`_ -format. Documentation is automatically published to https://uc-berkeley-jupyterhubs.readthedocs.io/ and https://docs.datahub.berkeley.edu/. This is performed via a `webhook `_ in the github repo. diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index a0b74cf28..000000000 --- a/docs/conf.py +++ /dev/null @@ -1,183 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Configuration file for the Sphinx documentation builder. -# -# This file does only contain a selection of the most common options. 
For a -# full list see the documentation: -# http://www.sphinx-doc.org/en/master/config - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) - - -# -- Project information ----------------------------------------------------- - -project = 'UC Berkeley JupyterHubs' -copyright = '2019, Division of Data Sciences Technical Staff' -author = 'Division of Data Sciences Technical Staff' - -# The short X.Y version -version = '' -# The full version, including alpha/beta/rc tags -release = '' - - -# -- General configuration --------------------------------------------------- - -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - 'sphinx.ext.intersphinx', - 'myst_parser' -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = '.rst' - -# The master toctree document. -master_doc = 'index' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = 'pydata_sphinx_theme' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -# html_theme_options = {} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] - -# Custom sidebar templates, must be a dictionary that maps document names -# to template names. -# -# The default sidebars (for documents that don't match any pattern) are -# defined by theme itself. Builtin themes are using these templates by -# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', -# 'searchbox.html']``. -# -# html_sidebars = {} - - -# -- Options for HTMLHelp output --------------------------------------------- - -# Output file base name for HTML help builder. 
-htmlhelp_basename = 'UCBerkeleyJupyterHubsdoc' - - -# -- Options for LaTeX output ------------------------------------------------ - -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # - # 'papersize': 'letterpaper', - - # The font size ('10pt', '11pt' or '12pt'). - # - # 'pointsize': '10pt', - - # Additional stuff for the LaTeX preamble. - # - # 'preamble': '', - - # Latex figure (float) alignment - # - # 'figure_align': 'htbp', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, 'UCBerkeleyJupyterHubs.tex', 'UC Berkeley JupyterHubs Documentation', - 'Division of Data Sciences Technical Staff', 'manual'), -] - - -# -- Options for manual page output ------------------------------------------ - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'ucberkeleyjupyterhubs', 'UC Berkeley JupyterHubs Documentation', - [author], 1) -] - - -# -- Options for Texinfo output ---------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - (master_doc, 'UCBerkeleyJupyterHubs', 'UC Berkeley JupyterHubs Documentation', - author, 'UCBerkeleyJupyterHubs', 'One line description of project.', - 'Miscellaneous'), -] - - -# -- Options for Epub output ------------------------------------------------- - -# Bibliographic Dublin Core info. -epub_title = project - -# The unique identifier of the text. This can be a ISBN number -# or the project homepage. -# -# epub_identifier = '' - -# A unique identification for the text. -# -# epub_uid = '' - -# A list of files that should not be packed into the epub file. -epub_exclude_files = ['search.html'] - - -# -- Extension configuration ------------------------------------------------- - -# -- Options for intersphinx extension --------------------------------------- - -# Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = { 'python': ('https://docs.python.org/', None) } diff --git a/docs/datahub.svg b/docs/datahub.svg new file mode 100644 index 000000000..58439789f --- /dev/null +++ b/docs/datahub.svg @@ -0,0 +1,19 @@ + + + DATAHUB + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/admins/incidents/2017-02-09-datahub-db-outage-pvc-recreate-script.ipynb b/docs/incidents/2017-02-09-datahub-db-outage-pvc-recreate-script.ipynb similarity index 100% rename from docs/admins/incidents/2017-02-09-datahub-db-outage-pvc-recreate-script.ipynb rename to docs/incidents/2017-02-09-datahub-db-outage-pvc-recreate-script.ipynb diff --git a/docs/admins/incidents/2017-02-09-datahub-db-outage.md b/docs/incidents/2017-02-09-datahub-db-outage.qmd similarity index 99% rename from docs/admins/incidents/2017-02-09-datahub-db-outage.md rename to docs/incidents/2017-02-09-datahub-db-outage.qmd index bc087430f..3d0853c0e 100644 --- a/docs/admins/incidents/2017-02-09-datahub-db-outage.md +++ b/docs/incidents/2017-02-09-datahub-db-outage.qmd @@ -1,4 +1,8 @@ -# 2017-02-09 - JupyterHub db manual overwrite +--- +title: JupyterHub db manual overwrite +date: 2017-02-09 +--- + ## Summary ## Datahub was reportedly down at 1am. Users attempting to log in to datahub were greeted with a proxy error. 
The hub pod was up but the log was full of sqlite errors. After the hub pod was deleted and a new one came up, students logging in to datahub found their notebooks were missing and their home directories were empty. Once this was fixed, some students still were being logged in as a different particular user. Finally, students with a '.' in their username were still having issues after everyone else was fine. This was all fixed and an all-clear signalled at about 2017-02-09 11:35 AM. diff --git a/docs/admins/incidents/2017-02-24-autoscaler-incident.md b/docs/incidents/2017-02-24-autoscaler-incident.qmd similarity index 97% rename from docs/admins/incidents/2017-02-24-autoscaler-incident.md rename to docs/incidents/2017-02-24-autoscaler-incident.qmd index f36f7ee3e..94ee6383d 100644 --- a/docs/admins/incidents/2017-02-24-autoscaler-incident.md +++ b/docs/incidents/2017-02-24-autoscaler-incident.qmd @@ -1,4 +1,7 @@ -# 2017-02-24 - Custom Autoscaler gonee haywire +--- +title: Custom Autoscaler gone haywire +date: 2017-02-24 +--- ## Summary ## On the evening of February 24, 2017, a premature version of the Autoscaler script for the Datahub deployment was mistakenly run on the prod cluster, resulting in a large amount of nodes (roughly 30-40) being set as unschedulable for about 20 minutes. Though no information was lost nor service critically disturbed, it was necessary to manually re-enable these nodes to be scheduled. diff --git a/docs/admins/incidents/2017-02-24-proxy-death-incident.md b/docs/incidents/2017-02-24-proxy-death-incident.qmd similarity index 98% rename from docs/admins/incidents/2017-02-24-proxy-death-incident.md rename to docs/incidents/2017-02-24-proxy-death-incident.qmd index a112f6cf9..3932ccdc1 100644 --- a/docs/admins/incidents/2017-02-24-proxy-death-incident.md +++ b/docs/incidents/2017-02-24-proxy-death-incident.qmd @@ -1,4 +1,7 @@ -# 2017-02-24 - Proxy eviction strands user +--- +title: Proxy eviction strands user +date: 2017-02-24 +--- ## Summary ## diff --git a/docs/admins/incidents/2017-03-06-helm-config-image-mismatch.md b/docs/incidents/2017-03-06-helm-config-image-mismatch.qmd similarity index 98% rename from docs/admins/incidents/2017-03-06-helm-config-image-mismatch.md rename to docs/incidents/2017-03-06-helm-config-image-mismatch.qmd index ef2fad76f..8e315fd6c 100644 --- a/docs/admins/incidents/2017-03-06-helm-config-image-mismatch.md +++ b/docs/incidents/2017-03-06-helm-config-image-mismatch.qmd @@ -1,4 +1,7 @@ -# 2017-03-06 - Non-matching hub image tags cause downtime +--- +title: Non-matching hub image tags cause downtime +date: 2017-03-06 +--- ## Summary ## diff --git a/docs/admins/incidents/2017-03-20-too-many-volumes.md b/docs/incidents/2017-03-20-too-many-volumes.qmd similarity index 97% rename from docs/admins/incidents/2017-03-20-too-many-volumes.md rename to docs/incidents/2017-03-20-too-many-volumes.qmd index 9474c8233..f7cea129c 100644 --- a/docs/admins/incidents/2017-03-20-too-many-volumes.md +++ b/docs/incidents/2017-03-20-too-many-volumes.qmd @@ -1,4 +1,7 @@ -# 2017-03-20 - Too many volumes per disk leave students stuck +--- +title: Too many volumes per disk leave students stuck +date: 2017-03-20 +--- ## Summary ## diff --git a/docs/admins/incidents/2017-03-23-kernel-deaths-incident.md b/docs/incidents/2017-03-23-kernel-deaths-incident.qmd similarity index 98% rename from docs/admins/incidents/2017-03-23-kernel-deaths-incident.md rename to docs/incidents/2017-03-23-kernel-deaths-incident.qmd index ab1541a45..050b65050 100644 ---
a/docs/admins/incidents/2017-03-23-kernel-deaths-incident.md +++ b/docs/incidents/2017-03-23-kernel-deaths-incident.qmd @@ -1,4 +1,7 @@ -# 2017-03-23 - Weird upstream ipython bug kills kernels +--- +title: Weird upstream ipython bug kills kernels +date: 2017-03-23 +--- ## Summary ## diff --git a/docs/admins/incidents/2017-04-03-cluster-full-incident.md b/docs/incidents/2017-04-03-cluster-full-incident.qmd similarity index 96% rename from docs/admins/incidents/2017-04-03-cluster-full-incident.md rename to docs/incidents/2017-04-03-cluster-full-incident.qmd index 1bc4d768d..f9d9e3fb6 100644 --- a/docs/admins/incidents/2017-04-03-cluster-full-incident.md +++ b/docs/incidents/2017-04-03-cluster-full-incident.qmd @@ -1,4 +1,7 @@ -# 2017-04-03 - Custom autoscaler does not scale up when it should +--- +title: Custom autoscaler does not scale up when it should +date: 2017-04-03 +--- ## Summary diff --git a/docs/admins/incidents/2017-05-09-gce-billing.md b/docs/incidents/2017-05-09-gce-billing.qmd similarity index 97% rename from docs/admins/incidents/2017-05-09-gce-billing.md rename to docs/incidents/2017-05-09-gce-billing.qmd index 355f6deb2..5c0e5a05f 100644 --- a/docs/admins/incidents/2017-05-09-gce-billing.md +++ b/docs/incidents/2017-05-09-gce-billing.qmd @@ -1,4 +1,7 @@ -# 2017-05-09 - Oops we forgot to pay the bill +--- +title: Oops we forgot to pay the bill +date: 2017-05-09 +--- ## Summary diff --git a/docs/admins/incidents/2017-10-10-hung-nodes.md b/docs/incidents/2017-10-10-hung-nodes.qmd similarity index 98% rename from docs/admins/incidents/2017-10-10-hung-nodes.md rename to docs/incidents/2017-10-10-hung-nodes.qmd index 0f12402a2..7d7dc183e 100644 --- a/docs/admins/incidents/2017-10-10-hung-nodes.md +++ b/docs/incidents/2017-10-10-hung-nodes.qmd @@ -1,4 +1,7 @@ -# 2017-10-10 - Docker dies on a few Azure nodes +--- +title: Docker dies on a few Azure nodes +date: 2017-10-10 +--- ## Summary diff --git a/docs/admins/incidents/2017-10-19-course-subscription-canceled.md b/docs/incidents/2017-10-19-course-subscription-canceled.qmd similarity index 97% rename from docs/admins/incidents/2017-10-19-course-subscription-canceled.md rename to docs/incidents/2017-10-19-course-subscription-canceled.qmd index 76ff9b275..f2e1f6e2b 100644 --- a/docs/admins/incidents/2017-10-19-course-subscription-canceled.md +++ b/docs/incidents/2017-10-19-course-subscription-canceled.qmd @@ -1,4 +1,7 @@ -# 2017-10-19 - Billing confusion with Azure portal causes summer hub to be lost +--- +title: Billing confusion with Azure portal causes summer hub to be lost +date: 2017-10-19 +--- ## Summary diff --git a/docs/admins/incidents/2018-01-25-helm-chart-upgrade.md b/docs/incidents/2018-01-25-helm-chart-upgrade.qmd similarity index 96% rename from docs/admins/incidents/2018-01-25-helm-chart-upgrade.md rename to docs/incidents/2018-01-25-helm-chart-upgrade.qmd index 4b24931dd..f3407ee56 100644 --- a/docs/admins/incidents/2018-01-25-helm-chart-upgrade.md +++ b/docs/incidents/2018-01-25-helm-chart-upgrade.qmd @@ -1,4 +1,7 @@ -# 2018-01-25 - Accidental merge to prod brings things down +--- +title: Accidental merge to prod brings things down +date: 2018-01-25 +--- ## Summary diff --git a/docs/admins/incidents/2018-01-26-hub-slow-startup.md b/docs/incidents/2018-01-26-hub-slow-startup.qmd similarity index 96% rename from docs/admins/incidents/2018-01-26-hub-slow-startup.md rename to docs/incidents/2018-01-26-hub-slow-startup.qmd index 5a6d8d79d..80f063822 100644 --- a/docs/admins/incidents/2018-01-26-hub-slow-startup.md +++ 
b/docs/incidents/2018-01-26-hub-slow-startup.qmd @@ -1,4 +1,7 @@ -# 2018-01-26 - Hub starts up very slow, causing outage for users +--- +title: Hub starts up very slow, causing outage for users +date: 2018-01-26 +--- ## Summary diff --git a/docs/admins/incidents/2018-02-06-hub-db-dir.md b/docs/incidents/2018-02-06-hub-db-dir.qmd similarity index 94% rename from docs/admins/incidents/2018-02-06-hub-db-dir.md rename to docs/incidents/2018-02-06-hub-db-dir.qmd index 886cb9afd..120112197 100644 --- a/docs/admins/incidents/2018-02-06-hub-db-dir.md +++ b/docs/incidents/2018-02-06-hub-db-dir.qmd @@ -1,4 +1,7 @@ -# 2018-02-06 - Azure PD refuses to detach, causing downtime for data100 +--- +title: Azure PD refuses to detach, causing downtime for data100 +date: 2018-02-06 +--- ## Summary diff --git a/docs/admins/incidents/2018-02-28-hung-node.md b/docs/incidents/2018-02-28-hung-node.qmd similarity index 94% rename from docs/admins/incidents/2018-02-28-hung-node.md rename to docs/incidents/2018-02-28-hung-node.qmd index 6cd8407e9..c11e5bc7b 100644 --- a/docs/admins/incidents/2018-02-28-hung-node.md +++ b/docs/incidents/2018-02-28-hung-node.qmd @@ -1,4 +1,8 @@ -# 2018-02-28 - A node hangs, causing a subset of users to report issues +--- +title: A node hangs, causing a subset of users to report issues +date: 2018-02-28 +--- + ## Summary On February 28, 2018, a handful of users reported on piazza that there servers wouldn't start. It was determined that all problematic servers were running on the same node. After the node was cordoned and rebooted, the student servers were able to start properly. diff --git a/docs/admins/incidents/2018-06-11-course-subscription-canceled.md b/docs/incidents/2018-06-11-course-subscription-canceled.qmd similarity index 97% rename from docs/admins/incidents/2018-06-11-course-subscription-canceled.md rename to docs/incidents/2018-06-11-course-subscription-canceled.qmd index 4045429d1..1af98312d 100644 --- a/docs/admins/incidents/2018-06-11-course-subscription-canceled.md +++ b/docs/incidents/2018-06-11-course-subscription-canceled.qmd @@ -1,4 +1,7 @@ -# 2018-06-11 - Azure billing issue causes downtime +--- +title: Azure billing issue causes downtime +date: 2018-06-11 +--- ## Summary diff --git a/docs/admins/incidents/2019-02-25-k8s-api-server-down.md b/docs/incidents/2019-02-25-k8s-api-server-down.qmd similarity index 97% rename from docs/admins/incidents/2019-02-25-k8s-api-server-down.md rename to docs/incidents/2019-02-25-k8s-api-server-down.qmd index 09c0b1392..d40828b69 100644 --- a/docs/admins/incidents/2019-02-25-k8s-api-server-down.md +++ b/docs/incidents/2019-02-25-k8s-api-server-down.qmd @@ -1,4 +1,7 @@ -# 2019-02-25 - Azure Kubernetes API Server outage causes downtime +--- +title: Azure Kubernetes API Server outage causes downtime +date: 2019-02-25 +--- ## Summary diff --git a/docs/admins/incidents/2019-05-01-service-account-leak.md b/docs/incidents/2019-05-01-service-account-leak.qmd similarity index 95% rename from docs/admins/incidents/2019-05-01-service-account-leak.md rename to docs/incidents/2019-05-01-service-account-leak.qmd index fb8f6e49f..d66bd7903 100644 --- a/docs/admins/incidents/2019-05-01-service-account-leak.md +++ b/docs/incidents/2019-05-01-service-account-leak.qmd @@ -1,4 +1,7 @@ -# 2019-05-01 - Service Account key leak incident +--- +title: Service Account key leak incident +date: 2019-05-01 +--- ## Summary diff --git a/docs/admins/incidents/2022-01-20-package-dependency-upgrade-incident.md 
b/docs/incidents/2022-01-20-package-dependency-upgrade-incident.qmd similarity index 90% rename from docs/admins/incidents/2022-01-20-package-dependency-upgrade-incident.md rename to docs/incidents/2022-01-20-package-dependency-upgrade-incident.qmd index bbe273ebd..5b3e0a525 100644 --- a/docs/admins/incidents/2022-01-20-package-dependency-upgrade-incident.md +++ b/docs/incidents/2022-01-20-package-dependency-upgrade-incident.qmd @@ -1,16 +1,13 @@ --- -name: "\U0001F4DD Hub Incident" -about: "Report an incident on our running hub infrastructure." -title: "[Incident] {{ Hubs throwing 505 errors }}" -labels: ["type: Hub Incident", "support"] -assignees: "" +title: Hubs throwing 505 errors +date: 2022-01-20 --- -# Summary +## Summary [PR 1](https://github.com/berkeley-dsep-infra/datahub/pull/3161) and [PR 2](https://github.com/berkeley-dsep-infra/datahub/pull/3164/commits/a3fc71d5a68b030cda91029b5dbb6c01c0eec8fe) were merged to prod between 2 AM and 2.30 AM PST on 1/20. Difference due to the commits can be viewed [here](https://github.com/berkeley-dsep-infra/datahub/pull/3151/files#diff-72ab2727eb8dffad68933fd8e624ef3126cc0a107685c3f0e16fcee62fc77c76) -Due to these changes, image rebuild happened which broke multiple hubs which used that image including Datahub, ISchool, R, Data 100 and Data 140 hubs. +Due to these changes, an image rebuild happened, which broke multiple hubs that used that image, including Datahub, ISchool, R, Data 100 and Data 140 hubs. One of the dependenices highlighted as part of the image build had an upgrade which resulted in R hub throwing 505 error and Data 100/140 hub throwing "Error starting Kernel". [Yuvi to fill in the right technical information] @@ -24,14 +21,14 @@ Quick summary of the problem. Update this section as we learn more, answering: - what went wrong and how we fixed it. --> -- R Hub was not accessible for about 6 hours. Issue affected 10+ Stat 20 GSIs planning for their first class of the semester (catering to the needs of 600+ students). Hub went down for few minutes during the instruction. +- R Hub was not accessible for about 6 hours. The issue affected 10+ Stat 20 GSIs planning for their first class of the semester (catering to the needs of 600+ students). The hub went down for a few minutes during instruction. - Prob 140 hub was not available till 12.15 AM PST - Data 100 hub was not available till 12.33 AM. Thankfully, assignments were not due till friday (1/21) - Few users in Ischool were affected as they could not access R Studio ## Hub information -- Hub URL: {{(https://r.datahub.berkeley.edu/)}} & most other hubs highlighted above +- Hub URL: https://r.datahub.berkeley.edu/ and most other hubs highlighted above ## Timeline (if relevant) @@ -40,7 +37,7 @@ Quick summary of the problem. Update this section as we learn more, answering: ### {{ 06:10 }} -Andrew Bray (Stat 20 instructor) raised a [github issue](https://github.com/berkeley-dsep-infra/datahub/issues/3166) around 5.45 AM PST. +Andrew Bray (Stat 20 instructor) raised a [github issue](https://github.com/berkeley-dsep-infra/datahub/issues/3166) around 5.45 AM PST. ### {{ 07:45 }} @@ -77,13 +74,13 @@ They should focus on the knowledge we've gained and any improvements we should t Things that could have gone better. Ideally these should result in concrete action items that have GitHub issues created for them and linked to under -Action items. +Action items. ## Where we got lucky These are good things that happened to us but not because we had planned for them.
-- Yuvi was awake at the time when issue was reported and was able to fix the issues immediately. +- Yuvi was awake at the time when issue was reported and was able to fix the issues immediately. - Classes using hubs were not completely affected due to this outage (Data 100 did not have assignments due till 1/21 and Stat 20 had few mins of outage during instruction) ## Action items @@ -94,7 +91,7 @@ These are only sample subheadings. Every action item should have a GitHub issue ### Process/Policy improvements 1. {{[Develop manual testing process](https://github.com/berkeley-dsep-infra/datahub/issues/2953) whenever a PR gets merged to staging of the major hubs (till automated test suites are written)}} [link to github issue](https://github.com/berkeley-dsep-infra/datahub/issues/2953)] -2. Develop a policy around when to create a new hub and what type of changes get deployed to Datahub! +2. Develop a policy around when to create a new hub and what type of changes get deployed to Datahub! ### Documentation improvements @@ -108,9 +105,9 @@ These are only sample subheadings. Every action item should have a GitHub issue 3. {{ Investigate the reason why pager duty did not throw an alert for 5xx errors when the hubs went down. Fix the alerting mechanism so that they notify all kind of errors }} [link to github issue] 4. {{ Adding R Studio as part of Repo2Docker}} [link to github issue] -# Actions +## Actions - [ ] Incident has been dealt with or is over - [ ] Sections above are filled out - [ ] Incident title and after-action report is cleaned up -- [ ] All actionable items above have linked GitHub Issues \ No newline at end of file +- [ ] All actionable items above have linked GitHub Issues diff --git a/docs/admins/incidents/2024-core-node-incidents.md b/docs/incidents/2024-core-node-incidents.qmd similarity index 93% rename from docs/admins/incidents/2024-core-node-incidents.md rename to docs/incidents/2024-core-node-incidents.qmd index a1ae3b170..d2febd165 100644 --- a/docs/admins/incidents/2024-core-node-incidents.md +++ b/docs/incidents/2024-core-node-incidents.qmd @@ -1,12 +1,9 @@ --- -name: "Core node incidents (Feb-Mar 2024)" -about: "Our core node pool and the proxies located there had a series of issues." -title: "[Incident] Core nodes being autoscaled, configurable HTTP proxy crashes" -labels: ["type: Hub Incident", "support"] -assignees: "@shaneknapp" +title: Core nodes being autoscaled, configurable HTTP proxy crashes +date: 2024-02-01 --- -# Summary +## Summary Over the past couple of years, all of our production hubs have been having persistent issues with our core nodes having major load spikes during 'peak' usage and the impacted node (which hosts all of our hub and proxy pods -- not user pods) crashing. This would then impact every hub, causing all users to see 503 http errors until a new node finishing spinning up. We also suspect that the 'white screen' issue some users see after logging in is related to this. @@ -18,7 +15,7 @@ We have spent much time working to debug and track this, including with our frie After some back and forth w/the upstream maintainers, we received a [forked version](https://github.com/berkeley-dsep-infra/datahub/pull/5501) of the proxy to test. -During this testing, we triggered some user-facing downtime, as well as the proxy itself crashing and causing small outages. +During this testing, we triggered some user-facing downtime, as well as the proxy itself crashing and causing small outages. 
Another (unrelated) issue that impacted users was that [GKE](https://cloud.google.com/kubernetes-engine) was autoscaling our core pool (where the hub and proxy pods run) node to zero. Since it takes about 10-15m for a new node to spin up, all hubs were inaccessible until the new node was deployed. @@ -62,7 +59,7 @@ proxy ram 800Mi (steady) spike on proxy — cpu 181%, mem 1.06Gi --> 1.86Gi ### 16:05:53 -chp healthz readiness probe failure +chp healthz readiness probe failure ### 16:05:56 chp/javascript runs out of heap “Ineffective mark-compass near heap limit Allocation Failed” @@ -98,7 +95,7 @@ chp restarts (no heap error) 5.7K 503 errors ### 16:54:15 - 17:15:31 -300 users (slowly descreasing), 3x chp “uncaught exception: write EPIPE”, intermittent 503 errors in spikes of 30, 60, 150, hub latency 2.5sec +300 users (slowly descreasing), 3x chp “uncaught exception: write EPIPE”, intermittent 503 errors in spikes of 30, 60, 150, hub latency 2.5sec ### 18:47:19 - 18:58:10 ~120 users (constant), 3x chp “uncaught exception: write EPIPE”, intermittent 503 errors in spikes of 30, 60, hub latency 3sec @@ -136,7 +133,7 @@ None. 2. Deploy a new core pool with the same RAM and more CPU (Jira DH-259). 3. Spin up a dev hub and figure out how to use [hubtraf](https://github.com/yuvipanda/hubtraf) to simulate a large number of users doing work. -# Actions +## Actions - [x] Incident has been dealt with or is over - [x] Sections above are filled out diff --git a/docs/incidents/index.qmd b/docs/incidents/index.qmd new file mode 100644 index 000000000..ceee28315 --- /dev/null +++ b/docs/incidents/index.qmd @@ -0,0 +1,27 @@ +--- +title: Incident Reports +listing: + type: table + fields: [date, title] + sort: "date" + contents: + - 2017-02-09-datahub-db-outage.qmd + - 2017-02-24-autoscaler-incident.qmd + - 2017-02-24-proxy-death-incident.qmd + - 2017-03-06-helm-config-image-mismatch.qmd + - 2017-03-20-too-many-volumes.qmd + - 2017-03-23-kernel-deaths-incident.qmd + - 2017-04-03-cluster-full-incident.qmd + - 2017-05-09-gce-billing.qmd + - 2017-10-10-hung-nodes.qmd + - 2017-10-19-course-subscription-canceled.qmd + - 2018-01-25-helm-chart-upgrade.qmd + - 2018-01-26-hub-slow-startup.qmd + - 2018-02-06-hub-db-dir.qmd + - 2018-02-28-hung-node.qmd + - 2018-06-11-course-subscription-canceled.qmd + - 2019-02-25-k8s-api-server-down.qmd + - 2019-05-01-service-account-leak.qmd + - 2022-01-20-package-dependency-upgrade-incident.qmd + - 2024-core-node-incidents.qmd +--- diff --git a/docs/index.qmd b/docs/index.qmd new file mode 100644 index 000000000..d760402f1 --- /dev/null +++ b/docs/index.qmd @@ -0,0 +1,7 @@ +--- +title: UC Berkeley DataHub Documentation +--- + +This repository contains configuration and documentation (including +policies) for the many JupyterHubs used by various organizations in UC +Berkeley. diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index 02bbbd7ec..000000000 --- a/docs/index.rst +++ /dev/null @@ -1,37 +0,0 @@ -========================= -UC Berkeley's JupyterHubs -========================= - -This repository contains configuration and documentation (including policies) for the many -JupyterHubs used by various organizations in UC Berkeley. - -Using DataHub -============= - -.. toctree:: - :titlesonly: - :maxdepth: 2 - - users/index - -Modifying DataHub to fit your needs -=================================== - -Our infrastructure can serve the diverse needs of our students only if it -is built by a diverse array of people. - -.. 
toctree:: - :titlesonly: - :maxdepth: 2 - - admins/index - admins/howto/index - -DataHub Policies -================ - -.. toctree:: - :titlesonly: - :maxdepth: 2 - - policy/index.md diff --git a/docs/make.bat b/docs/make.bat deleted file mode 100644 index 27f573b87..000000000 --- a/docs/make.bat +++ /dev/null @@ -1,35 +0,0 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=. -set BUILDDIR=_build - -if "%1" == "" goto help - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% - -:end -popd diff --git a/docs/policy/create_policy.md b/docs/policy/create_policy.qmd similarity index 100% rename from docs/policy/create_policy.md rename to docs/policy/create_policy.qmd diff --git a/docs/policy/index.md b/docs/policy/index.qmd similarity index 50% rename from docs/policy/index.md rename to docs/policy/index.qmd index 70d97bd4a..bada3f723 100644 --- a/docs/policy/index.md +++ b/docs/policy/index.qmd @@ -1,12 +1,5 @@ -# Datahub Policy Documents +--- +title: Datahub Policy Documents +--- The primary objective of this documentation is to codify varied policies of the infrastructure team to operate the many Jupyterhubs deployed in UC Berkeley. - -```{toctree} -:maxdepth: 2 -:caption: Policies -create_policy.md -policy_create_hubs.md -policy_deploy_mainhubs.md -principles.md -``` \ No newline at end of file diff --git a/docs/policy/policy_create_hubs.md b/docs/policy/policy_create_hubs.qmd similarity index 93% rename from docs/policy/policy_create_hubs.md rename to docs/policy/policy_create_hubs.qmd index 4f74ec10f..4b61f0a73 100644 --- a/docs/policy/policy_create_hubs.md +++ b/docs/policy/policy_create_hubs.qmd @@ -1,5 +1,6 @@ - -# Policy considerations for creating a new hub +--- +title: Policy considerations for creating a new hub +--- We have lots of prior experience creating 10+ new hubs catering to the diverse instructional needs of the campus audience. Our decisions to create a new hub were made with a lot of intuition about solving instructors' immediate needs effectively. The objective of this policy document is to codify these heuristics used while creating a new hub. Our policy should guide our decisions with regard to creating new hubs in the future. @@ -9,4 +10,4 @@ Below are 5 key criteria (listed in the order of importance) to be considered wh - **Testbed**: Testbed for deploying new features which post maturity can be enabled across other major hubs (Eg: Stat 159 hub) - **Large Computation**: Course is computationally intensive requiring a large amount of CPU/Memory because of the nature of the use case (Eg: biology hub) or larger user base with 300+ students (Eg: Data 8 and Data 100 hubs). These courses may require additional compute through calendar-based scheduling. 
- **Admin Access**: Course has undergrad students acting as GSIs while simultaneously requiring admin access (Eg: Data8 hub) -- **Organizational Reasons**: Hub is created for organizational/strategic reasons to build institutional buy-in from specific departments (Eg: Public Health and ISchool Hubs) and/or evangelize a specific service (Eg: Julia or R hub) \ No newline at end of file +- **Organizational Reasons**: Hub is created for organizational/strategic reasons to build institutional buy-in from specific departments (Eg: Public Health and ISchool Hubs) and/or evangelize a specific service (Eg: Julia or R hub) diff --git a/docs/policy/policy_deploy_mainhubs.md b/docs/policy/policy_deploy_mainhubs.qmd similarity index 100% rename from docs/policy/policy_deploy_mainhubs.md rename to docs/policy/policy_deploy_mainhubs.qmd diff --git a/docs/policy/principles.md b/docs/policy/principles.qmd similarity index 100% rename from docs/policy/principles.md rename to docs/policy/principles.qmd diff --git a/docs/policy/storage-retention.qmd b/docs/policy/storage-retention.qmd new file mode 100644 index 000000000..0e778db7e --- /dev/null +++ b/docs/policy/storage-retention.qmd @@ -0,0 +1,45 @@ +--- +title: Storage Retention Policy +--- + +## Policy + +### Criteria + +No non-hidden files in the user\'s home directory that have been +modified in the last 6 months. + +### Archival + +1. Zip the whole home directory +2. Upload it to Google drive of a + [SPA](https://calnetweb.berkeley.edu/calnet-departments/special-purpose-accounts-spa) + created for this purpose +3. Share the ZIP file in the Google Drive with the user. + +## Rationale + +Today (6 Feb 2020), we have 18,623 home directories in datahub. Most of +these users used datahub in previous semesters, have not logged in for a +long time, and will probably never log in again. This costs us a lot of +money in disk space - we will have to forever expand disk space. + +By cleaning it up after 6 months of non-usage, we will not affect any +current users - just folks who haven\'t logged in for a long time. +Archiving the contents would make sure people still have access to their +old work, without leaving the burden of maintaining it forever on us. + +## Why Google Drive? + +We can also perform access control easily with Google Drive. + +## Alternatives + +1. Email it to our users. This will most likely be rejected by most + mail servers as the home directory will be too big an attachment +2. Put it in [Google Cloud Nearline + storage](https://cloud.google.com/storage/archival/), build a token + based access control mechanism on top, and email this link to the + users. We will need to probably clean this up every 18 months or so + for cost reasons. This is the viable alternative, if we decide to + not use Google Drive diff --git a/docs/policy/storage-retention.rst b/docs/policy/storage-retention.rst deleted file mode 100644 index d17787727..000000000 --- a/docs/policy/storage-retention.rst +++ /dev/null @@ -1,52 +0,0 @@ -.. _topic/storage-retention: - -======================== -Storage Retention Policy -======================== - -Policy -====== - -Criteria --------- - -No non-hidden files in the user's home directory that have been modified in the last 6 months. - -Archival --------- - -1. Zip the whole home directory -2. Upload it to Google drive of a - `SPA `_ - created for this purpose -3. Share the ZIP file in the Google Drive with the user. - -Rationale -========= - -Today (6 Feb 2020), we have 18,623 home directories in datahub. 
Most of -these users used datahub in previous semesters, have not logged in -for a long time, and will probably never log in again. This costs us -a lot of money in disk space - we will have to forever expand disk space. - -By cleaning it up after 6 months of non-usage, we will not affect any -current users - just folks who haven't logged in for a long time. Archiving -the contents would make sure people still have access to their old work, -without leaving the burden of maintaining it forever on us. - -Why Google Drive? -================= - -We can also perform access control easily with Google Drive. - -Alternatives -============ - -#. Email it to our users. This will most likely be rejected by most - mail servers as the home directory will be too big an attachment - -#. Put it in `Google Cloud Nearline storage `_, - build a token based access control mechanism on top, and email this - link to the users. We will need to probably clean this up every 18 months - or so for cost reasons. This is the viable alternative, if we decide to - not use Google Drive \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 966d4c259..000000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -myst_parser[sphinx] -pydata-sphinx-theme diff --git a/docs/services/google-sheets.rst b/docs/services/google-sheets.rst deleted file mode 100644 index ce01b2377..000000000 --- a/docs/services/google-sheets.rst +++ /dev/null @@ -1,73 +0,0 @@ -.. _services/google-sheets: - -================================== -Reading Google Sheets from DataHub -================================== - -Available in: DataHub - -We provision and make available credentials for a -`service account `_ -that can be used to provide readonly access to Google Sheets. This is useful in -pedagogical situations where data is read from Google Sheets, particularly with -the `gspread `_ library. - -The entire contents of the JSON formatted service account key is available as an -environment variable ``GOOGLE_SHEETS_READONLY_KEY``. You can use this to read -publicly available Google Sheet documents. - -The service account has no implicit permissions, and can be found under -``singleuser.extraEnv.GOOGLE_SHEETS_READONLY_KEY`` in ``datahub/secrets/staging.yaml`` and -``datahub/secrets/prod.yaml``. - -``gspread`` sample code -======================= - -The following sample code reads a sheet from a URL given to it, and prints -the contents. - -.. code:: python - - import gspread - import os - import json - from oauth2client.service_account import ServiceAccountCredentials - - # Authenticate to Google - scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'] - creds = ServiceAccountCredentials.from_json_keyfile_dict(json.loads(os.environ['GOOGLE_SHEETS_READONLY_KEY']), scope) - gc = gspread.authorize(creds) - - # Pick URL of Google Sheet to open - url = 'https://docs.google.com/spreadsheets/d/1SVRsQZWlzw9lV0MT3pWlha_VCVxWovqvu-7cb3feb4k/edit#gid=0' - - # Open the Google Sheet, and print contents of sheet 1 - sheet = gc.open_by_url(url) - print(sheet.sheet1.get_all_records()) - - -``gspread-pandas`` sample code -============================== - -The `gspread-pandas `_ library helps get data from -Google Sheets into a `pandas `_ dataframe. - - -.. 
code:: python - - from gspread_pandas.client import Spread - import os - import json - from oauth2client.service_account import ServiceAccountCredentials - - # Authenticate to Google - scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'] - creds = ServiceAccountCredentials.from_json_keyfile_dict(json.loads(os.environ['GOOGLE_SHEETS_READONLY_KEY']), scope) - - # Pick URL of Google Sheet to open - url = 'https://docs.google.com/spreadsheets/d/1SVRsQZWlzw9lV0MT3pWlha_VCVxWovqvu-7cb3feb4k/edit#gid=0' - - # Open the Google Sheet, and print contents of sheet 1 as a dataframe - spread = Spread(url, creds=creds) - sheet_df = spread.sheet_to_df(sheet='sheet1') - print(sheet_df) diff --git a/docs/users/authentication.md b/docs/users/authentication.qmd similarity index 83% rename from docs/users/authentication.md rename to docs/users/authentication.qmd index 4f8804c50..e24d3aec1 100644 --- a/docs/users/authentication.md +++ b/docs/users/authentication.qmd @@ -1,9 +1,10 @@ -# User Authentication +--- +title: User Authentication +--- UC Berkeley uses a [Canvas](https://www.instructure.com/canvas/) instance, called [bcourses.berkeley.edu](https://bcourses.berkeley.edu). Almost -all our hubs use this for authentication, although not all yet -([issue](https://github.com/berkeley-dsep-infra/datahub/issues/1574))). +all our hubs use this for authentication, although [not all yet](https://github.com/berkeley-dsep-infra/datahub/issues/1574). ## Who has access? @@ -28,4 +29,4 @@ and hence to all the JupyterHubs. If you can log in to [bcourses](https://bcourses.berkeley.edu) but not to any of the JupyterHubs, please contact us. -If you can not log in to bcourses, please [contact bcourses support](https://dls.berkeley.edu/services/bcourses-0) \ No newline at end of file +If you can not log in to bcourses, please [contact bcourses support](https://dls.berkeley.edu/services/bcourses-0) diff --git a/docs/users/features.qmd b/docs/users/features.qmd new file mode 100644 index 000000000..8092a5bfc --- /dev/null +++ b/docs/users/features.qmd @@ -0,0 +1,79 @@ +--- +title: Features +--- + +This page lists the various environments, applications, and tools we offer on +DataHub. Not all those listed here are available on all hubs, but we can easily enable them. + +## Programming Languages + +We support the usual suspects - Python, R, and Julia. However, Jupyter and +other applications can support many more. If you would like to use a +different, open-source programming language, contact us. + +## Applications + +Our diverse user population has diverse needs, so we offer many different +user interfaces for instructors to choose from. + +### JupyterLab + +![Do complex layouts with JupyterLab](images/jupyterlab.png) + +[JupyterLab](https://github.com/jupyterlab/jupyterlab) is a more modern and customizable version of the classic Jupyter notebook from the Jupyter project. +Most of our classes use JupyterLab. + + +### Jupyter Notebook (Classic) + +This familiar interface is used for most of our introductory classes. It is document oriented, no-frills, and well known by a lot of people. + +### RStudio + +![RStudio Screenshot](images/rstudio.png) + +We want to provide first class support for teaching with R, which means +providing strong support for [RStudio](https://rstudio.com). This also includes support for running Shiny applications. 
+ +Try RStudio [on DataHub](https://r.datahub.berkeley.edu) with your berkeley.edu account, or [on Binder](https://mybinder.org/v2/gh/rocker-org/binder/master?urlpath=rstudio) without a berkeley.edu account. + +### Remote Desktop + +![Do image processing with qt](images/desktop.png) + +Sometimes, you just need to use something that requires a full desktop +environment to run. Instead of trying to get students to install things +locally, we offer a full fledged Linux Desktop environment they can +access from inside their browser! This is just a different 'UI' on the +same infrastructure as the notebook environment, so they all use the +same libraries and home directories. + +Try remote desktop [on EECS DataHub](https://eecs.datahub.berkeley.edu/hub/user-redirect/desktop) with your berkeley.edu account, or [on Binder](https://mybinder.org/v2/gh/yuvipanda/jupyter-desktop-server/master?urlpath=desktop) without a berkeley.edu account. + +### Visual Studio Code + +![Compile C with vscode](images/vscode.png) + +Sometimes you *just* want an IDE, not a notebook environment. We are experimenting +with a hosted, web version of the popular Visual Studio Code editor, to +see if it would be useful for teaching more traditional CS classes. + +Try VS Code [on EECS DataHub](https://eecs.datahub.berkeley.edu/hub/user-redirect/vscode/) with your berkeley.edu account, or [on Binder](https://mybinder.org/v2/gh/betatim/vscode-binder/master?urlpath=lab) without a berkeley.edu account. + +### Other Web Applications + +We can make many web based applications work on a hub. Contact us and we'll see what we can do! + +### Postgresql + +Some of our classes require using real databases to teach. We +now experimentally offer a [postgresql](https://www.postgresql.org/) +server for each user on the [data100 hub](https://data100.datahub.berkeley.edu). + +The data does not persist right now, but we can turn that on whenever +needed. + +## More? + +We want to find solution to your interesting problems, so please bring us +your interesting problems. 😁 diff --git a/docs/users/hubs.qmd b/docs/users/hubs.qmd new file mode 100644 index 000000000..e7d558ad1 --- /dev/null +++ b/docs/users/hubs.qmd @@ -0,0 +1,70 @@ +--- +title: JupyterHubs in this repository +--- + +## DataHub + +[datahub.berkeley.edu](https://datahub.berkeley.edu) is the \'main\' +JupyterHub for use on UC Berkeley campus. It\'s the largest and most +active hub. It has many Python & R packages installed. + +It runs on [Google Cloud Platform](https://cloud.google.com) in the +`ucb-datahub-2018` project. You can see all config for it under +`deployments/datahub`. + +### Classes + +- The big [data8](http://data8.org/) class. +- Active [connector + courses](https://data.berkeley.edu/education/connectors) +- [Data Science Modules](https://data.berkeley.edu/education/modules) +- [Astro + 128/256](https://astro.berkeley.edu/course-information/3958209-astronomy-data-science-laboratory) + +This hub is also the \'default\' when folks wanna use a hub for a short +period of time for any reason without super specific requirements. + +## Prob140 Hub + +A hub specifically for [prob140](http://prob140.org/). Some of the admin +users on `hubs/datahub`{.interpreted-text role="ref"} are students in +prob140 - this would allow them to see the work of other prob140 +students. Hence, this hub is separate until JupyterHub gains features +around restricting admin use. + +It runs on [Google Cloud Platform](https://cloud.google.com) in the +`ucb-datahub-2018` project. 
You can see all config for it under +`deployments/prob140`. + +## Data 100 + +This hub is for [Data 100](http://www.ds100.org/) which has a unique +user and grading environment. It runs on [Google Cloud +Platform](https://cloud.google.com) in the `ucb-datahub-2018` account. +You can see all config for it under `deployments/data100`. + +Data100 also has shared folders between staff (professors and GSIs) and +students. Staff, assuming they have been added as admins in +`config/common.yaml`, can see a `shared` and a `shared-readwrite` +folder. Students can only see the `shared` folder, which is read-only. +Anything that gets put in `shared-readwrite` is automatically viewable +in `shared`, but as read-only files. The purpose of this is to be able +to share large data files instead of having one per student. + +## Data 102 + +Data 102 runs on [Google Cloud Platform](https://cloud.google.com) in the +`ucb-datahub-2018` project. You can see all config for it under +`deployments/data102`. + +## Data8X Hub + +A hub for the [data8x course on +EdX](https://www.edx.org/professional-certificate/berkeleyx-foundations-of-data-science). +This hub is open to use by anyone in the world, using [LTI +Authentication](https://github.com/jupyterhub/ltiauthenticator) to +provide login capability from inside EdX. + +It runs on [Google Cloud Platform](https://cloud.google.com) in the +`data8x-scratch` project. You can see all config for it under +`deployments/data8x`. diff --git a/docs/users/hubs.rst b/docs/users/hubs.rst deleted file mode 100644 index 53a2b1279..000000000 --- a/docs/users/hubs.rst +++ /dev/null @@ -1,62 +0,0 @@ -.. _hubs: - -============================== -JupyterHubs in this repository -============================== - -.. _hubs/datahub: - -DataHub -======= - -`datahub.berkeley.edu `_ is the 'main' JupyterHub -for use on UC Berkeley campus. It's the largest and most active hub. It has many -Python & R packages installed. - -It runs on `Google Cloud Platform `_ in the ``ucb-datahub-2018`` -project. You can see :ref:`all config ` for it under ``deployments/datahub``. - -Classes -------- - -* The big `data8 `_ class. -* Active `connector courses `_ -* `Data Science Modules `_ -* `Astro 128/256 `_ - -This hub is also the 'default' when folks wanna use a hub for a short period of time for -any reason without super specific requirements. - -Prob140 Hub -=========== - -A hub specifically for `prob140 `_. Some of the admin users -on :ref:`hubs/datahub` are students in prob140 - this would allow them to see -the work of other prob140 students. Hence, this hub is separate until JupyterHub -gains features around restricting admin use. - -It runs on `Google Cloud Platform `_ in the ``ucb-datahub-2018`` -project. You can see :ref:`all config ` for it under ``deployments/prob140``. - -Data 100 -======== - -This hub is for `Data 100 `_ which has a unique -user and grading environment. It runs on `Google Cloud Platform `_ in the ``ucb-datahub-2018`` account. You can see :ref:`all config ` for it under ``deployments/data100``. - -Data100 also has shared folders between staff (professors and GSIs) and students. Staff, assuming they have been added as admins in ``config/common.yaml``, can see a ``shared`` and a ``shared-readwrite`` folder. Students can only see the ``shared`` folder, which is read-only. Anything that gets put in ``shared-readwrite`` is automatically viewable in ``shared``, but as read-only files. The purpose of this is to be able to share large data files instead of having one per student. 
- -Data 102 -======== - -Data 102 runs on `Google Cloud Platform `_ in the ``ucb-datahub-2018`` project. You can see :ref:`all config ` for it under ``deployments/data102``. - -Data8X Hub -========== - -A hub for the `data8x course on EdX `_. -This hub is open to use by anyone in the world, using `LTI Authentication `_ -to provide login capability from inside EdX. - -It runs on `Google Cloud Platform `_ in the ``data8x-scratch`` -project. You can see :ref:`all config ` for it under ``deployments/data8x``. \ No newline at end of file diff --git a/docs/users/index.qmd b/docs/users/index.qmd new file mode 100644 index 000000000..95bc57bca --- /dev/null +++ b/docs/users/index.qmd @@ -0,0 +1,3 @@ +--- +title: Using DataHub +--- diff --git a/docs/users/index.rst b/docs/users/index.rst deleted file mode 100644 index 8215c1fdd..000000000 --- a/docs/users/index.rst +++ /dev/null @@ -1,10 +0,0 @@ -Using DataHub -============= - -.. toctree:: - :maxdepth: 1 - - services - private-repo - hubs - authentication diff --git a/docs/users/private-repo.md b/docs/users/private-repo.qmd similarity index 98% rename from docs/users/private-repo.md rename to docs/users/private-repo.qmd index 5f41c62cb..66b95354b 100644 --- a/docs/users/private-repo.md +++ b/docs/users/private-repo.qmd @@ -1,4 +1,6 @@ -# Accessing private GitHub repos +--- +title: Accessing private GitHub repos +--- GitHub is used to store class materials (lab notebooks, lecture notebooks, etc), and [nbgitpuller](https://jupyterhub.github.io/nbgitpuller/) is used to distribute it diff --git a/docs/users/services.md b/docs/users/services.md deleted file mode 100644 index 27a469966..000000000 --- a/docs/users/services.md +++ /dev/null @@ -1,96 +0,0 @@ -# Services Offered - -This page lists the various services we offer as part of DataHub. Not all -these will be available on all hubs, but we can easily enable them as -you wish. - -## User Interfaces - -Our diverse user population has diverse needs, so we offer many different -user interfaces for instructors to choose from. - -### Jupyter Notebook (Classic) - -What many people mean when they say 'Jupyter', this familiar interface -is used by default for most of our introductory classes. Document oriented, -no-frills, and well known by a lot of people. - -### RStudio - -![RStudio Screenshot](images/rstudio.png) - -We want to provide first class support for teaching with R, which means -providing strong support for [RStudio](https://rstudio.com). This includes Shiny support. - -Try without berkeley.edu account: [![Launch binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/rocker-org/binder/master?urlpath=rstudio) - -Try with berkeley.edu account: [R DataHub](https://r.datahub.berkeley.edu) - -### JupyterLab - -![Do complex layouts with JupyterLab](images/jupyterlab.png) - -[JupyterLab](https://github.com/jupyterlab/jupyterlab) is a more modern version of the classic Jupyter notebook from -the Jupyter project. It is more customizable and better supports some advanced -use cases. Many of our more advanced classes use this, and we might help -all classes move to this once there is a [simpler document oriented mode available](https://github.com/jupyterlab/jupyterlab/issues/8292) - - -### Linux Desktop (Experimental) - -![Do image processing with qt](images/desktop.png) - -Sometimes, you just need to use something that requires a full desktop -environment to run. 
Instead of trying to get students to install things -locally, we offer a full fledged Linux Desktop environment they can -access from inside their browser! This is just a different 'UI' on the -same infrastructure as the notebook environment, so they all use the -same libraries and home directories. - -Try without Berkeley.edu account: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/yuvipanda/jupyter-desktop-server/master?urlpath=desktop) - -Try with Berkeley.edu account: [EECS DataHub](https://eecs.datahub.berkeley.edu/hub/user-redirect/desktop) - -### Visual Studio Code (Experimental) - -![Compile C with vscode](images/vscode.png) - -Sometimes you *just* want an IDE, not a notebook environment. We are experimenting -with a hosted, web version of the popular Visual Studio Code editor, to -see if it would be useful for teaching more traditional CS classes. - -Try without Berkeley.edu account: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/betatim/vscode-binder/master?urlpath=lab) - -Try with Berkeley.edu account: [EECS DataHub](https://eecs.datahub.berkeley.edu/hub/user-redirect/vscode/) - -### More? - -If you have a web based environment, we can almost certainly make it run under -a hub. Contact us and we'll see what we can do :) - -## Services - -Sometimes you need something custom to get your class going. Very -very interesting things can happen here, so we're always looking -for new services to add. - -### Postgresql - -Some of our classes require using real databases to teach. We -now experimentally offer a [postgresql](https://www.postgresql.org/) -server for each user on the [data100 hub](https://data100.datahub.berkeley.edu). - -The data does not persist right now, but we can turn that on whenever -needed. - -## Programming languages - -We support the usual suspects - Python, R & Julia. However, there are no limits -to what languages we can actually support, so if you are planning on using -a different (open source) programming language, contact us and we'll set you -up. - -## More? - -We want to find solution to your interesting problems, so please bring us -your interesting problems 😁