Merge pull request #5928 from berkeley-dsep-infra/staging
Showing 97 changed files with 2,487 additions and 2,406 deletions.
@@ -0,0 +1,27 @@

``` yaml
on:
  workflow_dispatch:
  push:
    paths:
      - 'docs/**'

name: Quarto Publish

jobs:
  build-deploy:
    runs-on: ubuntu-latest
    permissions:
      contents: write
    steps:
      - name: Check out repository
        uses: actions/checkout@v4

      - name: Set up Quarto
        uses: quarto-dev/quarto-actions/setup@v2

      - name: Render and Publish
        uses: quarto-dev/quarto-actions/publish@v2
        with:
          path: ./docs
          target: gh-pages
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
```
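The `workflow_dispatch` trigger means the publish can also be kicked off by hand, not just by pushes that touch `docs/**`. A minimal sketch using the GitHub CLI, assuming the workflow file is committed on the default branch:

``` bash
# Trigger the "Quarto Publish" workflow manually (via workflow_dispatch).
gh workflow run "Quarto Publish"

# Check on the latest run of that workflow.
gh run list --workflow "Quarto Publish" --limit 1
```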
@@ -0,0 +1 @@

docs.datahub.berkeley.edu
@@ -0,0 +1,89 @@

``` yaml
project:
  type: website

website:
  navbar:
    title: "DataHub"
    search: true
    logo: datahub.svg
    background: light
    tools:
      - icon: github
        href: https://github.com/berkeley-dsep-infra/datahub
    left:
      - text: "Using DataHub"
        href: users/features.qmd
      - text: "Contributing"
        href: admins/pre-reqs.qmd
      - text: "Admin Tasks"
        href: admins/howto/preview-local.qmd
      - text: "Policy"
        href: policy/create_policy.qmd
  page-navigation: true
  sidebar:
    style: docked
    collapse-level: 1
    contents:
      - href: index.qmd
        text: Home
      - section: "Using DataHub"
        contents:
          - users/services.qmd
          - users/private-repo.qmd
          - users/hubs.qmd
          - users/authentication.qmd
      - section: "Contributing to DataHub"
        contents:
          - admins/pre-reqs.qmd
          - admins/structure.qmd
          - admins/storage.qmd
          - admins/cluster-config.qmd
          - admins/credentials.qmd
      - section: "Common Administrator Tasks"
        contents:
          - admins/howto/preview-local.qmd
          - admins/howto/dns.qmd
          - admins/howto/core-pool.qmd
          - admins/howto/new-hub.qmd
          - admins/howto/rebuild-hub-image.qmd
          - admins/howto/rebuild-postgres-image.qmd
          - admins/howto/new-image.qmd
          - admins/howto/new-packages.qmd
          - admins/howto/course-config.qmd
          - admins/howto/calendar-scaler.qmd
          - admins/howto/prometheus-grafana.qmd
          - admins/howto/remove-users-orm.qmd
          - admins/howto/delete-hub.qmd
          - admins/howto/clusterswitch.qmd
          - admins/howto/github-token.qmd
          - admins/howto/google-sheets.qmd
      - section: "Hub Deployments"
        contents:
          - admins/deployments/datahub.qmd
          - admins/deployments/stat159.qmd
      - section: "Policy"
        contents:
          - policy/create_policy.qmd
          - policy/policy_create_hubs.qmd
          - policy/policy_deploy_mainhubs.qmd
          - policy/principles.qmd
      - href: incidents/index.qmd
        text: "Incident Reports"

format:
  html:
    theme: book
    #title-block-banner: datahub.svg
    navbar:
      left:
        - "UC Berkeley DataHub Documentation"
      right:
        - icon: github
          href: https://github.com/berkeley-dsep-infra/datahub
    footer:
      copyright: "2024, Division of Data Sciences Technical Staff"
      nav:
        - title: UC Berkeley DataHub
          url: https://cdss.berkeley.edu/datahub
        - title: Project Jupyter
          url: https://jupyter.org
```
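With this config in place, the site can be rendered and checked locally before a push triggers the publish workflow above. A minimal sketch, assuming Quarto is installed and run from the repository root:

``` bash
# Render the docs site and serve it locally with live reload.
quarto preview ./docs
```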
@@ -0,0 +1,204 @@

---
title: Kubernetes Cluster Configuration
---
We use [Kubernetes](http://kubernetes.io/) to run our JupyterHubs. It has a healthy open source community, managed offerings from multiple vendors, and a fast pace of development. Because we run on top of Kubernetes, we can move between cloud providers with broadly similar configuration, so it also serves as our cloud-agnostic abstraction layer.

We prefer using a managed Kubernetes service (such as [Google Kubernetes Engine](https://cloud.google.com/kubernetes-engine/)). This document lays out our preferred cluster configuration on various cloud providers.

## Google Kubernetes Engine

In our experience, Google Kubernetes Engine (GKE) has been the most stable, performant, and reliable managed Kubernetes service. We prefer running on it when possible.

A `gcloud container clusters create` command can succinctly express the configuration of our Kubernetes cluster. The following command represents the currently favored configuration.

This creates the GKE cluster. It may host one or more node pools:
``` bash
gcloud container clusters create \
  --enable-ip-alias \
  --enable-autoscaling \
  --max-nodes=20 --min-nodes=1 \
  --region=us-central1 --node-locations=us-central1-b \
  --image-type=cos_containerd \
  --disk-size=100 --disk-type=pd-balanced \
  --machine-type=n2-highmem-8 \
  --cluster-version latest \
  --no-enable-autoupgrade \
  --enable-network-policy \
  --create-subnetwork="" \
  --tags=hub-cluster \
  <cluster-name>
```

Here's how we add a node pool to the cluster, beyond the default pool:
``` bash
gcloud container node-pools create \
  --machine-type n2-highmem-8 \
  --num-nodes 1 \
  --enable-autoscaling \
  --min-nodes 1 --max-nodes 20 \
  --node-labels hub.jupyter.org/pool-name=<pool-name>-pool \
  --node-taints hub.jupyter.org_dedicated=user:NoSchedule \
  --region=us-central1 \
  --image-type=cos_containerd \
  --disk-size=200 --disk-type=pd-balanced \
  --no-enable-autoupgrade \
  --tags=hub-cluster \
  --cluster=<cluster-name> \
  user-<pool-name>-<yyyy>-<mm>-<dd>
```

### IP Aliasing

`--enable-ip-alias` creates [VPC Native Clusters](https://cloud.google.com/kubernetes-engine/docs/how-to/alias-ips).

This is slated to become the default, and the flag can be dropped once it is.

### Autoscaling

We use the [Kubernetes cluster autoscaler](https://cloud.google.com/kubernetes-engine/docs/concepts/cluster-autoscaler) to scale our node count up and down based on demand. It waits until the cluster is completely full before triggering creation of a new node, but that's OK, since new node creation on GKE is pretty quick.

`--enable-autoscaling` turns the cluster autoscaler on.

`--min-nodes` sets the minimum number of nodes that will be maintained regardless of demand. This should ideally be 2, to give us some headroom for quick starts without requiring scale-ups when the cluster is completely empty.

`--max-nodes` sets the maximum number of nodes the cluster autoscaler will use, and hence the maximum number of concurrent users we can support. This should be set to a reasonably high number, but not too high: it protects against runaway creation of hundreds of VMs that might drain all our credits through accident or security breach.
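The autoscaling bounds can also be adjusted later on an existing node pool, without recreating it. A sketch, using the same `<cluster-name>` and pool-name placeholders as above:

``` bash
# Adjust the autoscaler's bounds on an existing node pool.
gcloud container clusters update <cluster-name> \
  --region=us-central1 \
  --node-pool=user-<pool-name>-<yyyy>-<mm>-<dd> \
  --enable-autoscaling \
  --min-nodes=2 --max-nodes=20
```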

### Highly available master

The Kubernetes cluster's master nodes are managed by Google Cloud automatically. By default, they are deployed in a non-highly-available configuration: only one node. This means that upgrades and master configuration changes cause a few minutes of downtime for the Kubernetes API, causing new user server starts / stops to fail.

We request [highly available masters](https://cloud.google.com/kubernetes-engine/docs/concepts/regional-clusters) with the `--region` parameter, which specifies the region whose zones our 3 master nodes will be spread across. It costs us extra, but it is totally worth it.

By default, asking for highly available masters also asks for 3x the node count, spread across multiple zones. We don't want that, since all our user pods have in-memory state and can't be relocated. Specifying `--node-locations` explicitly lets us control how many and which zones the nodes are located in.
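To double-check where the masters and nodes actually landed, something like the following works (a sketch; the exact output fields vary with gcloud versions):

``` bash
# Show the cluster's region and the zones its nodes are spread across.
gcloud container clusters describe <cluster-name> \
  --region=us-central1 \
  --format="value(location, locations)"
```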

### Region / Zone selection

We generally use the `us-central1` region and a zone in it for our clusters, simply because that is where we have asked for [quota](https://cloud.google.com/compute/quotas).

There are regions closer to us, but latency hasn't really mattered, so we are currently still in us-central1. There are also unsubstantiated rumors that us-central1 is their biggest data center and hence less likely to run out of quota.

### Disk Size

`--disk-size` sets the size of the root disk on all the Kubernetes nodes. This isn't used for any persistent storage such as user home directories; it is only used ephemerally for the operations of the cluster, primarily storing docker images and other temporary storage. We can make this larger if we use a large number of big images, or if we want our image pulls to be faster (since disk performance [increases with disk size](https://cloud.google.com/compute/docs/disks/performance)).

`--disk-type=pd-standard` gives us standard spinning disks, which are cheaper. We could instead request SSDs with `--disk-type=pd-ssd`: much faster, but also much more expensive. We compromise with `--disk-type=pd-balanced`, which is faster than spinning disks but not always as fast as SSDs.

### Node size

`--machine-type` lets us select how much [RAM and CPU](https://cloud.google.com/compute/docs/machine-types) each of our nodes has. For non-trivial hubs, we generally pick `n2-highmem-8`, with 64G of RAM and 8 cores. This is based on the following heuristics:

1. Students are generally memory limited rather than CPU limited. In fact, while we have a hard limit on memory use per user pod, we do not have a CPU limit; it hasn't proven necessary.
2. We try to overprovision clusters by about 2x, fitting about 100G of total RAM use in a node with about 50G of RAM. This is accomplished by setting the memory request to be about half of the memory limit on user pods (see the sketch below). This leads to massive cost savings, and works out OK.
3. There is a Kubernetes limit of 100 pods per node.

Based on these heuristics, `n2-highmem-8` currently seems to be the most bang for the buck. We should revisit this for every cluster creation.
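In the JupyterHub chart, that 2x overprovisioning heuristic amounts to setting the memory guarantee (the Kubernetes request) to about half the memory limit. A minimal sketch of a zero-to-jupyterhub-style values file, with illustrative numbers rather than our actual per-hub settings:

``` bash
# Illustrative values: the guarantee (request) is ~half the hard limit,
# so roughly 2x a node's RAM in limits can be packed onto it.
cat > memory-overprovision.yaml <<'EOF'
singleuser:
  memory:
    limit: 2G       # hard per-user memory cap
    guarantee: 1G   # scheduler request: about half the limit
EOF

# Apply it to a hub release (the release name is a placeholder).
helm upgrade <hub-release> jupyterhub/jupyterhub --values memory-overprovision.yaml
```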

### Cluster version

GKE automatically upgrades cluster masters, so there is generally no harm in being on the latest version available.

### Node autoupgrades

When node autoupgrades are enabled, GKE will automatically try to upgrade our nodes whenever needed (our GKE version falling off the support window, security issues, etc.). However, since we run stateful workloads, we *disable* this right now so we can do the upgrades manually.
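When we do upgrade, it is a manual two-step process: the master first, then each node pool. A sketch, assuming the node pools should simply be brought up to the master's version:

``` bash
# Upgrade the control plane first.
gcloud container clusters upgrade <cluster-name> \
  --region=us-central1 --master

# Then upgrade each node pool to match the master's version.
gcloud container clusters upgrade <cluster-name> \
  --region=us-central1 \
  --node-pool=user-<pool-name>-<yyyy>-<mm>-<dd>
```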

### Network Policy

Kubernetes [Network Policy](https://kubernetes.io/docs/concepts/services-networking/network-policies/) lets you firewall internal access inside a Kubernetes cluster, whitelisting only the flows you want. The JupyterHub chart we use supports setting up the NetworkPolicy objects it needs, so we should turn it on for additional defense in depth. Note that any extra in-cluster services we run *must* have a NetworkPolicy set up for them to work reliably.
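For example, if we ran an extra in-cluster service, hub pods would need an explicit allow rule to reach it once network policy is enforced. A minimal sketch; the namespace, labels, and port here are hypothetical:

``` bash
# Hypothetical policy: let pods labeled app=jupyterhub reach an
# internal service's pods on TCP 8080.
kubectl apply -f - <<'EOF'
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: allow-hub-to-internal-service
  namespace: <hub-namespace>
spec:
  podSelector:
    matchLabels:
      app: internal-service
  policyTypes:
    - Ingress
  ingress:
    - from:
        - podSelector:
            matchLabels:
              app: jupyterhub
      ports:
        - protocol: TCP
          port: 8080
EOF
```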

### Subnetwork

We put each cluster in its own subnetwork, since there *seems* to be a limit on how many clusters you can create in the same network with IP aliasing on: you just run out of addresses. This also gives us some isolation, since subnetworks are isolated by default and can't reach other resources. You must add [firewall rules](https://cloud.google.com/vpc/docs/using-firewalls) to provide access, including access to any manually run NFS servers. We add tags for this.

### Tags

To help with firewalling, we add [network tags](https://cloud.google.com/vpc/docs/add-remove-network-tags) to all our cluster nodes. This lets us add firewall rules to control traffic between subnetworks.
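For instance, letting tagged cluster nodes reach a manually run NFS server might look like the following sketch (the `nfs-server` tag, rule name, and network name are hypothetical):

``` bash
# Allow NFS traffic from nodes tagged hub-cluster to nodes tagged nfs-server.
gcloud compute firewall-rules create allow-hub-cluster-nfs \
  --network=<network-name> \
  --source-tags=hub-cluster \
  --target-tags=nfs-server \
  --allow=tcp:2049,tcp:111,udp:111
```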

### Cluster name

We try to use a descriptive name as much as possible.