From bea79967e89d09aa8a50af0f2ba813c3ed6547a0 Mon Sep 17 00:00:00 2001 From: Simon Li Date: Mon, 12 Jun 2023 23:51:52 +0100 Subject: [PATCH 01/32] Add new chart for managing kube-system charts --- .github/workflows/cd.yml | 2 ++ .gitignore | 6 +++--- chartpress.yaml | 1 + config-kube-system/aws-curvenote.yaml | 19 +++++++++++++++++++ config/aws-curvenote.yaml | 0 mybinder-kube-system/Chart.yaml | 17 +++++++++++++++++ mybinder-kube-system/values.yaml | 7 +++++++ 7 files changed, 49 insertions(+), 3 deletions(-) create mode 100644 config-kube-system/aws-curvenote.yaml create mode 100644 config/aws-curvenote.yaml create mode 100644 mybinder-kube-system/Chart.yaml create mode 100644 mybinder-kube-system/values.yaml diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index 2e4910389..d9c8a2e7d 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -152,6 +152,7 @@ jobs: run: | curl -sf https://raw.githubusercontent.com/helm/helm/HEAD/scripts/get-helm-3 | DESIRED_VERSION=${HELM_VERSION} bash helm dependency update ./mybinder + helm dependency update ./mybinder-kube-system # Action Repo: https://github.com/sliteteam/github-action-git-crypt-unlock - name: "Stage 2: Unlock git-crypt secrets" @@ -290,6 +291,7 @@ jobs: run: | curl -sf https://raw.githubusercontent.com/helm/helm/HEAD/scripts/get-helm-3 | DESIRED_VERSION=${HELM_VERSION} bash helm dependency update ./mybinder + helm dependency update ./mybinder-kube-system - name: "Stage 2: Unlock git-crypt secrets" uses: sliteteam/github-action-git-crypt-unlock@8b1fa3ccc81e322c5c45fbab261eee46513fd3f8 diff --git a/.gitignore b/.gitignore index 6f108494c..897a44867 100644 --- a/.gitignore +++ b/.gitignore @@ -8,9 +8,9 @@ __pycache__ config/common/datacenter-*.yaml secrets/banned_hosts.txt secrets/config/common/bans.yaml -mybinder/charts -mybinder/requirements.lock -mybinder/Chart.lock +mybinder*/charts +mybinder*/requirements.lock +mybinder*/Chart.lock .ipynb_checkpoints diff --git a/chartpress.yaml 
b/chartpress.yaml index d2a420d8b..77a85d5d7 100644 --- a/chartpress.yaml +++ b/chartpress.yaml @@ -10,3 +10,4 @@ charts: valuesPath: minesweeper.image tc-init: valuesPath: binderhub.jupyterhub.singleuser.initContainers.0.image + - name: mybinder-kube-system diff --git a/config-kube-system/aws-curvenote.yaml b/config-kube-system/aws-curvenote.yaml new file mode 100644 index 000000000..17e781060 --- /dev/null +++ b/config-kube-system/aws-curvenote.yaml @@ -0,0 +1,19 @@ +# Install the more modern load-balancer controller: +# https://docs.aws.amazon.com/eks/latest/userguide/aws-load-balancer-controller.html +aws-load-balancer-controller: + enabled: true + clusterName: binderhub + # Must match the IRSA service account name + name: aws-load-balancer-controller + serviceAccount: + annotations: + eks.amazonaws.com/role-arn: "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/IRSA-aws-load-balancer-controller" + +aws-ebs-csi-driver: + enabled: true + controller: + serviceAccount: + # Must match the IRSA service account name + name: ebs-csi-controller-sa + annotations: + eks.amazonaws.com/role-arn: "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/IRSA-aws-ebs-csi-driver" diff --git a/config/aws-curvenote.yaml b/config/aws-curvenote.yaml new file mode 100644 index 000000000..e69de29bb diff --git a/mybinder-kube-system/Chart.yaml b/mybinder-kube-system/Chart.yaml new file mode 100644 index 000000000..5adf7a553 --- /dev/null +++ b/mybinder-kube-system/Chart.yaml @@ -0,0 +1,17 @@ +apiVersion: v2 +description: A meta-chart for the kube-system charts on some mybinder.org hosts +name: mybinder-kube-system +version: "0.0.1-set.by.chartpress" +kubeVersion: ">= 1.26.0-0" +dependencies: + # https://artifacthub.io/packages/helm/aws/aws-load-balancer-controller + - name: aws-load-balancer-controller + version: 1.5.3 + repository: https://aws.github.io/eks-charts + condition: aws-load-balancer-controller.enabled + + # 
https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/master/docs/install.md + - name: aws-ebs-csi-driver + version: 2.17.2 + repository: https://kubernetes-sigs.github.io/aws-ebs-csi-driver + condition: aws-ebs-csi-driver.enabled diff --git a/mybinder-kube-system/values.yaml b/mybinder-kube-system/values.yaml new file mode 100644 index 000000000..f62144f76 --- /dev/null +++ b/mybinder-kube-system/values.yaml @@ -0,0 +1,7 @@ +# AWS EKS load-balancer controller +aws-load-balancer-controller: + enabled: false + +# AWS EKS storage (EBS) controller +aws-ebs-csi-driver: + enabled: false From fc3fdb574c2b09668bd4dae7c90ecc22d005b280 Mon Sep 17 00:00:00 2001 From: Simon Li Date: Tue, 13 Jun 2023 23:53:18 +0100 Subject: [PATCH 02/32] Add aws curvenote binderhub Use manual build of https://github.com/jupyterhub/binderhub/pull/1724 --- config-kube-system/aws-curvenote.yaml | 19 -- config-kube-system/curvenote.yaml | 31 +++ config/aws-curvenote.yaml | 0 config/curvenote.yaml | 297 ++++++++++++++++++++++++++ deploy.py | 75 ++++++- mybinder/Chart.yaml | 7 + mybinder/values.yaml | 3 + 7 files changed, 408 insertions(+), 24 deletions(-) delete mode 100644 config-kube-system/aws-curvenote.yaml create mode 100644 config-kube-system/curvenote.yaml delete mode 100644 config/aws-curvenote.yaml create mode 100644 config/curvenote.yaml diff --git a/config-kube-system/aws-curvenote.yaml b/config-kube-system/aws-curvenote.yaml deleted file mode 100644 index 17e781060..000000000 --- a/config-kube-system/aws-curvenote.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Install the more modern load-balancer controller: -# https://docs.aws.amazon.com/eks/latest/userguide/aws-load-balancer-controller.html -aws-load-balancer-controller: - enabled: true - clusterName: binderhub - # Must match the IRSA service account name - name: aws-load-balancer-controller - serviceAccount: - annotations: - eks.amazonaws.com/role-arn: 
"arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/IRSA-aws-load-balancer-controller" - -aws-ebs-csi-driver: - enabled: true - controller: - serviceAccount: - # Must match the IRSA service account name - name: ebs-csi-controller-sa - annotations: - eks.amazonaws.com/role-arn: "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/IRSA-aws-ebs-csi-driver" diff --git a/config-kube-system/curvenote.yaml b/config-kube-system/curvenote.yaml new file mode 100644 index 000000000..277dfd9f9 --- /dev/null +++ b/config-kube-system/curvenote.yaml @@ -0,0 +1,31 @@ +# Install the more modern load-balancer controller: +# https://docs.aws.amazon.com/eks/latest/userguide/aws-load-balancer-controller.html +aws-load-balancer-controller: + enabled: true + clusterName: binderhub + clusterSecretsPermissions: + allowAllSecrets: true + enableShield: false + enableWaf: false + enableWafv2: false + logLevel: debug + serviceAccount: + # Must match the IRSA service account name + name: aws-load-balancer-controller + annotations: + eks.amazonaws.com/role-arn: "arn:aws:iam::166088433508:role/binderhub-IRSA-aws-load-balancer-controller" + +aws-ebs-csi-driver: + enabled: true + controller: + serviceAccount: + # Must match the IRSA service account name + name: ebs-csi-controller-sa + annotations: + eks.amazonaws.com/role-arn: "arn:aws:iam::166088433508:role/binderhub-IRSA-ebs-csi-controller-sa" + storageClasses: + - name: ebs-sc + # Note this results in EKS having two default StorageClasses, so to be sure + # always specify the storage class in the PVC. 
+ annotations: + storageclass.kubernetes.io/is-default-class: "true" diff --git a/config/aws-curvenote.yaml b/config/aws-curvenote.yaml deleted file mode 100644 index e69de29bb..000000000 diff --git a/config/curvenote.yaml b/config/curvenote.yaml new file mode 100644 index 000000000..78a5f2050 --- /dev/null +++ b/config/curvenote.yaml @@ -0,0 +1,297 @@ +projectName: curvenote + +# userNodeSelector: &userNodeSelector +# mybinder.org/pool-type: users +# coreNodeSelector: &coreNodeSelector +# mybinder.org/pool-type: core + +binderhub: + # Manual build with https://github.com/jupyterhub/binderhub/pull/1637 + image: + name: docker.io/manics/binderhub + tag: pr1724-2023-06-27-21-50-amd64 + + config: + BinderHub: + # hub_url: https://hub.curvenote.mybinder.org + hub_url: https://hub.3.13.147.101.nip.io + hub_url_local: http://proxy-public + badge_base_url: https://mybinder.org + # build_node_selector: + # mybinder.org/pool-type: builds + sticky_builds: true + image_prefix: 166088433508.dkr.ecr.us-east-2.amazonaws.com/binderhub/ + # log_level: DEBUG + # TODO: we should have CPU requests, too + # use this to limit the number of builds per node + # complicated: dind memory request + KubernetesBuildExecutor.memory_request * builds_per_node ~= node memory + KubernetesBuildExecutor: + memory_request: "2G" + + LaunchQuota: + total_quota: 10 + + registry: + url: 166088433508.dkr.ecr.us-east-2.amazonaws.com + username: "" + password: "" + + replicas: 1 + # nodeSelector: *coreNodeSelector + + # extraVolumes: + # - name: secrets + # secret: + # secretName: events-archiver-secrets + # extraVolumeMounts: + # - name: secrets + # mountPath: /secrets + # readOnly: true + # extraEnv: + # GOOGLE_APPLICATION_CREDENTIALS: /secrets/service-account.json + + extraConfig: + 01-eventlog: | + # Disabled until GOOGLE_APPLICATION_CREDENTIALS secret is available + + 10-external-registry-helper: | + # from binderhub.registry import ExternalRegistryHelper + import json + from tornado import httpclient 
+ from traitlets import Unicode + from binderhub.registry import DockerRegistry + + + class ExternalRegistryHelper(DockerRegistry): + + service_url = Unicode( + "http://binderhub-container-registry-helper:8080", + allow_none=False, + help="The URL of the registry helper micro-service.", + config=True, + ) + + auth_token = Unicode( + "secret-token", + help="The auth token to use when accessing the registry helper micro-service.", + config=True, + ) + + async def _request(self, endpoint, **kwargs): + client = httpclient.AsyncHTTPClient() + repo_url = f"{self.service_url}{endpoint}" + headers = {"Authorization": f"Bearer {self.auth_token}"} + repo = await client.fetch(repo_url, headers=headers, **kwargs) + return json.loads(repo.body.decode("utf-8")) + + async def _get_image(self, image, tag): + repo_url = f"/image/{image}:{tag}" + self.log.debug(f"Checking whether image exists: {repo_url}") + try: + image_json = await self._request(repo_url) + return image_json + except httpclient.HTTPError as e: + if e.code == 404: + return None + else: + raise + + async def get_image_manifest(self, image, tag): + """ + Checks whether the image exists in the registry. + + If the container repository doesn't exist create the repository. + + The container repository name may not be the same as the BinderHub image name. + + E.g. Oracle Container Registry (OCIR) has the form: + OCIR_NAMESPACE/OCIR_REPOSITORY_NAME:TAG + + These extra components are handled automatically by the registry helper + so BinderHub repository names such as OCIR_NAMESPACE/OCIR_REPOSITORY_NAME + can be used directly, it is not necessary to remove the extra components. 
+ + Returns the image manifest if the image exists, otherwise None + """ + + repo_url = f"/repo/{image}" + self.log.debug(f"Checking whether repository exists: {repo_url}") + try: + repo_json = await self._request(repo_url) + except httpclient.HTTPError as e: + if e.code == 404: + repo_json = None + else: + raise + + if repo_json: + return await self._get_image(image, tag) + else: + self.log.debug(f"Creating repository: {repo_url}") + await self._request(repo_url, method="POST", body="") + return None + + async def get_credentials(self, image, tag): + token_url = f"/token/{image}:{tag}" + self.log.debug(f"Getting registry token: {token_url}") + token_json = None + try: + token_json = await self._request(token_url, method="POST", body="") + except httpclient.HTTPError as e: + if e.code != 404: + raise + token = dict((k, v) for (k, v) in token_json.items() if k in ["username", "password", "registry"]) + self.log.debug(f"Returning registry token: {token}") + return token + + c.BinderHub.registry_class = ExternalRegistryHelper + c.ExternalRegistryHelper.service_url = "http://curvenote-binderhub-container-registry-helper:8080" + c.ExternalRegistryHelper.auth_token = "secret-token-use-existing-secret-instead" + + dind: + resources: + requests: + cpu: "4" + memory: 16Gi + limits: + cpu: "7" + memory: 24Gi + + ingress: + hosts: + # - curvenote.mybinder.org + - 3.13.147.101.nip.io + + jupyterhub: + # singleuser: + # nodeSelector: *userNodeSelector + # hub: + # nodeSelector: *coreNodeSelector + hub: + db: + pvc: + storageClassName: ebs-sc + + singleuser: + initContainers: + - name: tc-init + image: jupyterhub/mybinder.org-tc-init:2020.12.4-0.dev.git.4289.h140cef52 + imagePullPolicy: IfNotPresent + env: + - name: WHITELIST_CIDR + value: 10.0.0.0/8 + - name: EGRESS_BANDWIDTH + value: 1mbit + securityContext: + # capabilities.add seems to be disabled + # by the `runAsUser: 1000` in the pod-level securityContext + # unless we explicitly run as root + runAsUser: 0 + capabilities: 
+ add: + - NET_ADMIN + + proxy: + chp: + # nodeSelector: *coreNodeSelector + resources: + requests: + cpu: "1" + limits: + cpu: "1" + ingress: + hosts: + # - hub.curvenote.mybinder.org + - hub.3.13.147.101.nip.io + tls: + - secretName: kubelego-tls-hub + hosts: + # - hub.curvenote.mybinder.org + - hub.3.13.147.101.nip.io + scheduling: + userPlaceholder: + enabled: false + replicas: 50 + userScheduler: + enabled: false + # nodeSelector: *coreNodeSelector + cull: + # maxAge: 15 min since we're just testing + maxAge: 900 + + imageCleaner: + # Use 40GB as upper limit, size is given in bytes + imageGCThresholdHigh: 40e9 + imageGCThresholdLow: 30e9 + imageGCThresholdType: "absolute" + +cryptnono: + enabled: false + +grafana: + enabled: false + # nodeSelector: *coreNodeSelector + ingress: + hosts: + # - grafana.curvenote.mybinder.org + tls: + - hosts: + # - grafana.curvenote.mybinder.org + secretName: kubelego-tls-grafana + datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: prometheus + orgId: 1 + type: prometheus + url: https://prometheus.curvenote.mybinder.org + access: direct + isDefault: true + editable: false + persistence: + storageClassName: csi-cinder-high-speed + +prometheus: + enabled: false + server: + # nodeSelector: *coreNodeSelector + persistentVolume: + size: 50Gi + retention: 30d + ingress: + hosts: + # - prometheus.curvenote.mybinder.org + tls: + - hosts: + # - prometheus.curvenote.mybinder.org + secretName: kubelego-tls-prometheus + +ingress-nginx: + controller: + service: + # loadBalancerIP: 162.19.17.37 + annotations: + service.beta.kubernetes.io/aws-load-balancer-scheme: "internet-facing" + +static: + ingress: + hosts: + # - static.curvenote.mybinder.org + - static.3.13.147.101.nip.io + +minesweeper: + # Requires secrets + enabled: false + image: jupyterhub/mybinder.org-minesweeper:2020.12.4-0.dev.git.5080.hf35cc80d + + +binderhub-container-registry-helper: + enabled: true + auth_token: secret-token-use-existing-secret-instead + 
# auth_existing_secret_name: + replicaCount: 2 + serviceAccount: + name: binderhub-container-registry-helper + annotations: + eks.amazonaws.com/role-arn: "arn:aws:iam::166088433508:role/binderhub-IRSA-aws-binderhub-ecr" diff --git a/deploy.py b/deploy.py index 61a1f5274..077519cdb 100755 --- a/deploy.py +++ b/deploy.py @@ -29,6 +29,9 @@ "prod": "us-central1", } +# Mapping of config name to cluster name for AWS EKS deployments +AWS_DEPLOYMENTS = {"curvenote": "binderhub"} + # Mapping of cluster names (keys) to resource group names (values) for Azure deployments AZURE_RGs = {} @@ -140,6 +143,27 @@ def setup_auth_gcloud(release, cluster=None, dry_run=False): ) +def setup_auth_aws(cluster, dry_run=False): + """ + Set up authentication for EKS on AWS + + Assumes you already have an AWS CLI profile setup with access to EKS, + and that either this is the default profile (e.g. on CI) or you have set the + AWS_PROFILE environment variable. + """ + print(BOLD + GREEN + f"Obtaining AWS EKS kubeconfig for {cluster}" + NC, flush=True) + + eks_kubeconfig = [ + "aws", + "eks", + "update-kubeconfig", + "--name", + AWS_DEPLOYMENTS[cluster], + ] + stdout = check_output(eks_kubeconfig, dry_run) + print(stdout) + + def update_networkbans(cluster, dry_run=False): """ Run secrets/ban.py to update network bans @@ -160,12 +184,14 @@ def get_config_files(release, config_dir="config"): """Return the list of config files to load""" # common config files config_files = sorted(glob.glob(os.path.join(config_dir, "common", "*.yaml"))) - config_files.extend( - sorted(glob.glob(os.path.join("secrets", config_dir, "common", "*.yaml"))) - ) + # config_files.extend( + # sorted(glob.glob(os.path.join("secrets", config_dir, "common", "*.yaml"))) + # ) # release-specific config files for config_dir in (config_dir, os.path.join("secrets", config_dir)): - config_files.append(os.path.join(config_dir, release + ".yaml")) + f = os.path.join(config_dir, release + ".yaml") + if os.path.exists(f): + 
config_files.append(f) return config_files @@ -309,6 +335,41 @@ def patch_coredns(dry_run=False): ) +def deploy_kube_system_charts(release, name=None, dry_run=False): + """ + Some charts must be deployed into the kube-system namespace + """ + if not name: + name = release + log_name = f"mybinder-kube-system {release}" + + config_files = get_config_files(release, config_dir="config-kube-system") + if not config_files: + print(BOLD + GREEN + f"No config files found for {log_name}" + NC, flush=True) + return + + print(BOLD + GREEN + f"Starting helm upgrade for {log_name}" + NC, flush=True) + helm = [ + "helm", + "upgrade", + "--install", + "--cleanup-on-fail", + "--namespace=kube-system", + name, + "mybinder-kube-system", + ] + for config_file in config_files: + helm.extend(["-f", config_file]) + + check_call(helm, dry_run) + print( + BOLD + GREEN + f"SUCCESS: Helm upgrade for {log_name} completed" + NC, + flush=True, + ) + + wait_for_deployments_daemonsets("kube-system", dry_run) + + def main(): # parse command line args argparser = argparse.ArgumentParser() @@ -320,6 +381,7 @@ def main(): "prod", "ovh", "ovh2", + "curvenote", ], ) argparser.add_argument( @@ -383,10 +445,13 @@ def main(): setup_auth_azure(cluster, args.dry_run) elif cluster in GCP_PROJECTS: setup_auth_gcloud(args.release, cluster, args.dry_run) + elif cluster in AWS_DEPLOYMENTS: + setup_auth_aws(cluster, args.dry_run) else: raise Exception("Cloud cluster not recognised!") - update_networkbans(cluster, args.dry_run) + # update_networkbans(cluster, args.dry_run) + deploy_kube_system_charts(args.release, args.name, args.dry_run) deploy(args.release, args.name, args.dry_run) diff --git a/mybinder/Chart.yaml b/mybinder/Chart.yaml index ae9ad6ace..135f8d143 100644 --- a/mybinder/Chart.yaml +++ b/mybinder/Chart.yaml @@ -63,3 +63,10 @@ dependencies: version: 9.21.1 repository: https://kubernetes.github.io/autoscaler condition: cluster-autoscaler.enabled + + # Registry helper, used to create container 
repositories before pushing and to + # fetch dynamic registry credentials + - name: binderhub-container-registry-helper + version: 0.2.0 + repository: oci://ghcr.io/manics/binderhub-container-registry-helper + condition: binderhub-container-registry-helper.enabled diff --git a/mybinder/values.yaml b/mybinder/values.yaml index e33e9c5e3..d5d986762 100644 --- a/mybinder/values.yaml +++ b/mybinder/values.yaml @@ -600,3 +600,6 @@ minesweeper: # cluster-autoscaler: enabled: false + +binderhub-container-registry-helper: + enabled: false From 728ce81387f5a3871b8f8a57c9d6b9868a61536c Mon Sep 17 00:00:00 2001 From: Simon Li Date: Wed, 28 Jun 2023 00:19:00 +0100 Subject: [PATCH 03/32] Add aws ecr cleaner --- config/curvenote.yaml | 9 +++- .../aws-ecr-registry-cleaner/deployment.yaml | 45 +++++++++++++++++++ .../serviceaccount.yaml | 12 +++++ mybinder/values.yaml | 10 +++++ 4 files changed, 74 insertions(+), 2 deletions(-) create mode 100644 mybinder/templates/aws-ecr-registry-cleaner/deployment.yaml create mode 100644 mybinder/templates/aws-ecr-registry-cleaner/serviceaccount.yaml diff --git a/config/curvenote.yaml b/config/curvenote.yaml index 78a5f2050..08d4fed80 100644 --- a/config/curvenote.yaml +++ b/config/curvenote.yaml @@ -270,7 +270,7 @@ prometheus: ingress-nginx: controller: service: - # loadBalancerIP: 162.19.17.37 + # loadBalancerIP: 162.19.17.37 annotations: service.beta.kubernetes.io/aws-load-balancer-scheme: "internet-facing" @@ -285,7 +285,6 @@ minesweeper: enabled: false image: jupyterhub/mybinder.org-minesweeper:2020.12.4-0.dev.git.5080.hf35cc80d - binderhub-container-registry-helper: enabled: true auth_token: secret-token-use-existing-secret-instead @@ -295,3 +294,9 @@ binderhub-container-registry-helper: name: binderhub-container-registry-helper annotations: eks.amazonaws.com/role-arn: "arn:aws:iam::166088433508:role/binderhub-IRSA-aws-binderhub-ecr" + +awsEcrRegistryCleaner: + enabled: true + serviceAccount: + annotations: + eks.amazonaws.com/role-arn: 
"arn:aws:iam::166088433508:role/binderhub-IRSA-aws-binderhub-ecr-registry-cleaner" diff --git a/mybinder/templates/aws-ecr-registry-cleaner/deployment.yaml b/mybinder/templates/aws-ecr-registry-cleaner/deployment.yaml new file mode 100644 index 000000000..ed440f8f5 --- /dev/null +++ b/mybinder/templates/aws-ecr-registry-cleaner/deployment.yaml @@ -0,0 +1,45 @@ +{{- if .Values.awsEcrRegistryCleaner.enabled -}} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: aws-ecr-registry-cleaner + labels: + app: aws-ecr-registry-cleaner + component: aws-ecr-registry-cleaner + heritage: {{ .Release.Service }} + release: {{ .Release.Name }} +spec: + # Never run more than one cleaner pod at a time + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app: aws-ecr-registry-cleaner + component: aws-ecr-registry-cleaner + release: {{ .Release.Name }} + replicas: 1 + template: + metadata: + labels: + app: aws-ecr-registry-cleaner + component: aws-ecr-registry-cleaner + release: {{ .Release.Name }} + spec: + containers: + - name: cleaner + image: {{ .Values.awsEcrRegistryCleaner.image }} + args: + - -expires-after-pull-days={{ .Values.awsEcrRegistryCleaner.expiresAfterPullDays }} + - -loop-delay={{ .Values.awsEcrRegistryCleaner.loopDelay }} + tolerations: + - effect: NoSchedule + key: hub.jupyter.org/dedicated + operator: Equal + value: user + - effect: NoSchedule + key: hub.jupyter.org_dedicated + operator: Equal + value: user + serviceAccountName: binderhub-ecr-registry-cleaner +{{- end }} diff --git a/mybinder/templates/aws-ecr-registry-cleaner/serviceaccount.yaml b/mybinder/templates/aws-ecr-registry-cleaner/serviceaccount.yaml new file mode 100644 index 000000000..2a74f7df5 --- /dev/null +++ b/mybinder/templates/aws-ecr-registry-cleaner/serviceaccount.yaml @@ -0,0 +1,12 @@ +{{- if .Values.awsEcrRegistryCleaner.enabled -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ .Values.awsEcrRegistryCleaner.serviceAccount.name }} + labels: + {{- 
include "binderhub-container-registry-helper.labels" . | nindent 4 }} + {{- with .Values.awsEcrRegistryCleaner.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} diff --git a/mybinder/values.yaml b/mybinder/values.yaml index d5d986762..220f8cb9e 100644 --- a/mybinder/values.yaml +++ b/mybinder/values.yaml @@ -603,3 +603,13 @@ cluster-autoscaler: binderhub-container-registry-helper: enabled: false + +awsEcrRegistryCleaner: + enabled: false + image: ghcr.io/manics/aws-ecr-registry-cleaner:0.0.1 + expiresAfterPullDays: 1 + # 12 hours + loopDelay: 43200 + serviceAccount: + name: binderhub-ecr-registry-cleaner + annotations: {} From 16c5ae2805cfa0e284bd38626701433196e84e60 Mon Sep 17 00:00:00 2001 From: Simon Li Date: Fri, 30 Jun 2023 14:23:52 +0100 Subject: [PATCH 04/32] deploy.py: add stages for interactive use --- deploy.py | 51 +++++++++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/deploy.py b/deploy.py index 077519cdb..b9d66fa66 100755 --- a/deploy.py +++ b/deploy.py @@ -198,17 +198,11 @@ def get_config_files(release, config_dir="config"): def deploy(release, name=None, dry_run=False): """Deploys a federation member to a k8s cluster. - The deployment is done in the following steps: - - 1. Deploy cert-manager - 2. Deploy mybinder helm chart - 3. 
Await deployed deployment and daemonsets to become Ready + Waits for deployments and daemonsets to become Ready """ if not name: name = release - setup_certmanager(dry_run) - print(BOLD + GREEN + f"Starting helm upgrade for {release}" + NC, flush=True) helm = [ "helm", @@ -404,6 +398,13 @@ def main(): action="store_true", help="Print commands, but don't run them", ) + stages = ["all", "auth", "networkbans", "kubesystem", "certmanager", "mybinder"] + argparser.add_argument( + "--stage", + choices=stages, + default=stages[0], + help="Stage to deploy, default all", + ) args = argparser.parse_args() @@ -438,21 +439,27 @@ def main(): # script is running on CI, proceed with auth and helm setup - if cluster.startswith("ovh"): - setup_auth_ovh(args.release, cluster, args.dry_run) - patch_coredns(args.dry_run) - elif cluster in AZURE_RGs: - setup_auth_azure(cluster, args.dry_run) - elif cluster in GCP_PROJECTS: - setup_auth_gcloud(args.release, cluster, args.dry_run) - elif cluster in AWS_DEPLOYMENTS: - setup_auth_aws(cluster, args.dry_run) - else: - raise Exception("Cloud cluster not recognised!") - - # update_networkbans(cluster, args.dry_run) - deploy_kube_system_charts(args.release, args.name, args.dry_run) - deploy(args.release, args.name, args.dry_run) + if args.stage in ("all", "auth"): + if cluster.startswith("ovh"): + setup_auth_ovh(args.release, cluster, args.dry_run) + patch_coredns(args.dry_run) + elif cluster in AZURE_RGs: + setup_auth_azure(cluster, args.dry_run) + elif cluster in GCP_PROJECTS: + setup_auth_gcloud(args.release, cluster, args.dry_run) + elif cluster in AWS_DEPLOYMENTS: + setup_auth_aws(cluster, args.dry_run) + else: + raise Exception("Cloud cluster not recognised!") + + if args.stage in ("all", "networkban"): + update_networkbans(cluster, args.dry_run) + if args.stage in ("all", "kubesystem"): + deploy_kube_system_charts(args.release, args.name, args.dry_run) + if args.stage in ("all", "certmanager"): + setup_certmanager(args.dry_run) + if 
args.stage in ("all", "mybinder"): + deploy(args.release, args.name, args.dry_run) if __name__ == "__main__": From f624b5a0436b075964163c693108db6a5dcb8a16 Mon Sep 17 00:00:00 2001 From: Simon Li Date: Fri, 30 Jun 2023 20:09:10 +0100 Subject: [PATCH 05/32] enableServiceLinks: false --- config/curvenote.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/config/curvenote.yaml b/config/curvenote.yaml index 08d4fed80..fba206fc6 100644 --- a/config/curvenote.yaml +++ b/config/curvenote.yaml @@ -171,6 +171,10 @@ binderhub: db: pvc: storageClassName: ebs-sc + config: + KubeSpawner: + extra_pod_config: + enableServiceLinks: false singleuser: initContainers: From 1c607b61e57eca3f61b4d10101373b0ed4ad333e Mon Sep 17 00:00:00 2001 From: Simon Li Date: Wed, 12 Jul 2023 21:24:10 +0100 Subject: [PATCH 06/32] deploy.py: uncomment secret config loading --- config/curvenote.yaml | 5 ----- deploy.py | 6 +++--- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/config/curvenote.yaml b/config/curvenote.yaml index fba206fc6..04a92fac8 100644 --- a/config/curvenote.yaml +++ b/config/curvenote.yaml @@ -6,11 +6,6 @@ projectName: curvenote # mybinder.org/pool-type: core binderhub: - # Manual build with https://github.com/jupyterhub/binderhub/pull/1637 - image: - name: docker.io/manics/binderhub - tag: pr1724-2023-06-27-21-50-amd64 - config: BinderHub: # hub_url: https://hub.curvenote.mybinder.org diff --git a/deploy.py b/deploy.py index b9d66fa66..1c2e0c0b6 100755 --- a/deploy.py +++ b/deploy.py @@ -184,9 +184,9 @@ def get_config_files(release, config_dir="config"): """Return the list of config files to load""" # common config files config_files = sorted(glob.glob(os.path.join(config_dir, "common", "*.yaml"))) - # config_files.extend( - # sorted(glob.glob(os.path.join("secrets", config_dir, "common", "*.yaml"))) - # ) + config_files.extend( + sorted(glob.glob(os.path.join("secrets", config_dir, "common", "*.yaml"))) + ) # release-specific config files for config_dir in 
(config_dir, os.path.join("secrets", config_dir)): f = os.path.join(config_dir, release + ".yaml") From 683a8a9119b070151a1368a4028b4bdbce5b3fe0 Mon Sep 17 00:00:00 2001 From: Simon Li Date: Wed, 12 Jul 2023 21:24:22 +0100 Subject: [PATCH 07/32] aws-ecr-registry-cleaner: 7 days --- mybinder/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mybinder/values.yaml b/mybinder/values.yaml index 220f8cb9e..9962a60ad 100644 --- a/mybinder/values.yaml +++ b/mybinder/values.yaml @@ -607,7 +607,7 @@ binderhub-container-registry-helper: awsEcrRegistryCleaner: enabled: false image: ghcr.io/manics/aws-ecr-registry-cleaner:0.0.1 - expiresAfterPullDays: 1 + expiresAfterPullDays: 7 # 12 hours loopDelay: 43200 serviceAccount: From e8f9a9699bc4f34f96435bdda27a35470e72d17a Mon Sep 17 00:00:00 2001 From: Simon Li Date: Sat, 22 Jul 2023 20:41:25 +0100 Subject: [PATCH 08/32] Use oci://ghcr.io/manics/oci-helm-charts/binderhub-container-registry-helper:0.2.1 --- mybinder/Chart.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mybinder/Chart.yaml b/mybinder/Chart.yaml index 135f8d143..39e509a25 100644 --- a/mybinder/Chart.yaml +++ b/mybinder/Chart.yaml @@ -67,6 +67,6 @@ dependencies: # Registry helper, used to create container repositories before pushing and to # fetch dynamic registry credentials - name: binderhub-container-registry-helper - version: 0.2.0 - repository: oci://ghcr.io/manics/binderhub-container-registry-helper + version: 0.2.1 + repository: oci://ghcr.io/manics/oci-helm-charts condition: binderhub-container-registry-helper.enabled From dadee41ad6555061fda6c1706af2e25a68cc2a4f Mon Sep 17 00:00:00 2001 From: Simon Li Date: Sat, 22 Jul 2023 20:41:48 +0100 Subject: [PATCH 09/32] KubeSpawner.image_pull_policy Always --- config/curvenote.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/config/curvenote.yaml b/config/curvenote.yaml index 04a92fac8..8b39ce114 100644 --- a/config/curvenote.yaml +++ b/config/curvenote.yaml @@ 
-170,6 +170,7 @@ binderhub: KubeSpawner: extra_pod_config: enableServiceLinks: false + image_pull_policy: Always singleuser: initContainers: From 0d141e25b3b75bea8383f217646ced301699d7f3 Mon Sep 17 00:00:00 2001 From: Simon Li Date: Tue, 1 Aug 2023 00:02:38 +0100 Subject: [PATCH 10/32] curvenote binderhub: maxAge 1 hour --- config/curvenote.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/curvenote.yaml b/config/curvenote.yaml index 8b39ce114..e108433ae 100644 --- a/config/curvenote.yaml +++ b/config/curvenote.yaml @@ -216,8 +216,8 @@ binderhub: enabled: false # nodeSelector: *coreNodeSelector cull: - # maxAge: 15 min since we're just testing - maxAge: 900 + # maxAge: 1 hour since we're just testing + maxAge: 3600 imageCleaner: # Use 40GB as upper limit, size is given in bytes From 409a8ff21df661acd6259ec96625f4016948ceae Mon Sep 17 00:00:00 2001 From: Simon Li Date: Mon, 25 Sep 2023 22:37:20 +0100 Subject: [PATCH 11/32] Add binder.curvenote.dev hub.binder.curvenote.dev --- config/curvenote.yaml | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/config/curvenote.yaml b/config/curvenote.yaml index e108433ae..ee50e8e8d 100644 --- a/config/curvenote.yaml +++ b/config/curvenote.yaml @@ -8,8 +8,7 @@ projectName: curvenote binderhub: config: BinderHub: - # hub_url: https://hub.curvenote.mybinder.org - hub_url: https://hub.3.13.147.101.nip.io + hub_url: https://hub.binder.curvenote.dev hub_url_local: http://proxy-public badge_base_url: https://mybinder.org # build_node_selector: @@ -154,8 +153,7 @@ binderhub: ingress: hosts: - # - curvenote.mybinder.org - - 3.13.147.101.nip.io + - binder.curvenote.dev jupyterhub: # singleuser: @@ -201,13 +199,11 @@ binderhub: cpu: "1" ingress: hosts: - # - hub.curvenote.mybinder.org - - hub.3.13.147.101.nip.io + - hub.binder.curvenote.dev tls: - secretName: kubelego-tls-hub hosts: - # - hub.curvenote.mybinder.org - - hub.3.13.147.101.nip.io + - hub.binder.curvenote.dev 
scheduling: userPlaceholder: enabled: false @@ -270,14 +266,13 @@ prometheus: ingress-nginx: controller: service: - # loadBalancerIP: 162.19.17.37 annotations: service.beta.kubernetes.io/aws-load-balancer-scheme: "internet-facing" static: ingress: hosts: - # - static.curvenote.mybinder.org + # - static.binder.curvenote.dev - static.3.13.147.101.nip.io minesweeper: From 767cdbb856802e193b9a28bf0dfe5f6cd7ac0228 Mon Sep 17 00:00:00 2001 From: Simon Li Date: Sat, 7 Oct 2023 13:41:19 +0100 Subject: [PATCH 12/32] Update analyticsPublisher and minesweeper images --- config/curvenote.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/config/curvenote.yaml b/config/curvenote.yaml index ee50e8e8d..c4d13a369 100644 --- a/config/curvenote.yaml +++ b/config/curvenote.yaml @@ -275,10 +275,15 @@ static: # - static.binder.curvenote.dev - static.3.13.147.101.nip.io +analyticsPublisher: + image: + # name: jupyterhub/mybinder.org-analytics-publisher + tag: 2020.12.4-0.dev.git.5220.hdf4d139f + minesweeper: # Requires secrets enabled: false - image: jupyterhub/mybinder.org-minesweeper:2020.12.4-0.dev.git.5080.hf35cc80d + image: jupyterhub/mybinder.org-minesweeper:2020.12.4-0.dev.git.5220.hbe9f3f64 binderhub-container-registry-helper: enabled: true From 66eb7e5b7b214d9d28f3c36f70eb16f91672fae5 Mon Sep 17 00:00:00 2001 From: Simon Li Date: Sat, 7 Oct 2023 13:41:55 +0100 Subject: [PATCH 13/32] Add notes/config on AWS EKS VPC CNI --- .prettierignore | 1 + terraform/aws/curvenote/cni/README.md | 20 + .../curvenote/cni/aws-k8s-cni-us-east-2.yaml | 564 ++++++++++++++++++ 3 files changed, 585 insertions(+) create mode 100644 terraform/aws/curvenote/cni/README.md create mode 100644 terraform/aws/curvenote/cni/aws-k8s-cni-us-east-2.yaml diff --git a/.prettierignore b/.prettierignore index 9df8843e7..0f7ad7494 100644 --- a/.prettierignore +++ b/.prettierignore @@ -1 +1,2 @@ mybinder/templates/ +terraform/aws/curvenote/cni/ diff --git 
a/terraform/aws/curvenote/cni/README.md b/terraform/aws/curvenote/cni/README.md new file mode 100644 index 000000000..0f4f175e9 --- /dev/null +++ b/terraform/aws/curvenote/cni/README.md @@ -0,0 +1,20 @@ +# Enable NetworkPolicies on EKS + +EKS automatically installs the VPC CNI plugin, but by default NetworkPolicies are not enabled. + +1. Find the recommended version of the VPC CNI plugin + https://docs.aws.amazon.com/eks/latest/userguide/managing-vpc-cni.html +2. Download the VPC-CNI Kubernetes manifest, replacing `1.15.0` with the recommended version + ``` + curl -O https://raw.githubusercontent.com/aws/amazon-vpc-cni-k8s/v1.15.0/config/master/aws-k8s-cni.yaml + ``` +3. Edit `aws-k8s-cni.yaml`: + - Change all mentions of `us-west-2` to your region + - Update the manifest following the `kubectl` instructions in + https://docs.aws.amazon.com/eks/latest/userguide/cni-network-policy.html + - Add `enable-network-policy-controller: "true"` to the `amazon-vpc-cni` ConfigMap + - Set `--enable-network-policy=true` in the `aws-node` DaemonSet `aws-eks-nodeagent` container (the one running the `aws-network-policy-agent` image) +4. 
Apply: + ``` + kubectl apply -f cni/aws-k8s-cni.yaml + ``` diff --git a/terraform/aws/curvenote/cni/aws-k8s-cni-us-east-2.yaml b/terraform/aws/curvenote/cni/aws-k8s-cni-us-east-2.yaml new file mode 100644 index 000000000..f7886dc0d --- /dev/null +++ b/terraform/aws/curvenote/cni/aws-k8s-cni-us-east-2.yaml @@ -0,0 +1,564 @@ +--- +# Source: crds/customresourcedefinition.yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: eniconfigs.crd.k8s.amazonaws.com +spec: + scope: Cluster + group: crd.k8s.amazonaws.com + preserveUnknownFields: false + versions: + - name: v1alpha1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object + x-kubernetes-preserve-unknown-fields: true + names: + plural: eniconfigs + singular: eniconfig + kind: ENIConfig +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.11.3 + creationTimestamp: null + labels: + app.kubernetes.io/name: amazon-network-policy-controller-k8s + name: policyendpoints.networking.k8s.aws +spec: + group: networking.k8s.aws + names: + kind: PolicyEndpoint + listKind: PolicyEndpointList + plural: policyendpoints + singular: policyendpoint + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: PolicyEndpoint is the Schema for the policyendpoints API + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + description: PolicyEndpointSpec defines the desired state of PolicyEndpoint + properties: + egress: + description: Egress is the list of egress rules containing resolved + network addresses + items: + description: EndpointInfo defines the network endpoint information + for the policy ingress/egress + properties: + cidr: + description: CIDR is the network address(s) of the endpoint + type: string + except: + description: Except is the exceptions to the CIDR ranges mentioned + above. + items: + type: string + type: array + ports: + description: Ports is the list of ports + items: + description: Port contains information about the transport + port/protocol + properties: + endPort: + description: Endport specifies the port range port to + endPort port must be defined and an integer, endPort + > port + format: int32 + type: integer + port: + description: Port specifies the numerical port for the + protocol. If empty applies to all ports + format: int32 + type: integer + protocol: + default: TCP + description: Protocol specifies the transport protocol, + default TCP + type: string + type: object + type: array + required: + - cidr + type: object + type: array + ingress: + description: Ingress is the list of ingress rules containing resolved + network addresses + items: + description: EndpointInfo defines the network endpoint information + for the policy ingress/egress + properties: + cidr: + description: CIDR is the network address(s) of the endpoint + type: string + except: + description: Except is the exceptions to the CIDR ranges mentioned + above. 
+ items: + type: string + type: array + ports: + description: Ports is the list of ports + items: + description: Port contains information about the transport + port/protocol + properties: + endPort: + description: Endport specifies the port range port to + endPort port must be defined and an integer, endPort + > port + format: int32 + type: integer + port: + description: Port specifies the numerical port for the + protocol. If empty applies to all ports + format: int32 + type: integer + protocol: + default: TCP + description: Protocol specifies the transport protocol, + default TCP + type: string + type: object + type: array + required: + - cidr + type: object + type: array + podIsolation: + description: PodIsolation specifies whether the pod needs to be isolated + for a particular traffic direction Ingress or Egress, or both. If + default isolation is not specified, and there are no ingress/egress + rules, then the pod is not isolated from the point of view of this + policy. This follows the NetworkPolicy spec.PolicyTypes. + items: + description: PolicyType string describes the NetworkPolicy type + This type is beta-level in 1.8 + type: string + type: array + podSelector: + description: PodSelector is the podSelector from the policy resource + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement is a selector that + contains values, a key, and an operator that relates the key + and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: operator represents a key's relationship to + a set of values. Valid operators are In, NotIn, Exists + and DoesNotExist. + type: string + values: + description: values is an array of string values. If the + operator is In or NotIn, the values array must be non-empty. 
+ If the operator is Exists or DoesNotExist, the values + array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. A single + {key,value} in the matchLabels map is equivalent to an element + of matchExpressions, whose key field is "key", the operator + is "In", and the values array contains only "value". The requirements + are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + podSelectorEndpoints: + description: PodSelectorEndpoints contains information about the pods + matching the podSelector + items: + description: PodEndpoint defines the summary information for the + pods + properties: + hostIP: + description: HostIP is the IP address of the host the pod is + currently running on + type: string + name: + description: Name is the pod name + type: string + namespace: + description: Namespace is the pod namespace + type: string + podIP: + description: PodIP is the IP address of the pod + type: string + required: + - hostIP + - name + - namespace + - podIP + type: object + type: array + policyRef: + description: PolicyRef is a reference to the Kubernetes NetworkPolicy + resource. 
+ properties: + name: + description: Name is the name of the Policy + type: string + namespace: + description: Namespace is the namespace of the Policy + type: string + required: + - name + - namespace + type: object + required: + - policyRef + type: object + status: + description: PolicyEndpointStatus defines the observed state of PolicyEndpoint + type: object + type: object + served: true + storage: true + subresources: + status: {} +--- +# Source: aws-vpc-cni/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: aws-node + namespace: kube-system + labels: + app.kubernetes.io/name: aws-node + app.kubernetes.io/instance: aws-vpc-cni + k8s-app: aws-node + app.kubernetes.io/version: "v1.15.0" +--- +# Source: aws-vpc-cni/templates/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: amazon-vpc-cni + namespace: kube-system + labels: + app.kubernetes.io/name: aws-node + app.kubernetes.io/instance: aws-vpc-cni + k8s-app: aws-node + app.kubernetes.io/version: "v1.15.0" +data: + enable-windows-ipam: "false" + enable-network-policy-controller: "true" +--- +# Source: aws-vpc-cni/templates/clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: aws-node + labels: + app.kubernetes.io/name: aws-node + app.kubernetes.io/instance: aws-vpc-cni + k8s-app: aws-node + app.kubernetes.io/version: "v1.15.0" +rules: + - apiGroups: + - crd.k8s.amazonaws.com + resources: + - eniconfigs + verbs: ["list", "watch", "get"] + - apiGroups: [""] + resources: + - namespaces + verbs: ["list", "watch", "get"] + - apiGroups: [""] + resources: + - pods + verbs: ["list", "watch", "get"] + - apiGroups: [""] + resources: + - nodes + verbs: ["list", "watch", "get"] + - apiGroups: ["", "events.k8s.io"] + resources: + - events + verbs: ["create", "patch", "list"] + - apiGroups: ["networking.k8s.aws"] + resources: + - policyendpoints + verbs: ["get", "list", "watch"] + - apiGroups: ["networking.k8s.aws"] + resources: + - 
policyendpoints/status + verbs: ["get"] + - apiGroups: + - vpcresources.k8s.aws + resources: + - cninodes + verbs: ["get", "list", "patch"] +--- +# Source: aws-vpc-cni/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: aws-node + labels: + app.kubernetes.io/name: aws-node + app.kubernetes.io/instance: aws-vpc-cni + k8s-app: aws-node + app.kubernetes.io/version: "v1.15.0" +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: aws-node +subjects: + - kind: ServiceAccount + name: aws-node + namespace: kube-system +--- +# Source: aws-vpc-cni/templates/daemonset.yaml +kind: DaemonSet +apiVersion: apps/v1 +metadata: + name: aws-node + namespace: kube-system + labels: + app.kubernetes.io/name: aws-node + app.kubernetes.io/instance: aws-vpc-cni + k8s-app: aws-node + app.kubernetes.io/version: "v1.15.0" +spec: + updateStrategy: + rollingUpdate: + maxUnavailable: 10% + type: RollingUpdate + selector: + matchLabels: + k8s-app: aws-node + template: + metadata: + labels: + app.kubernetes.io/name: aws-node + app.kubernetes.io/instance: aws-vpc-cni + k8s-app: aws-node + spec: + priorityClassName: "system-node-critical" + serviceAccountName: aws-node + hostNetwork: true + initContainers: + - name: aws-vpc-cni-init + image: "602401143452.dkr.ecr.us-east-2.amazonaws.com/amazon-k8s-cni-init:v1.15.0" + env: + - name: DISABLE_TCP_EARLY_DEMUX + value: "false" + - name: ENABLE_IPv6 + value: "false" + securityContext: + privileged: true + resources: + requests: + cpu: 25m + volumeMounts: + - mountPath: /host/opt/cni/bin + name: cni-bin-dir + terminationGracePeriodSeconds: 10 + tolerations: + - operator: Exists + securityContext: + {} + containers: + - name: aws-node + image: "602401143452.dkr.ecr.us-east-2.amazonaws.com/amazon-k8s-cni:v1.15.0" + ports: + - containerPort: 61678 + name: metrics + livenessProbe: + exec: + command: + - /app/grpc-health-probe + - -addr=:50051 + - -connect-timeout=5s + - 
-rpc-timeout=5s + initialDelaySeconds: 60 + timeoutSeconds: 10 + readinessProbe: + exec: + command: + - /app/grpc-health-probe + - -addr=:50051 + - -connect-timeout=5s + - -rpc-timeout=5s + initialDelaySeconds: 1 + timeoutSeconds: 10 + env: + - name: ADDITIONAL_ENI_TAGS + value: "{}" + - name: AWS_VPC_CNI_NODE_PORT_SUPPORT + value: "true" + - name: AWS_VPC_ENI_MTU + value: "9001" + - name: AWS_VPC_K8S_CNI_CUSTOM_NETWORK_CFG + value: "false" + - name: AWS_VPC_K8S_CNI_EXTERNALSNAT + value: "false" + - name: AWS_VPC_K8S_CNI_LOGLEVEL + value: "DEBUG" + - name: AWS_VPC_K8S_CNI_LOG_FILE + value: "/host/var/log/aws-routed-eni/ipamd.log" + - name: AWS_VPC_K8S_CNI_RANDOMIZESNAT + value: "prng" + - name: AWS_VPC_K8S_CNI_VETHPREFIX + value: "eni" + - name: AWS_VPC_K8S_PLUGIN_LOG_FILE + value: "/var/log/aws-routed-eni/plugin.log" + - name: AWS_VPC_K8S_PLUGIN_LOG_LEVEL + value: "DEBUG" + - name: DISABLE_INTROSPECTION + value: "false" + - name: DISABLE_METRICS + value: "false" + - name: DISABLE_NETWORK_RESOURCE_PROVISIONING + value: "false" + - name: ENABLE_IPv4 + value: "true" + - name: ENABLE_IPv6 + value: "false" + - name: ENABLE_POD_ENI + value: "false" + - name: ENABLE_PREFIX_DELEGATION + value: "false" + - name: WARM_ENI_TARGET + value: "1" + - name: WARM_PREFIX_TARGET + value: "1" + - name: MY_NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + - name: MY_POD_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.name + resources: + requests: + cpu: 25m + securityContext: + capabilities: + add: + - NET_ADMIN + - NET_RAW + volumeMounts: + - mountPath: /host/opt/cni/bin + name: cni-bin-dir + - mountPath: /host/etc/cni/net.d + name: cni-net-dir + - mountPath: /host/var/log/aws-routed-eni + name: log-dir + - mountPath: /var/run/aws-node + name: run-dir + - mountPath: /run/xtables.lock + name: xtables-lock + - name: aws-eks-nodeagent + image: "602401143452.dkr.ecr.us-east-2.amazonaws.com/amazon/aws-network-policy-agent:v1.0.2" + env: 
+ - name: MY_NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + args: + - --enable-ipv6=false + - --enable-network-policy=true + - --enable-cloudwatch-logs=false + - --metrics-bind-addr=:8162 + - --health-probe-bind-addr=:8163 + resources: + requests: + cpu: 25m + securityContext: + capabilities: + add: + - NET_ADMIN + privileged: true + volumeMounts: + - mountPath: /host/opt/cni/bin + name: cni-bin-dir + - mountPath: /sys/fs/bpf + name: bpf-pin-path + - mountPath: /var/log/aws-routed-eni + name: log-dir + - mountPath: /var/run/aws-node + name: run-dir + volumes: + - name: bpf-pin-path + hostPath: + path: /sys/fs/bpf + - name: cni-bin-dir + hostPath: + path: /opt/cni/bin + - name: cni-net-dir + hostPath: + path: /etc/cni/net.d + - name: log-dir + hostPath: + path: /var/log/aws-routed-eni + type: DirectoryOrCreate + - name: run-dir + hostPath: + path: /var/run/aws-node + type: DirectoryOrCreate + - name: xtables-lock + hostPath: + path: /run/xtables.lock + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/os + operator: In + values: + - linux + - key: kubernetes.io/arch + operator: In + values: + - amd64 + - arm64 + - key: eks.amazonaws.com/compute-type + operator: NotIn + values: + - fargate From 4eda5165b3bcdd879abcfdb5a97e885907539f1e Mon Sep 17 00:00:00 2001 From: Simon Li Date: Mon, 2 Oct 2023 22:08:43 +0100 Subject: [PATCH 14/32] curvenote: add notes on testing network policies --- terraform/aws/curvenote/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/terraform/aws/curvenote/README.md b/terraform/aws/curvenote/README.md index c0edd26ec..fb706935b 100644 --- a/terraform/aws/curvenote/README.md +++ b/terraform/aws/curvenote/README.md @@ -146,3 +146,12 @@ All access to the Kubernetes cluster is managed using [GitHub OIDC](https://docs AWS secret tokens are not required. 
AWS API access for BinderHub components, for example to ECR, is managed using [IRSA](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html). + +## Testing network policies + +``` +git clone https://github.com/jupyterhub/action-k3s-helm +helm upgrade --install netpol -ntest --create-namespace ./action-k3s-helm/test-netpol-enforcement/ +helm test -ntest netpol +helm delete -ntest netpol +``` From 0a17b2d18976b624feba46f578aa9a9a90b32772 Mon Sep 17 00:00:00 2001 From: Simon Li Date: Sun, 8 Oct 2023 13:34:32 +0100 Subject: [PATCH 15/32] binderhub-container-registry-helper 0.2.2 --- mybinder/Chart.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mybinder/Chart.yaml b/mybinder/Chart.yaml index 39e509a25..e7c1ffd5c 100644 --- a/mybinder/Chart.yaml +++ b/mybinder/Chart.yaml @@ -67,6 +67,6 @@ dependencies: # Registry helper, used to create container repositories before pushing and to # fetch dynamic registry credentials - name: binderhub-container-registry-helper - version: 0.2.1 - repository: oci://ghcr.io/manics/oci-helm-charts + version: 0.2.2 + repository: oci://quay.io/manics/helm-charts condition: binderhub-container-registry-helper.enabled From 12617a15d825d9373e4f5a8719639a448bb0e59d Mon Sep 17 00:00:00 2001 From: Simon Li Date: Sun, 8 Oct 2023 13:34:52 +0100 Subject: [PATCH 16/32] Add diff option to deploy.py --- deploy.py | 147 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 101 insertions(+), 46 deletions(-) diff --git a/deploy.py b/deploy.py index 1c2e0c0b6..9fac1a747 100755 --- a/deploy.py +++ b/deploy.py @@ -10,11 +10,12 @@ # Color codes for colored output! 
if os.environ.get("TERM"): BOLD = subprocess.check_output(["tput", "bold"]).decode() + RED = subprocess.check_output(["tput", "setaf", "1"]).decode() GREEN = subprocess.check_output(["tput", "setaf", "2"]).decode() NC = subprocess.check_output(["tput", "sgr0"]).decode() else: # no term, no colors - BOLD = GREEN = NC = "" + BOLD = RED = GREEN = NC = "" HERE = os.path.dirname(__file__) ABSOLUTE_HERE = os.path.dirname(os.path.realpath(__file__)) @@ -195,7 +196,7 @@ def get_config_files(release, config_dir="config"): return config_files -def deploy(release, name=None, dry_run=False): +def deploy(release, name=None, dry_run=False, diff=False): """Deploys a federation member to a k8s cluster. Waits for deployments and daemonsets to become Ready @@ -203,17 +204,30 @@ def deploy(release, name=None, dry_run=False): if not name: name = release + if diff: + helm_commands = [ + "diff", + "upgrade", + "--install", + ] + else: + helm_commands = [ + "upgrade", + "--install", + "--cleanup-on-fail", + "--create-namespace", + ] + print(BOLD + GREEN + f"Starting helm upgrade for {release}" + NC, flush=True) - helm = [ - "helm", - "upgrade", - "--install", - "--cleanup-on-fail", - "--create-namespace", - f"--namespace={name}", - name, - "mybinder", - ] + helm = ( + ["helm"] + + helm_commands + + [ + f"--namespace={name}", + name, + "mybinder", + ] + ) config_files = get_config_files(release) # add config files to helm command @@ -222,10 +236,12 @@ def deploy(release, name=None, dry_run=False): check_call(helm, dry_run) print( - BOLD + GREEN + f"SUCCESS: Helm upgrade for {release} completed" + NC, flush=True + BOLD + GREEN + f"SUCCESS: Helm {helm_commands[0]} for {release} completed" + NC, + flush=True, ) - wait_for_deployments_daemonsets(name, dry_run) + if not diff: + wait_for_deployments_daemonsets(name, dry_run) def wait_for_deployments_daemonsets(name, dry_run=False): @@ -269,7 +285,7 @@ def wait_for_deployments_daemonsets(name, dry_run=False): ) -def 
setup_certmanager(dry_run=False): +def setup_certmanager(dry_run=False, diff=False): """ Install cert-manager separately into its own namespace and `kubectl apply` its CRDs each time as helm won't attempt to handle changes to CRD resources. @@ -286,32 +302,50 @@ def setup_certmanager(dry_run=False): manifest_url = f"https://github.com/jetstack/cert-manager/releases/download/{version}/cert-manager.crds.yaml" print(BOLD + GREEN + f"Installing cert-manager CRDs {version}" + NC, flush=True) + if diff: + kubectl_commands = ["diff"] + helm_commands = [ + "diff", + "upgrade", + "--install", + ] + else: + kubectl_commands = ["apply"] + helm_commands = [ + "upgrade", + "--install", + "--create-namespace", + ] + # Sometimes 'replace' is needed for upgrade (e.g. 1.1->1.2) - check_call(["kubectl", "apply", "-f", manifest_url], dry_run) + check_call(["kubectl"] + kubectl_commands + ["-f", manifest_url], dry_run) print(BOLD + GREEN + f"Installing cert-manager {version}" + NC, flush=True) - helm_upgrade = [ - "helm", - "upgrade", - "--install", - "--create-namespace", - "--namespace=cert-manager", - "--repo=https://charts.jetstack.io", - "cert-manager", - "cert-manager", - f"--version={version}", - "--values=config/cert-manager.yaml", - ] + helm_upgrade = ( + ["helm"] + + helm_commands + + [ + "--namespace=cert-manager", + "--repo=https://charts.jetstack.io", + "cert-manager", + "cert-manager", + f"--version={version}", + "--values=config/cert-manager.yaml", + ] + ) check_call(helm_upgrade, dry_run) -def patch_coredns(dry_run=False): +def patch_coredns(dry_run=False, diff=False): """Patch coredns resource allocation OVH2 coredns does not have sufficient memory by default after our ban patches """ print(BOLD + GREEN + "Patching coredns resources" + NC, flush=True) + if diff: + print(BOLD + RED + "diff not supported" + NC, flush=True) + return check_call( [ "kubectl", @@ -329,7 +363,7 @@ def patch_coredns(dry_run=False): ) -def deploy_kube_system_charts(release, name=None, 
dry_run=False): +def deploy_kube_system_charts(release, name=None, dry_run=False, diff=False): """ Some charts must be deployed into the kube-system namespace """ @@ -343,25 +377,41 @@ def deploy_kube_system_charts(release, name=None, dry_run=False): return print(BOLD + GREEN + f"Starting helm upgrade for {log_name}" + NC, flush=True) - helm = [ - "helm", - "upgrade", - "--install", - "--cleanup-on-fail", - "--namespace=kube-system", - name, - "mybinder-kube-system", - ] + if diff: + helm_commands = [ + "diff", + "upgrade", + "--install", + ] + else: + helm_commands = [ + "upgrade", + "--install", + "--cleanup-on-fail", + ] + helm = ( + ["helm"] + + helm_commands + + [ + "--namespace=kube-system", + name, + "mybinder-kube-system", + ] + ) for config_file in config_files: helm.extend(["-f", config_file]) check_call(helm, dry_run) print( - BOLD + GREEN + f"SUCCESS: Helm upgrade for {log_name} completed" + NC, + BOLD + + GREEN + + f"SUCCESS: Helm {helm_commands[0]} for {log_name} completed" + + NC, flush=True, ) - wait_for_deployments_daemonsets("kube-system", dry_run) + if not diff: + wait_for_deployments_daemonsets("kube-system", dry_run) def main(): @@ -398,6 +448,11 @@ def main(): action="store_true", help="Print commands, but don't run them", ) + argparser.add_argument( + "--diff", + action="store_true", + help="Run helm/kubectl diff (plugins must be installed), do not make any changes", + ) stages = ["all", "auth", "networkbans", "kubesystem", "certmanager", "mybinder"] argparser.add_argument( "--stage", @@ -442,7 +497,7 @@ def main(): if args.stage in ("all", "auth"): if cluster.startswith("ovh"): setup_auth_ovh(args.release, cluster, args.dry_run) - patch_coredns(args.dry_run) + patch_coredns(args.dry_run, args.diff) elif cluster in AZURE_RGs: setup_auth_azure(cluster, args.dry_run) elif cluster in GCP_PROJECTS: @@ -453,13 +508,13 @@ def main(): raise Exception("Cloud cluster not recognised!") if args.stage in ("all", "networkban"): - 
update_networkbans(cluster, args.dry_run) + update_networkbans(cluster, args.dry_run, args.diff) if args.stage in ("all", "kubesystem"): - deploy_kube_system_charts(args.release, args.name, args.dry_run) + deploy_kube_system_charts(args.release, args.name, args.dry_run, args.diff) if args.stage in ("all", "certmanager"): - setup_certmanager(args.dry_run) + setup_certmanager(args.dry_run, args.diff) if args.stage in ("all", "mybinder"): - deploy(args.release, args.name, args.dry_run) + deploy(args.release, args.name, args.dry_run, args.diff) if __name__ == "__main__": From 20c3b38858fe0bcd8258451007e2b573aa9c8657 Mon Sep 17 00:00:00 2001 From: Simon Li Date: Thu, 5 Oct 2023 20:27:38 +0100 Subject: [PATCH 17/32] AWS EKS defaults to running DNS on 172.20.0.10 --- mybinder/templates/netpol.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mybinder/templates/netpol.yaml b/mybinder/templates/netpol.yaml index b9cb73001..b2ab2ab92 100644 --- a/mybinder/templates/netpol.yaml +++ b/mybinder/templates/netpol.yaml @@ -39,6 +39,9 @@ spec: to: - ipBlock: cidr: 10.0.0.0/8 + - ipBlock: + # AWS EKS defaults to running DNS on 172.20.0.10 + cidr: 172.20.0.10/32 # allow access to the world, # but not the cluster - ports: From c235f3e29675bbc59ce1a266f9e5255889f4bf8a Mon Sep 17 00:00:00 2001 From: Simon Li Date: Fri, 6 Oct 2023 22:35:59 +0100 Subject: [PATCH 18/32] Curvenote: workarounds for AWS EKS VPC CNI network policy implementation --- config/curvenote.yaml | 54 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/config/curvenote.yaml b/config/curvenote.yaml index c4d13a369..ca155c531 100644 --- a/config/curvenote.yaml +++ b/config/curvenote.yaml @@ -169,6 +169,23 @@ binderhub: extra_pod_config: enableServiceLinks: false image_pull_policy: Always + networkPolicy: + # enabled: false + ingress: + # AWS VPC CNI only works if the name of the service port name is the same as + # the name of the pod port and the port number is the same + # 
https://docs.aws.amazon.com/eks/latest/userguide/cni-network-policy.html#cni-network-policy-considerations + - from: + - podSelector: + matchLabels: + hub.jupyter.org/network-access-hub: "true" + # For unknown reasons the hub <-> notebook traffic is partially blocked if + # this is included: + # ports: + # # service/hub port name is "hub" + # # pod/hub port name is "http" + # - port: 8081 + # protocol: TCP singleuser: initContainers: @@ -188,6 +205,20 @@ binderhub: capabilities: add: - NET_ADMIN + networkPolicy: + ingress: + # AWS VPC CNI only works if the name of the service port name is the same as + # the name of the pod port and the port number is the same + # https://docs.aws.amazon.com/eks/latest/userguide/cni-network-policy.html#cni-network-policy-considerations + - from: + - podSelector: + matchLabels: + hub.jupyter.org/network-access-singleuser: "true" + ports: + # proxy/pod port name is "notebook-port" + # I've no idea why that doesn't work + - port: 8888 + protocol: TCP proxy: chp: @@ -197,6 +228,29 @@ binderhub: cpu: "1" limits: cpu: "1" + networkPolicy: + ingress: + # AWS VPC CNI only works if the name of the service port name is the same as + # the name of the pod port and the port number is the same + # https://docs.aws.amazon.com/eks/latest/userguide/cni-network-policy.html#cni-network-policy-considerations + - from: + - podSelector: + matchLabels: + hub.jupyter.org/network-access-proxy-api: "true" + ports: + # service/proxy-api port doesn't have a name + # proxy/pod port name is "api" + - port: 8001 + protocol: TCP + - from: + ports: + # service/proxy-public port is 80 + # proxy/pod port is 8000 + - port: 8000 + protocol: TCP + - port: 80 + protocol: TCP + ingress: hosts: - hub.binder.curvenote.dev From f222b83ddace148f40bf003d60eeb662364de8ec Mon Sep 17 00:00:00 2001 From: Simon Li Date: Fri, 6 Oct 2023 22:39:08 +0100 Subject: [PATCH 19/32] Add priority classes for core BinderHub pods --- config/curvenote.yaml | 7 +++++++ 
mybinder/templates/priorityclass.yaml | 11 +++++++++++ mybinder/values.yaml | 4 ++++ 3 files changed, 22 insertions(+) create mode 100644 mybinder/templates/priorityclass.yaml diff --git a/config/curvenote.yaml b/config/curvenote.yaml index ca155c531..f61781277 100644 --- a/config/curvenote.yaml +++ b/config/curvenote.yaml @@ -169,6 +169,8 @@ binderhub: extra_pod_config: enableServiceLinks: false image_pull_policy: Always + extraPodSpec: + priorityClassName: binderhub-core networkPolicy: # enabled: false ingress: @@ -228,6 +230,8 @@ binderhub: cpu: "1" limits: cpu: "1" + extraPodSpec: + priorityClassName: binderhub-core networkPolicy: ingress: # AWS VPC CNI only works if the name of the service port name is the same as @@ -339,6 +343,9 @@ minesweeper: enabled: false image: jupyterhub/mybinder.org-minesweeper:2020.12.4-0.dev.git.5220.hbe9f3f64 +priorityClasses: + binderhub-core: 10000 + binderhub-container-registry-helper: enabled: true auth_token: secret-token-use-existing-secret-instead diff --git a/mybinder/templates/priorityclass.yaml b/mybinder/templates/priorityclass.yaml new file mode 100644 index 000000000..26e81ce8f --- /dev/null +++ b/mybinder/templates/priorityclass.yaml @@ -0,0 +1,11 @@ +{{- range $name, $priority := .Values.priorityClasses -}} +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: {{ $name }} + labels: + heritage: {{ $.Release.Service }} + release: {{ $.Release.Name }} +value: {{ $priority }} +globalDefault: false +{{- end }} diff --git a/mybinder/values.yaml b/mybinder/values.yaml index 9962a60ad..058f69968 100644 --- a/mybinder/values.yaml +++ b/mybinder/values.yaml @@ -601,6 +601,10 @@ minesweeper: cluster-autoscaler: enabled: false +# Name:Priority pairs of priority classes to create +# https://kubernetes.io/blog/2023/01/12/protect-mission-critical-pods-priorityclass/ +priorityClasses: {} + binderhub-container-registry-helper: enabled: false From abcd1f0b534e50cf986c6b86812a7c8f4c92cb65 Mon Sep 17 00:00:00 2001 
From: Simon Li Date: Fri, 6 Oct 2023 22:43:17 +0100 Subject: [PATCH 20/32] static.binder.curvenote.dev --- config/curvenote.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/config/curvenote.yaml b/config/curvenote.yaml index f61781277..f236036a7 100644 --- a/config/curvenote.yaml +++ b/config/curvenote.yaml @@ -330,8 +330,7 @@ ingress-nginx: static: ingress: hosts: - # - static.binder.curvenote.dev - - static.3.13.147.101.nip.io + - static.binder.curvenote.dev analyticsPublisher: image: From 7973a0f57dcdc4418b9c07101ea9935e54c49f81 Mon Sep 17 00:00:00 2001 From: Simon Li Date: Fri, 6 Oct 2023 23:42:21 +0100 Subject: [PATCH 21/32] Enable minesweeper cryptnono --- config/curvenote.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/curvenote.yaml b/config/curvenote.yaml index f236036a7..5ba1bac8e 100644 --- a/config/curvenote.yaml +++ b/config/curvenote.yaml @@ -280,7 +280,7 @@ binderhub: imageGCThresholdType: "absolute" cryptnono: - enabled: false + enabled: true grafana: enabled: false @@ -339,7 +339,7 @@ analyticsPublisher: minesweeper: # Requires secrets - enabled: false + enabled: true image: jupyterhub/mybinder.org-minesweeper:2020.12.4-0.dev.git.5220.hbe9f3f64 priorityClasses: From c5f3c642dc89a85c78546f3c5d001eabbc5ec402 Mon Sep 17 00:00:00 2001 From: Simon Li Date: Fri, 6 Oct 2023 23:42:34 +0100 Subject: [PATCH 22/32] prometheus --- config/curvenote.yaml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/config/curvenote.yaml b/config/curvenote.yaml index 5ba1bac8e..92d7dae8c 100644 --- a/config/curvenote.yaml +++ b/config/curvenote.yaml @@ -303,22 +303,19 @@ grafana: access: direct isDefault: true editable: false - persistence: - storageClassName: csi-cinder-high-speed prometheus: - enabled: false + enabled: true server: - # nodeSelector: *coreNodeSelector persistentVolume: size: 50Gi retention: 30d ingress: hosts: - # - prometheus.curvenote.mybinder.org + - 
prometheus.curvenote.mybinder.org tls: - hosts: - # - prometheus.curvenote.mybinder.org + - prometheus.curvenote.mybinder.org secretName: kubelego-tls-prometheus ingress-nginx: From 65b4d6d492a3525f616f47af6ddbebe19bd287cb Mon Sep 17 00:00:00 2001 From: Simon Li Date: Sat, 7 Oct 2023 00:15:45 +0100 Subject: [PATCH 23/32] Remove some commented config from curvenote --- config/curvenote.yaml | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/config/curvenote.yaml b/config/curvenote.yaml index 92d7dae8c..4466f0a88 100644 --- a/config/curvenote.yaml +++ b/config/curvenote.yaml @@ -1,18 +1,11 @@ projectName: curvenote -# userNodeSelector: &userNodeSelector -# mybinder.org/pool-type: users -# coreNodeSelector: &coreNodeSelector -# mybinder.org/pool-type: core - binderhub: config: BinderHub: hub_url: https://hub.binder.curvenote.dev hub_url_local: http://proxy-public badge_base_url: https://mybinder.org - # build_node_selector: - # mybinder.org/pool-type: builds sticky_builds: true image_prefix: 166088433508.dkr.ecr.us-east-2.amazonaws.com/binderhub/ # log_level: DEBUG @@ -31,7 +24,6 @@ binderhub: password: "" replicas: 1 - # nodeSelector: *coreNodeSelector # extraVolumes: # - name: secrets @@ -156,10 +148,6 @@ binderhub: - binder.curvenote.dev jupyterhub: - # singleuser: - # nodeSelector: *userNodeSelector - # hub: - # nodeSelector: *coreNodeSelector hub: db: pvc: @@ -172,7 +160,6 @@ binderhub: extraPodSpec: priorityClassName: binderhub-core networkPolicy: - # enabled: false ingress: # AWS VPC CNI only works if the name of the service port name is the same as # the name of the pod port and the port number is the same @@ -224,7 +211,6 @@ binderhub: proxy: chp: - # nodeSelector: *coreNodeSelector resources: requests: cpu: "1" @@ -268,7 +254,6 @@ binderhub: replicas: 50 userScheduler: enabled: false - # nodeSelector: *coreNodeSelector cull: # maxAge: 1 hour since we're just testing maxAge: 3600 @@ -284,7 +269,6 @@ cryptnono: grafana: enabled: false - # 
nodeSelector: *coreNodeSelector ingress: hosts: # - grafana.curvenote.mybinder.org @@ -335,7 +319,6 @@ analyticsPublisher: tag: 2020.12.4-0.dev.git.5220.hdf4d139f minesweeper: - # Requires secrets enabled: true image: jupyterhub/mybinder.org-minesweeper:2020.12.4-0.dev.git.5220.hbe9f3f64 From a8bf805d992929f3a7358317274d3cbbdd2a8cce Mon Sep 17 00:00:00 2001 From: Simon Li Date: Sat, 7 Oct 2023 00:17:16 +0100 Subject: [PATCH 24/32] curvenote: use more defaults from values.yaml --- config/curvenote.yaml | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/config/curvenote.yaml b/config/curvenote.yaml index 4466f0a88..5f70d6d3d 100644 --- a/config/curvenote.yaml +++ b/config/curvenote.yaml @@ -23,8 +23,6 @@ binderhub: username: "" password: "" - replicas: 1 - # extraVolumes: # - name: secrets # secret: @@ -39,6 +37,7 @@ binderhub: extraConfig: 01-eventlog: | # Disabled until GOOGLE_APPLICATION_CREDENTIALS secret is available + # and secrets/events-archiver/curvenote.json is created 10-external-registry-helper: | # from binderhub.registry import ExternalRegistryHelper @@ -134,15 +133,6 @@ binderhub: c.ExternalRegistryHelper.service_url = "http://curvenote-binderhub-container-registry-helper:8080" c.ExternalRegistryHelper.auth_token = "secret-token-use-existing-secret-instead" - dind: - resources: - requests: - cpu: "4" - memory: 16Gi - limits: - cpu: "7" - memory: 24Gi - ingress: hosts: - binder.curvenote.dev @@ -211,11 +201,6 @@ binderhub: proxy: chp: - resources: - requests: - cpu: "1" - limits: - cpu: "1" extraPodSpec: priorityClassName: binderhub-core networkPolicy: @@ -251,7 +236,6 @@ binderhub: scheduling: userPlaceholder: enabled: false - replicas: 50 userScheduler: enabled: false cull: @@ -259,6 +243,7 @@ binderhub: maxAge: 3600 imageCleaner: + enabled: true # Use 40GB as upper limit, size is given in bytes imageGCThresholdHigh: 40e9 imageGCThresholdLow: 30e9 From a399120c68eb91983cbbd98b497f675a6124023a Mon Sep 17 00:00:00 
2001 From: Simon Li Date: Sat, 7 Oct 2023 13:33:24 +0100 Subject: [PATCH 25/32] Binderhub: get ECR helper token from curvenote-binderhub-container-registry-helper secret --- config/curvenote.yaml | 35 ++++++++++++++++++++++++++--------- mybinder/Chart.yaml | 2 +- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/config/curvenote.yaml b/config/curvenote.yaml index 5f70d6d3d..feb042ca6 100644 --- a/config/curvenote.yaml +++ b/config/curvenote.yaml @@ -18,6 +18,16 @@ binderhub: LaunchQuota: total_quota: 10 + ExternalRegistryHelper: + service_url: http://curvenote-binderhub-container-registry-helper:8080 + + extraEnv: + BINDERHUB_CONTAINER_REGISTRY_HELPER_AUTH_TOKEN: + valueFrom: + secretKeyRef: + name: curvenote-binderhub-container-registry-helper + key: auth_token + registry: url: 166088433508.dkr.ecr.us-east-2.amazonaws.com username: "" @@ -40,15 +50,14 @@ binderhub: # and secrets/events-archiver/curvenote.json is created 10-external-registry-helper: | - # from binderhub.registry import ExternalRegistryHelper import json + from os import getenv from tornado import httpclient from traitlets import Unicode from binderhub.registry import DockerRegistry class ExternalRegistryHelper(DockerRegistry): - service_url = Unicode( "http://binderhub-container-registry-helper:8080", allow_none=False, @@ -57,7 +66,7 @@ binderhub: ) auth_token = Unicode( - "secret-token", + getenv("BINDERHUB_CONTAINER_REGISTRY_HELPER_AUTH_TOKEN"), help="The auth token to use when accessing the registry helper micro-service.", config=True, ) @@ -117,6 +126,12 @@ binderhub: return None async def get_credentials(self, image, tag): + """ + Get the registry credentials for the given image and tag if supported + by the remote helper, otherwise returns None + + Returns a dictionary of login fields. 
+ """ token_url = f"/token/{image}:{tag}" self.log.debug(f"Getting registry token: {token_url}") token_json = None @@ -125,13 +140,16 @@ binderhub: except httpclient.HTTPError as e: if e.code != 404: raise - token = dict((k, v) for (k, v) in token_json.items() if k in ["username", "password", "registry"]) - self.log.debug(f"Returning registry token: {token}") + self.log.debug(f"Token: {*token_json.keys(),}") + token = dict( + (k, v) + for (k, v) in token_json.items() + if k in ["username", "password", "registry"] + ) return token + c.BinderHub.registry_class = ExternalRegistryHelper - c.ExternalRegistryHelper.service_url = "http://curvenote-binderhub-container-registry-helper:8080" - c.ExternalRegistryHelper.auth_token = "secret-token-use-existing-secret-instead" ingress: hosts: @@ -312,8 +330,7 @@ priorityClasses: binderhub-container-registry-helper: enabled: true - auth_token: secret-token-use-existing-secret-instead - # auth_existing_secret_name: + # auth_token: Autogenerated replicaCount: 2 serviceAccount: name: binderhub-container-registry-helper diff --git a/mybinder/Chart.yaml b/mybinder/Chart.yaml index e7c1ffd5c..2d2903b51 100644 --- a/mybinder/Chart.yaml +++ b/mybinder/Chart.yaml @@ -67,6 +67,6 @@ dependencies: # Registry helper, used to create container repositories before pushing and to # fetch dynamic registry credentials - name: binderhub-container-registry-helper - version: 0.2.2 + version: 0.2.3 repository: oci://quay.io/manics/helm-charts condition: binderhub-container-registry-helper.enabled From baab6b7fc2e23c1a133f3951f1f1956664032f63 Mon Sep 17 00:00:00 2001 From: Simon Li Date: Sat, 7 Oct 2023 20:48:10 +0100 Subject: [PATCH 26/32] quay.io rate-limits GH IPs, use GH pages instead --- mybinder/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mybinder/Chart.yaml b/mybinder/Chart.yaml index 2d2903b51..9764f1c9c 100644 --- a/mybinder/Chart.yaml +++ b/mybinder/Chart.yaml @@ -68,5 +68,5 @@ dependencies: # fetch dynamic 
registry credentials - name: binderhub-container-registry-helper version: 0.2.3 - repository: oci://quay.io/manics/helm-charts + repository: https://www.manicstreetpreacher.co.uk/binderhub-container-registry-helper/ condition: binderhub-container-registry-helper.enabled From 964e329bcc53461a9edc63911fbe23e994756f3c Mon Sep 17 00:00:00 2001 From: Simon Li Date: Sun, 8 Oct 2023 12:40:05 +0100 Subject: [PATCH 27/32] Fix priority class separator --- mybinder/templates/priorityclass.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mybinder/templates/priorityclass.yaml b/mybinder/templates/priorityclass.yaml index 26e81ce8f..2c3aa486c 100644 --- a/mybinder/templates/priorityclass.yaml +++ b/mybinder/templates/priorityclass.yaml @@ -1,4 +1,5 @@ -{{- range $name, $priority := .Values.priorityClasses -}} +{{- range $name, $priority := .Values.priorityClasses }} +--- apiVersion: scheduling.k8s.io/v1 kind: PriorityClass metadata: From 309d537c17688a97fe28d727d8ffd194539d0684 Mon Sep 17 00:00:00 2001 From: Simon Li Date: Sun, 8 Oct 2023 12:40:23 +0100 Subject: [PATCH 28/32] Remove duplicate properties, fix curvenote token config --- config/curvenote.yaml | 8 ++++---- .../templates/aws-ecr-registry-cleaner/deployment.yaml | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/config/curvenote.yaml b/config/curvenote.yaml index feb042ca6..698182bbf 100644 --- a/config/curvenote.yaml +++ b/config/curvenote.yaml @@ -87,8 +87,7 @@ binderhub: except httpclient.HTTPError as e: if e.code == 404: return None - else: - raise + raise async def get_image_manifest(self, image, tag): """ @@ -138,8 +137,9 @@ binderhub: try: token_json = await self._request(token_url, method="POST", body="") except httpclient.HTTPError as e: - if e.code != 404: - raise + if e.code == 404: + return None + raise self.log.debug(f"Token: {*token_json.keys(),}") token = dict( (k, v) diff --git a/mybinder/templates/aws-ecr-registry-cleaner/deployment.yaml 
b/mybinder/templates/aws-ecr-registry-cleaner/deployment.yaml index ed440f8f5..7a4ffda39 100644 --- a/mybinder/templates/aws-ecr-registry-cleaner/deployment.yaml +++ b/mybinder/templates/aws-ecr-registry-cleaner/deployment.yaml @@ -18,7 +18,6 @@ spec: app: aws-ecr-registry-cleaner component: aws-ecr-registry-cleaner release: {{ .Release.Name }} - replicas: 1 template: metadata: labels: From 59f3b871eb1ea82312449663a58841559ae722fa Mon Sep 17 00:00:00 2001 From: Simon Li Date: Sun, 8 Oct 2023 13:46:01 +0100 Subject: [PATCH 29/32] Fix domain suffix for prometheus curvenote --- config/curvenote.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/config/curvenote.yaml b/config/curvenote.yaml index 698182bbf..1854c84c5 100644 --- a/config/curvenote.yaml +++ b/config/curvenote.yaml @@ -274,10 +274,10 @@ grafana: enabled: false ingress: hosts: - # - grafana.curvenote.mybinder.org + # - grafana.binder.curvenote.dev tls: - hosts: - # - grafana.curvenote.mybinder.org + # - grafana.binder.curvenote.dev secretName: kubelego-tls-grafana datasources: datasources.yaml: @@ -286,7 +286,7 @@ grafana: - name: prometheus orgId: 1 type: prometheus - url: https://prometheus.curvenote.mybinder.org + url: https://prometheus.binder.curvenote.dev access: direct isDefault: true editable: false @@ -299,10 +299,10 @@ prometheus: retention: 30d ingress: hosts: - - prometheus.curvenote.mybinder.org + - prometheus.binder.curvenote.dev tls: - hosts: - - prometheus.curvenote.mybinder.org + - prometheus.binder.curvenote.dev secretName: kubelego-tls-prometheus ingress-nginx: From 1a1de381601fdb94536e3d31b857b75569ecb7d6 Mon Sep 17 00:00:00 2001 From: Simon Li Date: Sun, 8 Oct 2023 13:50:58 +0100 Subject: [PATCH 30/32] curvenote: Remove hard-coded images (used for manual deploy) --- config/curvenote.yaml | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/config/curvenote.yaml b/config/curvenote.yaml index 1854c84c5..d2b3ab591 100644 
--- a/config/curvenote.yaml +++ b/config/curvenote.yaml @@ -185,23 +185,6 @@ binderhub: # protocol: TCP singleuser: - initContainers: - - name: tc-init - image: jupyterhub/mybinder.org-tc-init:2020.12.4-0.dev.git.4289.h140cef52 - imagePullPolicy: IfNotPresent - env: - - name: WHITELIST_CIDR - value: 10.0.0.0/8 - - name: EGRESS_BANDWIDTH - value: 1mbit - securityContext: - # capabilities.add seems to be disabled - # by the `runAsUser: 1000` in the pod-level securityContext - # unless we explicitly run as root - runAsUser: 0 - capabilities: - add: - - NET_ADMIN networkPolicy: ingress: # AWS VPC CNI only works if the name of the service port name is the same as @@ -317,13 +300,10 @@ static: - static.binder.curvenote.dev analyticsPublisher: - image: - # name: jupyterhub/mybinder.org-analytics-publisher - tag: 2020.12.4-0.dev.git.5220.hdf4d139f + enabled: false minesweeper: enabled: true - image: jupyterhub/mybinder.org-minesweeper:2020.12.4-0.dev.git.5220.hbe9f3f64 priorityClasses: binderhub-core: 10000 From 5fc359156e8843a7d4ce7f157efcbbcfc3d21301 Mon Sep 17 00:00:00 2001 From: Simon Li Date: Sun, 8 Oct 2023 13:51:31 +0100 Subject: [PATCH 31/32] Revert "Add diff option to deploy.py" This reverts commit 12617a15d825d9373e4f5a8719639a448bb0e59d. --- deploy.py | 147 +++++++++++++++++------------------------------------- 1 file changed, 46 insertions(+), 101 deletions(-) diff --git a/deploy.py b/deploy.py index 9fac1a747..1c2e0c0b6 100755 --- a/deploy.py +++ b/deploy.py @@ -10,12 +10,11 @@ # Color codes for colored output! 
if os.environ.get("TERM"): BOLD = subprocess.check_output(["tput", "bold"]).decode() - RED = subprocess.check_output(["tput", "setaf", "1"]).decode() GREEN = subprocess.check_output(["tput", "setaf", "2"]).decode() NC = subprocess.check_output(["tput", "sgr0"]).decode() else: # no term, no colors - BOLD = RED = GREEN = NC = "" + BOLD = GREEN = NC = "" HERE = os.path.dirname(__file__) ABSOLUTE_HERE = os.path.dirname(os.path.realpath(__file__)) @@ -196,7 +195,7 @@ def get_config_files(release, config_dir="config"): return config_files -def deploy(release, name=None, dry_run=False, diff=False): +def deploy(release, name=None, dry_run=False): """Deploys a federation member to a k8s cluster. Waits for deployments and daemonsets to become Ready @@ -204,30 +203,17 @@ def deploy(release, name=None, dry_run=False, diff=False): if not name: name = release - if diff: - helm_commands = [ - "diff", - "upgrade", - "--install", - ] - else: - helm_commands = [ - "upgrade", - "--install", - "--cleanup-on-fail", - "--create-namespace", - ] - print(BOLD + GREEN + f"Starting helm upgrade for {release}" + NC, flush=True) - helm = ( - ["helm"] - + helm_commands - + [ - f"--namespace={name}", - name, - "mybinder", - ] - ) + helm = [ + "helm", + "upgrade", + "--install", + "--cleanup-on-fail", + "--create-namespace", + f"--namespace={name}", + name, + "mybinder", + ] config_files = get_config_files(release) # add config files to helm command @@ -236,12 +222,10 @@ def deploy(release, name=None, dry_run=False, diff=False): check_call(helm, dry_run) print( - BOLD + GREEN + f"SUCCESS: Helm {helm_commands[0]} for {release} completed" + NC, - flush=True, + BOLD + GREEN + f"SUCCESS: Helm upgrade for {release} completed" + NC, flush=True ) - if not diff: - wait_for_deployments_daemonsets(name, dry_run) + wait_for_deployments_daemonsets(name, dry_run) def wait_for_deployments_daemonsets(name, dry_run=False): @@ -285,7 +269,7 @@ def wait_for_deployments_daemonsets(name, dry_run=False): ) -def 
setup_certmanager(dry_run=False, diff=False): +def setup_certmanager(dry_run=False): """ Install cert-manager separately into its own namespace and `kubectl apply` its CRDs each time as helm won't attempt to handle changes to CRD resources. @@ -302,50 +286,32 @@ def setup_certmanager(dry_run=False, diff=False): manifest_url = f"https://github.com/jetstack/cert-manager/releases/download/{version}/cert-manager.crds.yaml" print(BOLD + GREEN + f"Installing cert-manager CRDs {version}" + NC, flush=True) - if diff: - kubectl_commands = ["diff"] - helm_commands = [ - "diff", - "upgrade", - "--install", - ] - else: - kubectl_commands = ["apply"] - helm_commands = [ - "upgrade", - "--install", - "--create-namespace", - ] - # Sometimes 'replace' is needed for upgrade (e.g. 1.1->1.2) - check_call(["kubectl"] + kubectl_commands + ["-f", manifest_url], dry_run) + check_call(["kubectl", "apply", "-f", manifest_url], dry_run) print(BOLD + GREEN + f"Installing cert-manager {version}" + NC, flush=True) - helm_upgrade = ( - ["helm"] - + helm_commands - + [ - "--namespace=cert-manager", - "--repo=https://charts.jetstack.io", - "cert-manager", - "cert-manager", - f"--version={version}", - "--values=config/cert-manager.yaml", - ] - ) + helm_upgrade = [ + "helm", + "upgrade", + "--install", + "--create-namespace", + "--namespace=cert-manager", + "--repo=https://charts.jetstack.io", + "cert-manager", + "cert-manager", + f"--version={version}", + "--values=config/cert-manager.yaml", + ] check_call(helm_upgrade, dry_run) -def patch_coredns(dry_run=False, diff=False): +def patch_coredns(dry_run=False): """Patch coredns resource allocation OVH2 coredns does not have sufficient memory by default after our ban patches """ print(BOLD + GREEN + "Patching coredns resources" + NC, flush=True) - if diff: - print(BOLD + RED + "diff not supported" + NC, flush=True) - return check_call( [ "kubectl", @@ -363,7 +329,7 @@ def patch_coredns(dry_run=False, diff=False): ) -def 
deploy_kube_system_charts(release, name=None, dry_run=False, diff=False): +def deploy_kube_system_charts(release, name=None, dry_run=False): """ Some charts must be deployed into the kube-system namespace """ @@ -377,41 +343,25 @@ def deploy_kube_system_charts(release, name=None, dry_run=False, diff=False): return print(BOLD + GREEN + f"Starting helm upgrade for {log_name}" + NC, flush=True) - if diff: - helm_commands = [ - "diff", - "upgrade", - "--install", - ] - else: - helm_commands = [ - "upgrade", - "--install", - "--cleanup-on-fail", - ] - helm = ( - ["helm"] - + helm_commands - + [ - "--namespace=kube-system", - name, - "mybinder-kube-system", - ] - ) + helm = [ + "helm", + "upgrade", + "--install", + "--cleanup-on-fail", + "--namespace=kube-system", + name, + "mybinder-kube-system", + ] for config_file in config_files: helm.extend(["-f", config_file]) check_call(helm, dry_run) print( - BOLD - + GREEN - + f"SUCCESS: Helm {helm_commands[0]} for {log_name} completed" - + NC, + BOLD + GREEN + f"SUCCESS: Helm upgrade for {log_name} completed" + NC, flush=True, ) - if not diff: - wait_for_deployments_daemonsets("kube-system", dry_run) + wait_for_deployments_daemonsets("kube-system", dry_run) def main(): @@ -448,11 +398,6 @@ def main(): action="store_true", help="Print commands, but don't run them", ) - argparser.add_argument( - "--diff", - action="store_true", - help="Run helm/kubectl diff (plugins must be installed), do not make any changes", - ) stages = ["all", "auth", "networkbans", "kubesystem", "certmanager", "mybinder"] argparser.add_argument( "--stage", @@ -497,7 +442,7 @@ def main(): if args.stage in ("all", "auth"): if cluster.startswith("ovh"): setup_auth_ovh(args.release, cluster, args.dry_run) - patch_coredns(args.dry_run, args.diff) + patch_coredns(args.dry_run) elif cluster in AZURE_RGs: setup_auth_azure(cluster, args.dry_run) elif cluster in GCP_PROJECTS: @@ -508,13 +453,13 @@ def main(): raise Exception("Cloud cluster not recognised!") if 
args.stage in ("all", "networkban"): - update_networkbans(cluster, args.dry_run, args.diff) + update_networkbans(cluster, args.dry_run) if args.stage in ("all", "kubesystem"): - deploy_kube_system_charts(args.release, args.name, args.dry_run, args.diff) + deploy_kube_system_charts(args.release, args.name, args.dry_run) if args.stage in ("all", "certmanager"): - setup_certmanager(args.dry_run, args.diff) + setup_certmanager(args.dry_run) if args.stage in ("all", "mybinder"): - deploy(args.release, args.name, args.dry_run, args.diff) + deploy(args.release, args.name, args.dry_run) if __name__ == "__main__": From 404277566c3a5ec1ff34c227f91ca680d59321f6 Mon Sep 17 00:00:00 2001 From: Simon Li Date: Sun, 8 Oct 2023 13:56:25 +0100 Subject: [PATCH 32/32] Remove unneeded section about testing netpol in curvenote readme --- terraform/aws/curvenote/README.md | 9 --------- 1 file changed, 9 deletions(-) diff --git a/terraform/aws/curvenote/README.md b/terraform/aws/curvenote/README.md index fb706935b..c0edd26ec 100644 --- a/terraform/aws/curvenote/README.md +++ b/terraform/aws/curvenote/README.md @@ -146,12 +146,3 @@ All access to the Kubernetes cluster is managed using [GitHub OIDC](https://docs AWS secret tokens are not required. AWS API access for BinderHub components, for example to ECR, is managed using [IRSA](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html). - -## Testing network policies - -``` -git clone https://github.com/jupyterhub/action-k3s-helm -helm upgrade --install netpol -ntest --create-namespace ./action-k3s-helm/test-netpol-enforcement/ -helm test -ntest netpol -helm delete -ntest netpol -```