diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 000000000..5afdf958f --- /dev/null +++ b/.editorconfig @@ -0,0 +1,8 @@ +root = true + +[ansible/**] +charset = utf-8 +end_of_line = lf +indent_size = 2 +indent_style = space +insert_final_newline = true diff --git a/.gitattributes b/.gitattributes index 2376958bf..9208e10ac 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,3 @@ **/secrets/** filter=git-crypt diff=git-crypt +# GitHub syntax highlighting +pixi.lock linguist-language=YAML linguist-generated=true diff --git a/.gitignore b/.gitignore index f62deeeb4..3f0fa6888 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,6 @@ travis/crypt-key env .terraform +# pixi environments +.pixi +*.egg-info diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 000000000..0f9b3fd11 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,150 @@ +variables: + GIT_STRATEGY: clone + GIT_CLEAN_FLAGS: "-ffdx" + +stages: + - lint + - deploy-acceptance-ansible + - deploy-acceptance-helm + - test-acceptance + - deploy-production-nginx + - deploy-production-helm + +.gesis-manual-web: + rules: + - if: $CI_SERVER_HOST == 'git.gesis.org' && $CI_PIPELINE_SOURCE == 'web' + when: manual + allow_failure: true + +.gesis-merge-request: + rules: + - if: $CI_SERVER_HOST == 'git.gesis.org' && $CI_PIPELINE_SOURCE == "merge_request_event" + changes: + - .gitlab-ci.yml + when: manual + - if: $CI_SERVER_HOST == 'git.gesis.org' && $CI_PIPELINE_SOURCE == "merge_request_event" + changes: + - ansible/**/* + - mybinder/**/* + - config/**/* + - secrets/**/* + +.gesis-push-main: + rules: + - if: $CI_SERVER_HOST == 'git.gesis.org' && $CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_BRANCH == 'main' + +.gesis-push-gesis: + rules: + - if: $CI_SERVER_HOST == 'git.gesis.org' && $CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_BRANCH == 'gesis' + +include: + - component: $CI_SERVER_FQDN/rse/docker/images/ansible/ansible-lint@10.2.6 + inputs: + stage: lint + dir: ansible + + - component: 
$CI_SERVER_FQDN/rse/docker/images/ansible/ansible-deploy@10.2.6 + inputs: + stage: deploy-acceptance-ansible + dir: ansible + inventory: gesis-acceptance + playbook: gesis.yml + ssh-user: ansible + ssh-key-type: ed25519 + rules: + - if: $CI_SERVER_HOST == "git.gesis.org" && $CI_PIPELINE_SOURCE == "merge_request_event" + - if: $CI_SERVER_HOST == 'git.gesis.org' && $CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_BRANCH == 'main' + - if: $CI_SERVER_HOST == 'git.gesis.org' && $CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_BRANCH == 'gesis' + +.gesis helm deploy: + image: + name: docker-private.gesis.intra/gesis/ilcm/orc2/k8s:latest + entrypoint: [""] + rules: + - if: $CI_SERVER_HOST == "git.gesis.org" + variables: + HELM_ENVIRONMENT: template + script: + - cat $git_crypt_secret_key | base64 -d > git_crypt_secret_key + - git-crypt unlock git_crypt_secret_key + - kubectl config use-context ${CI_PROJECT_PATH}:${HELM_ENVIRONMENT} + - helm version + - | + for d in ./mybinder*/; do + helm dependency update "$d"; + done + # - | + # for chart in mybinder-kube-system mybinder-tigera-operator; do + # helm upgrade \ + # ${chart:9} ./${chart} \ + # --cleanup-on-fail \ + # --create-namespace \ + # --history-max 3 \ + # --install \ + # --namespace=${chart}; + # done + - | + helm lint ./mybinder \ + --values ./config/gesis-${HELM_ENVIRONMENT}.yaml \ + --values ./secrets/config/common/common.yaml \ + --values ./secrets/config/common/cryptnono.yaml \ + --values ./secrets/config/common/gesis.yaml \ + --values ./secrets/config/gesis-${HELM_ENVIRONMENT}.yaml + - | + helm upgrade \ + binderhub ./mybinder \ + --cleanup-on-fail \ + --create-namespace \ + --history-max 3 \ + --install \ + --namespace=gesis \ + --render-subchart-notes \ + --values ./config/gesis-${HELM_ENVIRONMENT}.yaml \ + --values ./secrets/config/common/common.yaml \ + --values ./secrets/config/common/cryptnono.yaml \ + --values ./secrets/config/common/gesis.yaml \ + --values ./secrets/config/gesis-${HELM_ENVIRONMENT}.yaml + 
+gesis helm acceptance deploy: + resource_group: acceptance + stage: deploy-acceptance-helm + rules: + - !reference [.gesis-manual-web, rules] + - !reference [.gesis-merge-request, rules] + - !reference [.gesis-push-main, rules] + - !reference [.gesis-push-gesis, rules] + variables: + HELM_ENVIRONMENT: acceptance + extends: + - .gesis helm deploy + +.gesis helm production deploy: + resource_group: production + stage: deploy-production-helm + rules: + - !reference [.gesis-manual-web, rules] + - !reference [.gesis-push-main, rules] + - !reference [.gesis-push-gesis, rules] + variables: + HELM_ENVIRONMENT: stage + extends: + - .gesis helm deploy + +.smoke test: + stage: test-acceptance + variables: + INTERACTIVE_URL: url + script: + - curl --fail --silent --show-error "$INTERACTIVE_URL" + +smoke test to acceptance cluster: + stage: test-acceptance + rules: + - !reference [.gesis-manual-web, rules] + - !reference [.gesis-merge-request, rules] + - !reference [.gesis-push-main, rules] + - !reference [.gesis-push-gesis, rules] + variables: + INTERACTIVE_URL: https://notebooks-test.gesis.org/binder/ + extends: + - .smoke test \ No newline at end of file diff --git a/.gitlab/agents/stage/config.yaml b/.gitlab/agents/stage/config.yaml new file mode 100644 index 000000000..c402b2fa8 --- /dev/null +++ b/.gitlab/agents/stage/config.yaml @@ -0,0 +1,3 @@ +ci_access: + projects: + - id: methods-hub/interactive-environment diff --git a/ansible/gesis.yml b/ansible/gesis.yml new file mode 100644 index 000000000..b26cdb02d --- /dev/null +++ b/ansible/gesis.yml @@ -0,0 +1,46 @@ +- name: Configure servers that are part of Kubernetes cluster + hosts: all + gather_facts: false + become: true + roles: + - k8s-common +- name: Configure Kubernetes control panel + hosts: kubernetes_control_panel + gather_facts: false + become: true + roles: + - role: k8s-control-panel + vars: + k8s_control_panel_addresses_begin: '{{ K8S_INGRESS }}' + k8s_control_panel_addresses_end: '{{ K8S_INGRESS }}' +- name: Configure Kubernetes workers 
+ hosts: kubernetes_workers + gather_facts: false + become: true + roles: + - k8s-worker +- name: Configure Kubernetes Persistent Volumes + hosts: kubernetes_control_panel + gather_facts: false + become: true + roles: + - k8s-pv +- name: Configure Harbor + hosts: kubernetes_control_panel + gather_facts: false + roles: + - role: harbor + vars: + harbor_domain: '{{ HARBOR_DOMAIN }}' + harbor_path: '{{ HARBOR_PATH }}' +- name: Configure JupyterHub workers + hosts: jupyterhub_single_user + gather_facts: false + become: true + roles: + - k8s-worker +- name: Configure mybinder.org Kubernetes cluster + hosts: kubernetes_control_panel + gather_facts: false + roles: + - mybinder diff --git a/ansible/inventories/gesis-acceptance b/ansible/inventories/gesis-acceptance new file mode 100644 index 000000000..3697dab3f --- /dev/null +++ b/ansible/inventories/gesis-acceptance @@ -0,0 +1,51 @@ +[all] +svko-css-backup-node ansible_host=194.95.75.20 ansible_ssh_user=ansible ansible_become_pass='{{ become_pass_194_95_75_20 }}' +svko-k8s-test01 ansible_host=194.95.75.21 ansible_ssh_user=ansible ansible_become_pass='{{ become_pass_194_95_75_21 }}' +svko-k8s-test02 ansible_host=194.95.75.22 ansible_ssh_user=ansible ansible_become_pass='{{ become_pass_194_95_75_22 }}' +svko-k8s-test03 ansible_host=194.95.75.23 ansible_ssh_user=ansible ansible_become_pass='{{ become_pass_194_95_75_23 }}' + +[all:vars] +INVENTORY_NAME=stage +K8S_CONTROL_PLANE_ENDPOINT=194.95.75.21 +K8S_CONTROL_PLANE_ALIAS=svko-k8s-test01 +; Replace this variable with a filter +; This must match the group ingress +K8S_INGRESS=194.95.75.20 + +[notebooks_gesis_org] +svko-css-backup-node + +[kubernetes_control_panel] +svko-k8s-test01 + +[kubernetes_control_panel:vars] +GRAFANA_CAPACITY_STORAGE=2Gi +JUPYTERHUB_CAPACITY_STORAGE=2Gi +PROMETHEUS_CAPACITY_STORAGE=15Gi +HARBOR_DOMAIN=notebooks.gesis.org +HARBOR_PATH='/' + +[kubernetes_workers] +svko-k8s-test02 +svko-k8s-test03 + +[ingress] +svko-css-backup-node + +[harbor] 
+svko-k8s-test03 + +[binderhub] +svko-k8s-test02 + +[jupyterhub] +svko-k8s-test02 + +[jupyterhub_single_user] +svko-k8s-test02 + +[prometheus] +svko-css-backup-node + +[grafana] +svko-css-backup-node diff --git a/ansible/roles/harbor/tasks/main.yml b/ansible/roles/harbor/tasks/main.yml new file mode 100644 index 000000000..91a7b1fdd --- /dev/null +++ b/ansible/roles/harbor/tasks/main.yml @@ -0,0 +1,201 @@ +- name: Add harbor's repository + kubernetes.core.helm_repository: + name: harbor + repo_url: https://helm.goharbor.io + +- name: Create a storage for jobservice + kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: PersistentVolume + metadata: + name: "{{ harbor_name }}-jobservice" + labels: + harbor: jobservice + spec: + capacity: + storage: "{{ harbor_jobservice_storage }}" + volumeMode: Filesystem + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Delete + storageClassName: "{{ harbor_storage_class_name }}" + local: + path: /harbor/jobservice + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: harbor + operator: In + values: + - 'true' + +- name: Create a storage for registry + kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: PersistentVolume + metadata: + name: "{{ harbor_name }}-registry" + labels: + harbor: registry + spec: + capacity: + storage: "{{ harbor_registry_storage }}" + volumeMode: Filesystem + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Delete + storageClassName: "{{ harbor_storage_class_name }}" + local: + path: /harbor/registry + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: harbor + operator: In + values: + - 'true' + +- name: Create a storage for redis + kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: PersistentVolume + metadata: + name: "{{ harbor_name }}-redis" + labels: + harbor: redis + spec: + capacity: + storage: "{{ harbor_redis_storage }}" + volumeMode: 
Filesystem + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Delete + storageClassName: "{{ harbor_storage_class_name }}" + local: + path: /harbor/redis + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: harbor + operator: In + values: + - 'true' + +- name: Create a storage for trivy + kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: PersistentVolume + metadata: + name: "{{ harbor_name }}-trivy" + labels: + harbor: trivy + spec: + capacity: + storage: "{{ harbor_trivy_storage }}" + volumeMode: Filesystem + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Delete + storageClassName: "{{ harbor_storage_class_name }}" + local: + path: /harbor/trivy + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: harbor + operator: In + values: + - 'true' + +- name: Create a storage for database + kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: PersistentVolume + metadata: + name: "{{ harbor_name }}-database" + labels: + harbor: database + spec: + capacity: + storage: "{{ harbor_database_storage }}" + volumeMode: Filesystem + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Delete + storageClassName: "{{ harbor_storage_class_name }}" + local: + path: /harbor/database + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: harbor + operator: In + values: + - 'true' + +- name: Deploy harbor + kubernetes.core.helm: + chart_ref: harbor/harbor + chart_version: "{{ harbor_version }}" + release_name: "{{ harbor_name }}" + release_namespace: "{{ harbor_namespace }}" + create_namespace: true + history_max: 3 + values: + harborAdminPassword: "{{ HARBOR_ADMIN_PASSWORD }}" + expose: + type: ClusterIP + tls: + enabled: false + persistence: + persistentVolumeClaim: + registry: + storageClass: "{{ harbor_storage_class_name }}" + jobservice: + jobLog: + storageClass: "{{ harbor_storage_class_name }}" + database: + 
storageClass: "{{ harbor_storage_class_name }}" + redis: + storageClass: "{{ harbor_storage_class_name }}" + trivy: + storageClass: "{{ harbor_storage_class_name }}" + +# Based on https://kubernetes.github.io/ingress-nginx/user-guide/basic-usage/ +- name: Create a ingress resources + kubernetes.core.k8s: + state: present + definition: + apiVersion: networking.k8s.io/v1 + kind: Ingress + metadata: + name: 'ingress-{{ harbor_name }}' + spec: + rules: + - host: '{{ harbor_domain }}' + http: + paths: + - path: '{{ harbor_path }}' + pathType: Prefix + backend: + service: + name: '{{ harbor_name }}' + port: + number: 80 + # Don't change the ingressClassName + ingressClassName: nginx diff --git a/ansible/roles/harbor/vars/main.yml b/ansible/roles/harbor/vars/main.yml new file mode 100644 index 000000000..a7c50e301 --- /dev/null +++ b/ansible/roles/harbor/vars/main.yml @@ -0,0 +1,11 @@ +harbor_version: 1.16.0 +harbor_name: harbor +harbor_namespace: harbor +harbor_storage_class_name: standard +harbor_jobservice_storage: 10Gi +harbor_registry_storage: 10Gi +harbor_redis_storage: 10Gi +harbor_trivy_storage: 10Gi +harbor_database_storage: 10Gi +harbor_domain: harbor.localhost +harbor_path: '/' diff --git a/ansible/roles/jupyterhub/files/var/lib/kubelet/config.yaml b/ansible/roles/jupyterhub/files/var/lib/kubelet/config.yaml new file mode 100644 index 000000000..cbe083dae --- /dev/null +++ b/ansible/roles/jupyterhub/files/var/lib/kubelet/config.yaml @@ -0,0 +1,45 @@ +apiVersion: kubelet.config.k8s.io/v1beta1 +authentication: + anonymous: + enabled: false + webhook: + cacheTTL: 0s + enabled: true + x509: + clientCAFile: /etc/kubernetes/pki/ca.crt +authorization: + mode: Webhook + webhook: + cacheAuthorizedTTL: 0s + cacheUnauthorizedTTL: 0s +cgroupDriver: systemd +clusterDNS: + - 10.96.0.10 +clusterDomain: cluster.local +cpuManagerReconcilePeriod: 0s +evictionPressureTransitionPeriod: 0s +fileCheckFrequency: 0s +healthzBindAddress: 127.0.0.1 +healthzPort: 10248 
+httpCheckFrequency: 0s +imageMinimumGCAge: 0s +kind: KubeletConfiguration +logging: + flushFrequency: 0 + options: + json: + infoBufferSize: "0" + verbosity: 0 +memorySwap: {} +nodeStatusReportFrequency: 0s +nodeStatusUpdateFrequency: 0s +resolvConf: /run/systemd/resolve/resolv.conf +rotateCertificates: true +runtimeRequestTimeout: 0s +shutdownGracePeriod: 0s +shutdownGracePeriodCriticalPods: 0s +staticPodPath: /etc/kubernetes/manifests +streamingConnectionIdleTimeout: 0s +syncFrequency: 0s +volumeStatsAggPeriod: 0s +maxPods: 500 diff --git a/ansible/roles/jupyterhub/tasks/main.yml b/ansible/roles/jupyterhub/tasks/main.yml new file mode 100644 index 000000000..1409bf70d --- /dev/null +++ b/ansible/roles/jupyterhub/tasks/main.yml @@ -0,0 +1,15 @@ +- name: Stop kubelet service + ansible.builtin.systemd: + name: kubelet + state: stopped +- name: Copy kubelet configuration + ansible.builtin.copy: + src: var/lib/kubelet/config.yaml + dest: /var/lib/kubelet/config.yaml + owner: root + group: root + mode: u=rw,g=r,o=r +- name: Restarted kubelet service + ansible.builtin.systemd: + name: kubelet + state: restarted diff --git a/ansible/roles/k8s-common/files/etc/containerd/config.toml b/ansible/roles/k8s-common/files/etc/containerd/config.toml new file mode 100644 index 000000000..320b460aa --- /dev/null +++ b/ansible/roles/k8s-common/files/etc/containerd/config.toml @@ -0,0 +1,250 @@ +disabled_plugins = [] +imports = [] +oom_score = 0 +plugin_dir = "" +required_plugins = [] +root = "/orc2_data/containerd" +state = "/run/containerd" +temp = "" +version = 2 + +[cgroup] + path = "" + +[debug] + address = "" + format = "" + gid = 0 + level = "" + uid = 0 + +[grpc] + address = "/run/containerd/containerd.sock" + gid = 0 + max_recv_message_size = 16777216 + max_send_message_size = 16777216 + tcp_address = "" + tcp_tls_ca = "" + tcp_tls_cert = "" + tcp_tls_key = "" + uid = 0 + +[metrics] + address = "" + grpc_histogram = false + +[plugins] + + 
[plugins."io.containerd.gc.v1.scheduler"] + deletion_threshold = 0 + mutation_threshold = 100 + pause_threshold = 0.02 + schedule_delay = "0s" + startup_delay = "100ms" + + [plugins."io.containerd.grpc.v1.cri"] + device_ownership_from_security_context = false + disable_apparmor = false + disable_cgroup = false + disable_hugetlb_controller = true + disable_proc_mount = false + disable_tcp_service = true + enable_selinux = false + enable_tls_streaming = false + enable_unprivileged_icmp = false + enable_unprivileged_ports = false + ignore_image_defined_volumes = false + max_concurrent_downloads = 3 + max_container_log_line_size = 16384 + netns_mounts_under_state_dir = false + restrict_oom_score_adj = false + sandbox_image = "registry.k8s.io/pause:3.6" + selinux_category_range = 1024 + stats_collect_period = 10 + stream_idle_timeout = "4h0m0s" + stream_server_address = "127.0.0.1" + stream_server_port = "0" + systemd_cgroup = false + tolerate_missing_hugetlb_controller = true + unset_seccomp_profile = "" + + [plugins."io.containerd.grpc.v1.cri".cni] + bin_dir = "/opt/cni/bin" + conf_dir = "/etc/cni/net.d" + conf_template = "" + ip_pref = "" + max_conf_num = 1 + + [plugins."io.containerd.grpc.v1.cri".containerd] + default_runtime_name = "runc" + disable_snapshot_annotations = true + discard_unpacked_layers = false + ignore_rdt_not_enabled_errors = false + no_pivot = false + snapshotter = "overlayfs" + + [plugins."io.containerd.grpc.v1.cri".containerd.default_runtime] + base_runtime_spec = "" + cni_conf_dir = "" + cni_max_conf_num = 0 + container_annotations = [] + pod_annotations = [] + privileged_without_host_devices = false + runtime_engine = "" + runtime_path = "" + runtime_root = "" + runtime_type = "" + + [plugins."io.containerd.grpc.v1.cri".containerd.default_runtime.options] + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] + base_runtime_spec = "" + cni_conf_dir = "" + 
cni_max_conf_num = 0 + container_annotations = [] + pod_annotations = [] + privileged_without_host_devices = false + runtime_engine = "" + runtime_path = "" + runtime_root = "" + runtime_type = "io.containerd.runc.v2" + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] + BinaryName = "" + CriuImagePath = "" + CriuPath = "" + CriuWorkPath = "" + IoGid = 0 + IoUid = 0 + NoNewKeyring = false + NoPivotRoot = false + Root = "" + ShimCgroup = "" + SystemdCgroup = true + + [plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime] + base_runtime_spec = "" + cni_conf_dir = "" + cni_max_conf_num = 0 + container_annotations = [] + pod_annotations = [] + privileged_without_host_devices = false + runtime_engine = "" + runtime_path = "" + runtime_root = "" + runtime_type = "" + + [plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime.options] + + [plugins."io.containerd.grpc.v1.cri".image_decryption] + key_model = "node" + + [plugins."io.containerd.grpc.v1.cri".registry] + config_path = "" + + [plugins."io.containerd.grpc.v1.cri".registry.auths] + + [plugins."io.containerd.grpc.v1.cri".registry.configs] + + [plugins."io.containerd.grpc.v1.cri".registry.headers] + + [plugins."io.containerd.grpc.v1.cri".registry.mirrors] + + [plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming] + tls_cert_file = "" + tls_key_file = "" + + [plugins."io.containerd.internal.v1.opt"] + path = "/opt/containerd" + + [plugins."io.containerd.internal.v1.restart"] + interval = "10s" + + [plugins."io.containerd.internal.v1.tracing"] + sampling_ratio = 1.0 + service_name = "containerd" + + [plugins."io.containerd.metadata.v1.bolt"] + content_sharing_policy = "shared" + + [plugins."io.containerd.monitor.v1.cgroups"] + no_prometheus = false + + [plugins."io.containerd.runtime.v1.linux"] + no_shim = false + runtime = "runc" + runtime_root = "" + shim = "containerd-shim" + shim_debug = false + + [plugins."io.containerd.runtime.v2.task"] + 
platforms = ["linux/amd64"] + sched_core = false + + [plugins."io.containerd.service.v1.diff-service"] + default = ["walking"] + + [plugins."io.containerd.service.v1.tasks-service"] + rdt_config_file = "" + + [plugins."io.containerd.snapshotter.v1.aufs"] + root_path = "" + + [plugins."io.containerd.snapshotter.v1.btrfs"] + root_path = "" + + [plugins."io.containerd.snapshotter.v1.devmapper"] + async_remove = false + base_image_size = "" + discard_blocks = false + fs_options = "" + fs_type = "" + pool_name = "" + root_path = "" + + [plugins."io.containerd.snapshotter.v1.native"] + root_path = "" + + [plugins."io.containerd.snapshotter.v1.overlayfs"] + root_path = "" + upperdir_label = false + + [plugins."io.containerd.snapshotter.v1.zfs"] + root_path = "" + + [plugins."io.containerd.tracing.processor.v1.otlp"] + endpoint = "" + insecure = false + protocol = "" + +[proxy_plugins] + +[stream_processors] + + [stream_processors."io.containerd.ocicrypt.decoder.v1.tar"] + accepts = ["application/vnd.oci.image.layer.v1.tar+encrypted"] + args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"] + env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"] + path = "ctd-decoder" + returns = "application/vnd.oci.image.layer.v1.tar" + + [stream_processors."io.containerd.ocicrypt.decoder.v1.tar.gzip"] + accepts = ["application/vnd.oci.image.layer.v1.tar+gzip+encrypted"] + args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"] + env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"] + path = "ctd-decoder" + returns = "application/vnd.oci.image.layer.v1.tar+gzip" + +[timeouts] + "io.containerd.timeout.bolt.open" = "0s" + "io.containerd.timeout.shim.cleanup" = "5s" + "io.containerd.timeout.shim.load" = "5s" + "io.containerd.timeout.shim.shutdown" = "3s" + "io.containerd.timeout.task.state" = "2s" + +[ttrpc] + address = "" + gid = 0 + uid = 0 diff --git 
a/ansible/roles/k8s-common/tasks/k8s-repository.yml b/ansible/roles/k8s-common/tasks/k8s-repository.yml new file mode 100644 index 000000000..fc20cd955 --- /dev/null +++ b/ansible/roles/k8s-common/tasks/k8s-repository.yml @@ -0,0 +1,51 @@ +- name: Remove old Kubernetes public GPG key + ansible.builtin.file: + path: /etc/apt/trusted.gpg.d/kubernetes-archive-keyring.gpg + state: absent +- name: Remove old Kubernetes public GPG key + ansible.builtin.file: + path: /etc/apt/trusted.gpg.d/kubernetes-archive-keyring.asc + state: absent +- name: Remove old Kubernetes repository + ansible.builtin.apt_repository: + repo: "deb [signed-by=/etc/apt/trusted.gpg.d/kubernetes-archive-keyring.gpg] https://apt.kubernetes.io/ kubernetes-xenial main" + filename: kubernetes + state: absent +- name: Remove old Kubernetes repository + ansible.builtin.apt_repository: + repo: "deb [signed-by=/etc/apt/trusted.gpg.d/kubernetes.asc] https://pkgs.k8s.io/core:/stable:/{{ item }}/deb/ /" + filename: kubernetes + state: absent + loop: + - v1.27 + - v1.28 + - v1.29 + - v1.30 + - v1.31 +- name: Download Kubernetes public GPG key + ansible.builtin.get_url: + url: "https://pkgs.k8s.io/core:/stable:/v{{ k8s_common_kubernetes_version }}/deb/Release.key" + dest: /tmp/kubernetes-archive-keyring.asc + mode: "0644" + force: true +- name: Convert the public GPG key to binary + ansible.builtin.command: + argv: + - gpg + - --yes + - --dearmor + - --output + - /tmp/kubernetes.gpg + - /tmp/kubernetes-archive-keyring.asc + changed_when: false +- name: Copy GPG key + ansible.builtin.copy: + src: /tmp/kubernetes.gpg + dest: /etc/apt/keyrings/kubernetes.gpg + remote_src: true + mode: "0644" +- name: Add Kubernetes repository + ansible.builtin.apt_repository: + repo: "deb [signed-by=/etc/apt/keyrings/kubernetes.gpg] https://pkgs.k8s.io/core:/stable:/v{{ k8s_common_kubernetes_version }}/deb/ /" + filename: kubernetes + state: present diff --git a/ansible/roles/k8s-common/tasks/main.yml 
b/ansible/roles/k8s-common/tasks/main.yml new file mode 100644 index 000000000..20045b981 --- /dev/null +++ b/ansible/roles/k8s-common/tasks/main.yml @@ -0,0 +1,118 @@ +- name: Create directory /etc/apt/keyrings if it does not exist + ansible.builtin.file: + state: directory + path: /etc/apt/keyrings + owner: root + group: root + mode: u=rwx,g=rx,o=rx +- name: Add Kubernetes repository + ansible.builtin.import_tasks: + file: k8s-repository.yml +- name: Ensure DOCKER_CLIENT_TIMEOUT is set + ansible.builtin.lineinfile: + path: /etc/environment + regexp: "^DOCKER_CLIENT_TIMEOUT=" + line: DOCKER_CLIENT_TIMEOUT=180 +- name: Disable SWAP since kubernetes can't work with swap enabled + ansible.builtin.command: swapoff -a + changed_when: false +- name: Disable SWAP in fstab since kubernetes can't work with swap enabled + ansible.builtin.replace: + path: /etc/fstab + regexp: '^([^#].*?\sswap\s+sw\s+.*)$' + replace: '# \1' +- name: Disable Firewall + ansible.builtin.command: ufw disable + changed_when: false +- name: Allow IP forward + ansible.posix.sysctl: + name: net.ipv4.ip_forward + value: "1" + state: present +- name: Set inotify max user instances + ansible.posix.sysctl: + name: fs.inotify.max_user_instances + value: "1280" + state: present +- name: Set inotify max user watches + ansible.posix.sysctl: + name: fs.inotify.max_user_watches + value: "655360" + state: present +- name: Create directory for Persistent Volume + ansible.builtin.import_tasks: + file: pv.yml +- name: Add Docker public GPG key + ansible.builtin.get_url: + url: https://download.docker.com/linux/ubuntu/gpg + dest: /etc/apt/trusted.gpg.d/docker.asc + mode: "0644" + force: true +- name: Add Docker repository + ansible.builtin.apt_repository: + repo: "deb [signed-by=/etc/apt/trusted.gpg.d/docker.asc] https://download.docker.com/linux/ubuntu jammy stable" + filename: docker + state: present +- name: Add Helm public GPG key + ansible.builtin.get_url: + url: https://baltocdn.com/helm/signing.asc + dest: 
/etc/apt/trusted.gpg.d/helm.asc + mode: "0644" + force: true +- name: Add Helm repository + ansible.builtin.apt_repository: + repo: "deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/helm.asc] https://baltocdn.com/helm/stable/debian/ all main" + filename: kubernetes + state: present +- name: Install dependencies + ansible.builtin.apt: + update_cache: true + pkg: + - rsync + - python3 + - python3-kubernetes + - python3-invoke + - python3-fabric + - apt-transport-https + - ca-certificates + - curl + - containerd.io=1.7.* + - "kubelet={{ k8s_common_kubernetes_version }}*" + - "kubeadm={{ k8s_common_kubernetes_version }}*" + - "kubectl={{ k8s_common_kubernetes_version }}*" + - "helm={{ k8s_common_helm_version }}*" +- name: Copy containerd configuration file + ansible.builtin.copy: + src: files/etc/containerd/config.toml + dest: /etc/containerd/config.toml + owner: root + group: root + mode: u=rw,g=r,o=r +- name: Reload service containerd + ansible.builtin.systemd: + name: containerd + state: restarted +- name: Enable service containerd + ansible.builtin.systemd: + name: containerd + enabled: true + masked: false +- name: Modify kernel module overlay + ansible.builtin.command: modprobe overlay + changed_when: false +- name: Modify kernel module br_netfilter + ansible.builtin.command: modprobe br_netfilter + changed_when: false +- name: Create file for list of kernel modules required by containerd + ansible.builtin.file: + path: "/etc/modules-load.d/containerd.conf" + state: "touch" + owner: root + group: root + mode: u=rw,g=r,o=r +- name: Populate list of kernel modules required by containerd + ansible.builtin.blockinfile: + path: "/etc/modules-load.d/containerd.conf" + block: | + overlay + br_netfilter diff --git a/ansible/roles/k8s-common/tasks/pv.yml b/ansible/roles/k8s-common/tasks/pv.yml new file mode 100644 index 000000000..6a11ee11e --- /dev/null +++ b/ansible/roles/k8s-common/tasks/pv.yml @@ -0,0 +1,28 @@ +- name: Create persistent directories in /orc2_data if it 
does not exist + ansible.builtin.file: + path: "/orc2_data/{{ item }}" + state: directory + owner: root + group: root + mode: u=rwx,g=rx,o=rx + loop: + - jupyterhub + - containerd + - repo2docker + - prometheus + - grafana + - alertmanager + +- name: Create persistent directories in /harbor/ if it does not exist + ansible.builtin.file: + path: "/harbor/{{ item }}" + state: directory + owner: root + group: root + mode: u=rwx,g=rx,o=rx + loop: + - jobservice + - registry + - redis + - trivy + - database diff --git a/ansible/roles/k8s-common/vars/main.yml b/ansible/roles/k8s-common/vars/main.yml new file mode 100644 index 000000000..6bfebd61c --- /dev/null +++ b/ansible/roles/k8s-common/vars/main.yml @@ -0,0 +1,2 @@ +k8s_common_kubernetes_version: "1.31" +k8s_common_helm_version: "3.16.1" diff --git a/ansible/roles/k8s-control-panel/files/cron/kill-after-timeout-pods.py b/ansible/roles/k8s-control-panel/files/cron/kill-after-timeout-pods.py new file mode 100644 index 000000000..67fbca9ab --- /dev/null +++ b/ansible/roles/k8s-control-panel/files/cron/kill-after-timeout-pods.py @@ -0,0 +1,91 @@ +"""Kill pods in Kubernetes cluster after timeout""" + +import argparse +import datetime +import logging + +from kubernetes import client, config + +logging.basicConfig( + format="%(asctime)s %(levelname)-8s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S" +) +logger = logging.getLogger("kill-after-timeout-pods") +logger.setLevel(logging.WARNING) + +NAMESPACE = "gesis" +BINDER_TIME_OUT = 6 # hours + + +def get_timed_out_pods(): + """Get list of all timed out pods that are single user running pod""" + time_now = datetime.datetime.now(datetime.timezone.utc) + all_timed_out_pods = [] + + api_response = v1.list_namespaced_pod(NAMESPACE) + for pod in api_response.items: + pod_run_time = time_now - pod.metadata.creation_timestamp + pod_run_time_in_hours = pod_run_time.total_seconds() / 3600 + logger.debug( + "Pod %s (%s) is running for %.1f hours.", + pod.metadata.name, + pod.status.phase, 
+ pod_run_time_in_hours, + ) + if ( + pod.metadata.name.startswith("jupyter-") + and pod_run_time_in_hours > BINDER_TIME_OUT + ): + all_timed_out_pods.append(pod) + logger.debug("Pod %s added to the list.", pod.metadata.name) + + return all_timed_out_pods + + +def kill_pod(pod): + """Delete one pod; an API failure is logged at ERROR level, not raised.""" + logger.info("Requesting delete of pod %s ...", pod.metadata.name) + try: + api_response = v1.delete_namespaced_pod(pod.metadata.name, NAMESPACE) + logger.info("Pod %s deleted.", api_response.metadata.name) + except client.exceptions.ApiException as exception: + logger.error("Fail to delete pod %s due %s", pod.metadata.name, exception) + + +def kill_timed_out_pods(): + """Kill timed out pods""" + logger.info("Starting inspection of Kubernetes pod ...") + all_timed_out_pods = get_timed_out_pods() + for timed_out_pod in all_timed_out_pods: + kill_pod(timed_out_pod) + logger.info("%s pods deleted.", len(all_timed_out_pods)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="Open Research Computing v2 Kill Timed Out Pods Cron Job", + description="Cron job to kill Kubernetes pods that timed out", + ) + parser.add_argument( + "-c", + "--kube-config", + type=str, + default="~/.kube/config", + help="Location of Kubernetes configuration file", + ) + parser.add_argument( + "-v", "--verbose", action="store_true", help="Display log information" + ) + parser.add_argument( + "-vv", "--debug", action="store_true", help="Display debug information" + ) + args = parser.parse_args() + if args.verbose: + logger.setLevel(logging.INFO) + if args.debug: + logger.setLevel(logging.DEBUG) + + config.load_kube_config(config_file=args.kube_config) + + v1 = client.CoreV1Api() + + kill_timed_out_pods() diff --git a/ansible/roles/k8s-control-panel/files/cron/kill-succeeded-pods.py b/ansible/roles/k8s-control-panel/files/cron/kill-succeeded-pods.py new file mode 100644 index 000000000..3b75368e2 --- /dev/null +++ 
b/ansible/roles/k8s-control-panel/files/cron/kill-succeeded-pods.py @@ -0,0 +1,77 @@ +"""Kill succeeded pods in Kubernetes cluster""" + +import argparse +import logging + +from kubernetes import client, config + +logging.basicConfig( + format="%(asctime)s %(levelname)-8s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S" +) +logger = logging.getLogger("kill-succeeded-pods") +logger.setLevel(logging.WARNING) + +NAMESPACE = "gesis" + + +def get_succeeded_pods(): + """Get list of all succeeded pods that are single user running pod""" + all_succeeded_pods = [] + + api_response = v1.list_namespaced_pod(NAMESPACE) + for pod in api_response.items: + logger.debug("Pod %s is %s", pod.metadata.name, pod.status.phase) + if pod.status.phase == "Succeeded" and pod.metadata.name.startswith("jupyter-"): + all_succeeded_pods.append(pod) + + return all_succeeded_pods + + +def kill_pod(pod): + """Delete one pod; an API failure is logged at ERROR level, not raised.""" + logger.info("Requesting delete of pod %s ...", pod.metadata.name) + try: + api_response = v1.delete_namespaced_pod(pod.metadata.name, NAMESPACE) + logger.info("Pod %s deleted.", api_response.metadata.name) + except client.exceptions.ApiException as exception: + logger.error("Fail to delete pod %s due %s", pod.metadata.name, exception) + + +def kill_succeeded_pods(): + """Kill succeeded pods""" + logger.info("Starting inspection of Kubernetes pod ...") + all_succeeded_pods = get_succeeded_pods() + for succeeded_pod in all_succeeded_pods: + kill_pod(succeeded_pod) + logger.info("%s pods deleted.", len(all_succeeded_pods)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="Open Research Computing v2 Kill Succeeded Pods Cron Job", + description="Cron job to kill Kubernetes pods in Succeeded status that are very old", + ) + parser.add_argument( + "-c", + "--kube-config", + type=str, + default="~/.kube/config", + help="Location of Kubernetes configuration file", + ) + parser.add_argument( + "-v", "--verbose", action="store_true", help="Display log 
# systemd unit for the ORC2 Docker-in-Docker repair bot.
# This file contains Jinja2 statements, so it has to be rendered by a
# template task before it is installed (it is not a plain static unit).
[Unit]
Description=Bot service to restart ORC2 Docker-in-Docker when is not working
After=kubelet.service
# Together with Restart=always below the service is restarted indefinitely.
StartLimitIntervalSec=0

[Service]
Type=simple
Restart=always
RestartSec=1
User=ansible
# One PASSWORD_<address> variable is emitted per inventory host, with the dots
# of ansible_host replaced by underscores; orc2-fix-dind-bot.py looks the
# variable up by node IP to answer the sudo prompt over SSH.
# NOTE(review): this writes every host's become password in clear text into
# the unit file - confirm the installed file mode and consider a root-only
# EnvironmentFile instead.
# NOTE(review): presumably every host in hostvars defines ansible_host and
# ansible_become_pass; rendering fails otherwise - verify against inventory.
{% for host in hostvars %}
Environment="PASSWORD_{{ hostvars[host]['ansible_host'] | replace(".", "_") }}={{ hostvars[host]['ansible_become_pass'] }}"
{% endfor %}
ExecStart=/usr/bin/python3 /usr/bin/orc2-fix-dind-bot.py --verbose
def remove_docker_socket(host_IP):
    """Remove the Docker-in-Docker socket path on a node over SSH.

    The sudo password is read from the ``PASSWORD_<ip>`` environment variable,
    where the dots of ``host_IP`` are replaced by underscores.

    Args:
        host_IP: IP address of the node to connect to.

    Raises:
        Exception: whatever Fabric/Invoke raise when the connection or the
            remote command fails; the caller is expected to handle it.
    """
    ssh_password = os.getenv(f"PASSWORD_{host_IP.replace('.', '_')}")
    if ssh_password is None:
        # Keep going (key based login may still work) but leave a trace,
        # because the sudo prompt cannot be answered correctly without it.
        logger.warning("No password found in environment for %s", host_IP)

    logger.info("Connecting to %s ...", host_IP)
    # The context manager closes the SSH connection even when run() fails;
    # the long-running bot would otherwise leak one connection per repair.
    with Connection(
        host_IP, user="ansible", connect_kwargs={"password": ssh_password}
    ) as c:
        # The original call passed host_IP without a placeholder, which makes
        # the logging module report a formatting error instead of the message.
        logger.info("Connected to %s!", host_IP)

        logger.info("Removing Docker socket ...")
        sudopass = Responder(
            pattern=r"\[sudo\] password for .*:",
            response=f"{ssh_password}\n",
        )
        c.run("sudo rm -rf /var/run/dind/docker.sock/", pty=True, watchers=[sudopass])
    logger.info("Removed Docker socket.")


def remove_pods():
    """Delete the Docker-in-Docker and image-cleaner pods in ``NAMESPACE``.

    Pods whose name starts with ``binderhub-dind-`` or
    ``binderhub-image-cleaner-`` are deleted one by one; a failed delete is
    logged and does not stop the loop.
    """
    logger.debug("Starting search for pods ...")
    pod_list = v1.list_namespaced_pod(NAMESPACE)
    for pod in pod_list.items:
        pod_name = pod.metadata.name
        logger.debug("Pod %s is running on the cluster", pod_name)
        if not pod_name.startswith(("binderhub-dind-", "binderhub-image-cleaner-")):
            continue
        logger.info("Found pod %s", pod_name)
        logger.info("Requesting delete of pod %s ...", pod_name)
        try:
            v1.delete_namespaced_pod(pod_name, NAMESPACE)
            logger.info("Pod %s deleted.", pod_name)
        except client.exceptions.ApiException as exception:
            logger.warning("Fail to delete pod %s due %s", pod_name, exception)
    logger.debug("Completed search for pods!")
def monitor_cluster():
    """Watch namespace events and repair Docker-in-Docker on fresh BackOff.

    Runs forever. Every event in ``NAMESPACE`` whose involved object name
    starts with ``binderhub-dind-`` is inspected; a ``Warning`` event with
    reason ``BackOff`` that is at most 5 seconds old triggers removal of the
    Docker socket on the node running the pod, followed by deletion of the
    Docker-in-Docker related pods. When the watch stream ends it is reopened.
    """
    while True:
        logger.info("Start monitoring ...")

        w = watch.Watch()
        for event in w.stream(v1.list_namespaced_event, namespace=NAMESPACE):
            k8s_event = event["object"]
            pod_name = k8s_event.involved_object.name
            if not pod_name or not pod_name.startswith("binderhub-dind-"):
                continue

            if k8s_event.type == "Warning":
                logger.info("Found Warning event in %s", pod_name)
                if k8s_event.reason != "BackOff":
                    continue

                # last_timestamp can be unset; fall back to event_time when
                # the object provides it, and skip events without any time.
                last_seen = k8s_event.last_timestamp or getattr(
                    k8s_event, "event_time", None
                )
                if last_seen is None:
                    logger.info("Skipping because event has no timestamp.")
                    continue

                # total_seconds() instead of .seconds: .seconds drops whole
                # days, so an event 2 days and 1 second old looked fresh.
                age_in_seconds = (
                    datetime.datetime.now(datetime.timezone.utc) - last_seen
                ).total_seconds()

                if age_in_seconds > 5:
                    logger.info(
                        "Skipping because event old (%d > 5 seconds).",
                        age_in_seconds,
                    )
                else:
                    logger.info("Removing Docker-in-Docker socket and pods ...")
                    try:
                        node_IP_address = get_node_running_pod(pod_name)
                        remove_docker_socket(node_IP_address)
                        remove_pods()
                    except Exception as exception:
                        # Best effort: a failed repair must not stop the bot.
                        logger.warning(
                            "Fail to repair Docker-in-Docker for pod %s due %s",
                            pod_name,
                            exception,
                        )

            elif k8s_event.type == "Normal":
                logger.debug(
                    "Found Normal event in %s ... skipping!",
                    k8s_event.metadata.name,
                )
            else:
                # The original indexed a list literal (["object"].metadata),
                # which raised AttributeError and killed the watch loop.
                logger.debug(
                    "Found %s event in %s ... ignoring!",
                    k8s_event.type,
                    k8s_event.metadata.name,
                )

        logger.info("Stop monitoring!")
def monitor_pod():
    """Follow the BinderHub pod log and restart JupyterHub on API errors.

    Runs forever. The log of the pod returned by ``get_binder_pod()`` is
    streamed; each line containing ``Error accessing Hub API`` triggers
    ``kill_jupyterhub_pod()`` unless the previous restart (or the start of
    monitoring) is less than ``RESTART_WAITING_TIME`` seconds ago. When the
    log stream ends, the pod is looked up again and monitoring restarts.
    """
    while True:
        pod_name = get_binder_pod()
        logger.info("Monitoring %s", pod_name)

        last_jupyterhub_restart = datetime.datetime.now(datetime.timezone.utc)

        w = watch.Watch()
        for line in w.stream(
            v1.read_namespaced_pod_log, name=pod_name, namespace=NAMESPACE, tail_lines=0
        ):
            if "Error accessing Hub API" not in line:
                continue
            logger.debug(line)

            now = datetime.datetime.now(datetime.timezone.utc)
            # total_seconds() instead of .seconds: .seconds wraps every 24 h,
            # so an error one day and 30 s after the last restart was treated
            # as "only 30 seconds ago" and the restart was skipped.
            seconds_since_restart = (now - last_jupyterhub_restart).total_seconds()
            if seconds_since_restart > RESTART_WAITING_TIME:
                logger.info("Restarting JupyterHub ...")
                kill_jupyterhub_pod()
                last_jupyterhub_restart = now
            else:
                logger.info(
                    "Waiting %s seconds for JupyterHub to restart.",
                    RESTART_WAITING_TIME,
                )

        logger.info("Stop monitoring %s", pod_name)
# Container Network Interface (CNI) setup for the control plane:
# drop Flannel, install the Tigera Calico operator and its custom resources,
# then apply the cert-manager CRD manifest.
- name: Remove Container Network Interface (CNI) Flannel
  kubernetes.core.k8s:
    state: absent
    src: https://github.com/coreos/flannel/raw/master/Documentation/kube-flannel.yml
- name: Install Container Network Interface (CNI) Tigera Calico operator
  kubernetes.core.k8s:
    state: present
    src: https://raw.githubusercontent.com/projectcalico/calico/v{{ k8s_control_panel_calico_version }}/manifests/tigera-operator.yaml
- name: Install Calico and resource
  kubernetes.core.k8s:
    state: present
    # Rendered from a local copy of the upstream Calico
    # manifests/custom-resources.yaml (copied from release v3.28.2).
    # The path is built with the "~" operator: a "{{ }}" placed inside an
    # expression that is already templated is only a string literal to Jinja.
    definition: "{{ lookup('ansible.builtin.template', role_path ~ '/templates/calico/custom-resources.yaml.jinja') | from_yaml_all }}"
# Only the CRD manifest is applied here, not the cert-manager controllers.
- name: Install Cert Manager CRDs
  kubernetes.core.k8s:
    state: present
    src: https://github.com/cert-manager/cert-manager/releases/download/v1.15.3/cert-manager.crds.yaml
# Bootstraps the control plane; skipped when "kubectl get nodes" already
# reports a control-plane node.
- name: Initialize the cluster
  when: kubernetes_nodes.stdout.find('control-plane') == -1
  # The pod network CIDR comes from the role variable that the Calico
  # custom-resources template also uses, so kubeadm and Calico cannot drift
  # apart (the variable is set to the previously hard-coded 10.244.0.0/16).
  ansible.builtin.shell: >
    kubeadm init
    --pod-network-cidr={{ k8s_control_panel_cidr }}
    --upload-certs
    --control-plane-endpoint={{ K8S_CONTROL_PLANE_ENDPOINT }}
    --cri-socket unix:///run/containerd/containerd.sock
  changed_when: false
  register: kubeadm_init_output
# Makes sure the join-command file exists with restrictive ownership and mode
# before its content is written by the next task.
- name: Create temporary file
  ansible.builtin.file:
    path: /tmp/kubernetes_join_command
    state: touch
    owner: ansible
    group: ansible
    mode: u=rw,g=r,o=
    # Without these, "touch" updates the timestamps and reports "changed" on
    # every run; preserving them makes the task idempotent.
    modification_time: preserve
    access_time: preserve
# Renders the unit with the template module because the source file contains
# a Jinja loop that emits one PASSWORD_* Environment line per inventory host.
- name: Copy orc2-fix-dind-bot Systemd Unit script
  ansible.builtin.template:
    src: files/etc/systemd/system/orc2-fix-dind-bot.service
    dest: /etc/systemd/system/orc2-fix-dind-bot.service
    owner: root
    group: root
    # The rendered unit embeds become passwords in clear text, so it must not
    # be readable by other users; a unit file does not need the execute bit.
    # NOTE(review): values set with Environment= may still be visible through
    # "systemctl show" - consider a root-only EnvironmentFile.
    mode: u=rw,g=,o=
# Installs the MetalLB chart into the namespace created by the previous task.
- name: Deploy MetalLB
  kubernetes.core.helm:
    release_name: metallb
    release_namespace: metallb
    chart_ref: metallb/metallb
    chart_version: '{{ k8s_control_panel_metallb_version }}'
    create_namespace: false
    history_max: 3
    # The following tasks create IPAddressPool and L2Advertisement objects;
    # wait until the release is ready so its CRDs and admission webhook can
    # accept them instead of failing on a freshly installed cluster.
    wait: true
# Announces the addresses of the MetalLB pool on layer 2; the advertisement
# is named after the pool it references.
- name: Configure L2 Advertisement for MetalLB
  kubernetes.core.k8s:
    definition:
      kind: L2Advertisement
      apiVersion: metallb.io/v1beta1
      metadata:
        namespace: metallb
        name: '{{ k8s_control_panel_metallb_ip_address_pool_name }}-l2-advertisement'
      spec:
        ipAddressPools: ['{{ k8s_control_panel_metallb_ip_address_pool_name }}']
    state: present
# Role variables for k8s-control-panel.

# Calico release used to build the tigera-operator manifest URL.
k8s_control_panel_calico_version: '3.28.2'
# Pod network CIDR rendered into the Calico custom-resources template.
k8s_control_panel_cidr: '10.244.0.0/16'

# Helm chart versions.
k8s_control_panel_metallb_version: '0.14.9'
k8s_control_panel_ingress_nginx_version: '4.12.0'

# MetalLB address pool: name, first and last address of the range.
# NOTE(review): 0.0.0.0 looks like a placeholder that the inventory is
# expected to override - confirm.
k8s_control_panel_metallb_ip_address_pool_name: 'gesis'
k8s_control_panel_addresses_begin: '0.0.0.0'
k8s_control_panel_addresses_end: '0.0.0.0'
# Creates the "local-storage" StorageClass that the Persistent Volumes of
# this role reference. The task was previously named "Create a Persistent
# Volume for Prometheus", which was wrong and duplicated the name of the
# task in prometheus.yml.
- name: Create local-storage Storage Class
  kubernetes.core.k8s:
    state: present
    definition:
      apiVersion: storage.k8s.io/v1
      kind: StorageClass
      metadata:
        name: local-storage
        labels:
          app.kubernetes.io/managed-by: Ansible
          app.kubernetes.io/part-of: mybinder
      # Volumes are created by Ansible, not provisioned dynamically.
      provisioner: kubernetes.io/no-provisioner
      volumeBindingMode: WaitForFirstConsumer
# Generates one "kubectl label" command per host in hostvars: hosts in the
# "binderhub" inventory group get hub.jupyter.org/node-purpose=core, every
# other host gets that label removed (trailing "-").
# NOTE(review): the commands run as one shell script without "set -e", so
# presumably only the exit status of the last kubectl call is checked -
# confirm that a failure in the middle should be ignored.
- name: Add hub.jupyter.org/node-purpose label
  ansible.builtin.shell: |
    {% for host in hostvars %}
    {% if host in groups['binderhub'] %}
    kubectl label nodes {{ host }} hub.jupyter.org/node-purpose=core
    {% else %}
    kubectl label nodes {{ host }} hub.jupyter.org/node-purpose-
    {% endif %}
    {% endfor %}
  changed_when: false
# Mirrors inventory group membership onto node labels: for every host and
# every inventory group, the label <group>=true is set when the host belongs
# to the group and removed otherwise.
# NOTE(review): groups presumably also contains Ansible's built-in "all" and
# "ungrouped" groups, which would become node labels too - verify.
- name: Add labels from inventory
  ansible.builtin.shell: |
    {% for host in hostvars %}
    {% for group, host_list in groups.items() %}
    {% if host in host_list %}
    kubectl label nodes {{ host }} {{ group }}=true
    {% else %}
    kubectl label nodes {{ host }} {{ group }}-
    {% endif %}
    {% endfor %}
    {% endfor %}
  changed_when: false
b/ansible/vault/gesis-acceptance.yml @@ -0,0 +1,25 @@ +$ANSIBLE_VAULT;1.1;AES256 +33363162666433643064343861356331353636383861613134396366653734346162663739316437 +3036656631303962643932353761343133316231313138330a396438633163613839323534376262 +65346530316334633337646632343161343538393566626164353332353733363135376363343836 +6634333333346638320a646630313964303832616531383830373738383734646536383630613062 +34333138303366353366633465316435363435373466366532363339356135653231366163336165 +37366536633934316433656532633837636635303838656339643638383263656461303161643037 +37313133376231613963613938313866353231343833383161356333333035613231623838616461 +36376563353138636632326638626239646564313363623565396564303066336564656235343935 +65643634646138663134316439653065656662363635666466333937323337366564653937616136 +33623963386336353231386135623866616632366365663036646439623331666535306362353562 +34613439666232616138393237323635363432353165346665383037303032636235643735356164 +32643363613137356337396563366534353139343237396539616365656439323966326434653864 +64656662646234363734626233313235643338356336333038323134393439626564656538613363 +38633735616437396364663565316263303438336636313539623630633066633636326333666530 +33663330343036613566346164396636323965666132663361343133666138623938383831613934 +32386562656531393162626435333639643232326265666562346537346334313961393761366234 +37396231343665363533356339636435313934636464353130386566376330356461613531306237 +30333339343637663962303264653732346562303131616364386565383262343936313162393766 +66333333653339633766316536356661333062656162316131346436613062333132623332363537 +63653736363362363562646265383537613366643535346665316432336266373132306463376436 +65656438353361373132393766356639616435623766333437636635383263313938633935393961 +30323963336563366135316565613137383136333964366437636633346237366130393537616662 +36313965623130666237393337366431616635653235623961623432646664643032323038393430 +62316439383963643964 diff --git 
a/ansible/vault/gesis-production.yml b/ansible/vault/gesis-production.yml new file mode 100644 index 000000000..0576d46f5 --- /dev/null +++ b/ansible/vault/gesis-production.yml @@ -0,0 +1,18 @@ +$ANSIBLE_VAULT;1.1;AES256 +32626233376562376639323233666538613863613765326261366535656434663931306235623132 +3561333630333337376461663662663165396630303962310a386331373832366237653436643836 +38666333643435393864666135303731663732343030336561656631663861303338613461343561 +3132653334336139610a336364343431376537316532626332646438656334646331663330646632 +33313237303330346462616562313564623732653435313365333166376162313061656131626536 +31356434663062626633616234393165323632376231656161303563633436396230363533643130 +62653435623037633461623134393132383833306563313938323338633232633363376466393064 +30636134346636616533333935663565336134303063646332633863626230616662643431656539 +30353664633961633263333435336232663538393431316662353336666365373066323066633131 +63613562663466343865306532333565363362386235643962343234613562303164303638623365 +63386635316531356238326364376334663934316661336537663561623664306133356134363661 +32353133333736613063363130303761363966653562613631623436333236366334303030303938 +63613238656662343037373932333933396538376565646434316530616461303032326263646161 +35353337343065343465666538346531633164623932393935316666326337303133613134373835 +33323561663337313230656136376561373665306161353338373333333134313464343266373365 +64333130393738656331666165383963613139613766363732306230393764623866653330373764 +6537 diff --git a/config/gesis-stage.yaml b/config/gesis-stage.yaml new file mode 100644 index 000000000..638c366d8 --- /dev/null +++ b/config/gesis-stage.yaml @@ -0,0 +1,169 @@ +analyticsPublisher: + enabled: false +binderhub: + nodeSelector: + ingress: "true" + config: + BinderHub: + base_url: /binder/ + build_node_selector: + binderhub: "true" + hub_url: https://notebooks-test.gesis.org/binder/jupyter/ + image_prefix: gesiscss/binder-r2d-g5b5b759- + 
template_path: /etc/binderhub/templates + use_registry: true + KubernetesBuildExecutor: + memory_limit: 3G + memory_request: 1G + node_selector: + binderhub: "true" + docker_available: "true" + LaunchQuota: + total_quota: 30 + extraConfig: + 01-template-variables: > + template_vars = { + "gesis_notebooks_https": 'https://notebooks-test.gesis.org/', + 'production': False, + } + + template_vars['gesis_notebooks_static'] = + template_vars['gesis_notebooks_https'] + "static/" + + template_vars['gesis_web_frontend_framework'] = + template_vars['gesis_notebooks_static'] + "gesis-web-frontend-framework/" + + template_vars['binder_static'] = template_vars['gesis_notebooks_https'] + + "binder/static/" + + c.BinderHub.template_variables.update(template_vars) + 02-badge-base-url: | + c.BinderHub.badge_base_url = "https://mybinder.org/" + extraEnv: + GOOGLE_APPLICATION_CREDENTIALS: /secrets/service-account.json + extraVolumeMounts: + - mountPath: /secrets + name: secrets + readOnly: true + extraVolumes: + - name: secrets + secret: + secretName: events-archiver-secrets + imageCleaner: + enabled: true + imageGCThresholdHigh: 80e9 + imageGCThresholdLow: 50e9 + imageGCThresholdType: absolute + ingress: + hosts: + - notebooks-test.gesis.org + jupyterhub: + hub: + baseUrl: /jupyterhub + db: + pvc: + storageClassName: local-storage + nodeSelector: + jupyterhub: "true" + singleuser: + nodeSelector: + jupyterhub_single_user: "true" + ingress: + hosts: + - notebooks-test.gesis.org + replicas: 1 +cryptnono: + enabled: true +grafana: + dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - disableDeletion: true + editable: false + folder: notebooks.gesis.org + name: default + options: + path: /var/lib/grafana/dashboards/notebooks.gesis.org + orgId: 1 + type: file + datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - editable: false + isDefault: true + name: GESIS Notebooks Prometheus + orgId: 1 + type: prometheus + uid: gesis-notebooks-prometheus + 
url: http://binderhub-prometheus-server + prune: true + deploymentStrategy: + type: Recreate + enabled: true + grafana.ini: + auth.anonymous: + enabled: true + org_name: Main Org. + org_role: Viewer + auth.basic: + enabled: true + security: + allow_embedding: true + server: + http_port: 3000 + root_url: https://notebooks.gesis.org/grafana/ + smtp: + enabled: true + ingress: + hosts: + - notebooks-test.gesis.org + path: /grafana + nodeSelector: + grafana: "true" + persistence: + enabled: false + resources: + limits: + cpu: "0.25" + memory: 128Mi + requests: + cpu: "0" + memory: 128Mi +ingress-nginx: + controller: + replicaCount: 1 + nodeSelector: + ingress: "true" + hostPort: + enable: true + scope: + enabled: true + service: + externalTrafficPolicy: null + type: LoadBalancer +prometheus: + enabled: true + server: + ingress: + hosts: + - notebooks-test.gesis.org + path: /prometheus + livenessProbeInitialDelay: 800 + persistentVolume: + size: 10Gi + storageClass: local-storage + resources: + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: "1" + memory: 1Gi + retention: 30d +static: + ingress: + hosts: + - static.notebooks-test.gesis.org +url: https://notebooks-test.gesis.org/binder/ diff --git a/docs/.gitattributes b/docs/.gitattributes new file mode 100644 index 000000000..07fe41c52 --- /dev/null +++ b/docs/.gitattributes @@ -0,0 +1,2 @@ +# GitHub syntax highlighting +pixi.lock linguist-language=YAML linguist-generated=true diff --git a/docs/source/deployment/gesis-diagram.svg b/docs/source/deployment/gesis-diagram.svg new file mode 100755 index 000000000..f27e6965f --- /dev/null +++ b/docs/source/deployment/gesis-diagram.svg @@ -0,0 +1,3 @@ + + +notebooks.gesis.orgGoogle CloudGESIS GitLabGitHub ActionsGitHubnotebooks.gesis.orgGoogle CloudGESIS GitLabGitHub ActionsGitHubDevelopergit commitgit pushtriggervalidationhelm upgradetriggervalidationhelm upgradeDeveloper \ No newline at end of file diff --git a/docs/source/deployment/gesis-load-balancer.drawio.svg 
b/docs/source/deployment/gesis-load-balancer.drawio.svg new file mode 100755 index 000000000..7d60733f8 --- /dev/null +++ b/docs/source/deployment/gesis-load-balancer.drawio.svg @@ -0,0 +1,4 @@ + + + +
User
GESIS on-premise physical server
Virtual Private Server 1
Kubernetes Node
Virtual Private Server 2
Kubernetes Node
Ingress NGINX
Controller
mybinder.org
\ No newline at end of file diff --git a/docs/source/deployment/gesis.md b/docs/source/deployment/gesis.md new file mode 100644 index 000000000..039ccb9ea --- /dev/null +++ b/docs/source/deployment/gesis.md @@ -0,0 +1,40 @@ +# How to deploy a change to notebooks.gesis.org? + +[GESIS Leibniz Institute for the Social Sciences](https://www.gesis.org) is a member of the [mybinder.org federation](https://mybinder.readthedocs.io/en/latest/about/status.html). GESIS has on-premise servers and uses them for the mybinder.org server. The use of on-premise servers requires a separate deployment because access to the servers over SSH requires tunnelling through a VPN. + + + +![Sequence diagram illustrating the deployment.](./gesis-diagram.svg) + +## GESIS GitLab CI/CD Server + +GESIS GitLab server runs [GitLab Community Edition v16.11.6](https://gitlab.com/gitlab-org/gitlab-foss/-/tags/v16.11.6) with [continuous integration (CI) and continuous delivery (CD)](https://about.gitlab.com/topics/ci-cd/) enabled. + +The CI/CD jobs are defined in [`.gitlab-ci.yml`](https://github.com/jupyterhub/mybinder.org-deploy/tree/main/.gitlab-ci.yml). + +## Kubernetes on bare metal + +Cloud environments provide a load balancer for Kubernetes clusters. Unfortunately, Kubernetes does not include a default implementation of a load balancer for clusters running on bare metal. Because of this, the deployment of mybinder.org to GESIS servers must include the configuration of a load balancer. We are using [MetalLB](https://metallb.universe.tf/) with [Ingress NGINX Controller](https://kubernetes.github.io/ingress-nginx/). + +![Sequence diagram illustrating the load balancer.](./gesis-load-balancer.drawio.svg) + +## Virtual Private Server configuration with Ansible + +We use [Ansible](https://www.ansible.com/) to automate the configuration of the virtual private server (VPS) provided by GESIS.
After a successful configuration, we will have an operational Kubernetes cluster to deploy mybinder.org. diff --git a/docs/source/deployment/index.rst b/docs/source/deployment/index.rst index 5ab6e4f25..b9a7224f0 100644 --- a/docs/source/deployment/index.rst +++ b/docs/source/deployment/index.rst @@ -8,3 +8,4 @@ Deployment and Operation prereqs how what + gesis diff --git a/mybinder/templates/minesweeper/configmap.yaml b/mybinder/templates/minesweeper/configmap.yaml index a083cb0b7..41fd2a95f 100644 --- a/mybinder/templates/minesweeper/configmap.yaml +++ b/mybinder/templates/minesweeper/configmap.yaml @@ -1,4 +1,6 @@ -{{- /* configmap for minesweeper source files */ -}} +{{- /* +configmap for minesweeper source files +*/}} kind: ConfigMap apiVersion: v1 metadata: @@ -12,7 +14,9 @@ data: {{- (.Files.Glob "files/minesweeper/*").AsConfig | nindent 2 }} {{- (.Files.Glob "files/minesweeper/secrets/*").AsConfig | nindent 2 }} --- -{{- /* configmap for minesweeper configuration from values */ -}} +{{- /* +configmap for minesweeper configuration from values +*/}} kind: ConfigMap apiVersion: v1 metadata: diff --git a/secrets/config/common/gesis.yaml b/secrets/config/common/gesis.yaml new file mode 100644 index 000000000..a62d2ecc6 Binary files /dev/null and b/secrets/config/common/gesis.yaml differ diff --git a/secrets/config/gesis-stage.yaml b/secrets/config/gesis-stage.yaml new file mode 100644 index 000000000..7e5795906 Binary files /dev/null and b/secrets/config/gesis-stage.yaml differ