diff --git a/.github/workflows/build-containers.yml b/.github/workflows/build-containers.yml index db15721..7ca4c39 100644 --- a/.github/workflows/build-containers.yml +++ b/.github/workflows/build-containers.yml @@ -3,9 +3,8 @@ on: push: paths: - .github/workflows/build-containers.yml - - Dockerfile - - docker-entrypoint.sh - workflow_dispatch: + - image/** + workflow_dispatch: jobs: build_push_api: @@ -49,6 +48,7 @@ jobs: with: provenance: false push: true + context: image/ tags: ${{ steps.image-meta.outputs.tags }} labels: ${{ steps.image-meta.outputs.labels }} cache-from: type=local,src=/tmp/.buildx-cache diff --git a/.github/workflows/publish-helm-chart.yml b/.github/workflows/publish-helm-chart.yml index 7ad4374..516e388 100644 --- a/.github/workflows/publish-helm-chart.yml +++ b/.github/workflows/publish-helm-chart.yml @@ -1,37 +1,26 @@ -name: Release Charts - -on: - push: - branches: - - master - +name: Publish charts +# Run the tasks on every push +on: push jobs: - release: - # depending on default permission settings for your org (contents being read-only or read-write for workloads), you will have to add permissions - # see: https://docs.github.com/en/actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token - permissions: - contents: write + publish_charts: + name: Build and push Helm charts runs-on: ubuntu-latest steps: - - name: Checkout - uses: actions/checkout@v3 + - name: Check out the repository + uses: actions/checkout@v2 with: + # This is important for the semver action to work correctly + # when determining the number of commits since the last tag fetch-depth: 0 + submodules: true - - name: Configure Git - run: | - git config user.name "$GITHUB_ACTOR" - git config user.email "$GITHUB_ACTOR@users.noreply.github.com" - - - name: Install Helm - uses: azure/setup-helm@v3 - env: - GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" + - name: Get SemVer version for current commit + id: semver + uses: stackhpc/github-actions/semver@master - - name: Run chart-releaser - uses: helm/chart-releaser-action@v1.5.0 + - name: Publish Helm charts + uses: stackhpc/github-actions/helm-publish@master with: - charts_dir: . - env: - CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" - + token: ${{ secrets.GITHUB_TOKEN }} + version: ${{ steps.semver.outputs.version }} + app-version: ${{ steps.semver.outputs.short-sha }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0ba5327 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +# Build artifacts from local helm install +slurm-cluster-chart/Chart.lock +slurm-cluster-chart/charts/ diff --git a/README.md b/README.md index a23317e..c0b7d61 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,7 @@ # Slurm Docker Cluster -This is a multi-container Slurm cluster using Kubernetes. The Helm chart -creates a named volume for persistent storage of MySQL data files as well as -an NFS volume for shared storage. +This is a multi-container Slurm cluster using Kubernetes. The Slurm cluster Helm chart creates a named volume for persistent storage of MySQL data files. By default, it also installs the +RookNFS Helm chart (also in this repo) to provide shared storage across the Slurm cluster nodes. 
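The container image build now uses `image/` as its Docker build context (note the new `image/**` path trigger and the added `context: image/` input above). A minimal local equivalent, assuming the Dockerfile and entrypoint now live under `image/`, might be:

```console
# Build the Slurm container image from the relocated build context
docker build -t slurm-docker-cluster:local image/
```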
## Dependencies @@ -27,12 +26,11 @@ The Helm chart will create the following named volumes: * var_lib_mysql ( -> /var/lib/mysql ) -A named ReadWriteMany (RWX) volume mounted to `/home` is also expected, this can be external or can be deployed using the scripts in the `/nfs` directory (See "Deploying the Cluster") +A named ReadWriteMany (RWX) volume mounted to `/home` is also expected; this can be external or can be deployed using the provided `rooknfs` chart directory (see "Deploying the Cluster"). ## Configuring the Cluster -All config files in `slurm-cluster-chart/files` will be mounted into the container to configure their respective services on startup. Note that changes to these files will not all be propagated to existing deployments (see "Reconfiguring the Cluster"). -Additional parameters can be found in the `values.yaml` file, which will be applied on a Helm chart deployment. Note that some of these values will also not propagate until the cluster is restarted (see "Reconfiguring the Cluster"). +All config files in `slurm-cluster-chart/files` will be mounted into the container to configure their respective services on startup. Note that changes to these files will not all be propagated to existing deployments (see "Reconfiguring the Cluster"). Additional parameters can be found in the `values.yaml` file for the Helm chart. Note that some of these values will also not propagate until the cluster is restarted (see "Reconfiguring the Cluster"). ## Deploying the Cluster @@ -40,27 +38,26 @@ Additional parameters can be found in the `values.yaml` file, which will be appl On initial deployment ONLY, run ```console -./generate-secrets.sh +./generate-secrets.sh [<namespace>] ``` -This generates a set of secrets. If these need to be regenerated, see "Reconfiguring the Cluster" +This generates a set of secrets in the target namespace to be used by the Slurm cluster. If these need to be regenerated, see "Reconfiguring the Cluster" Be sure to take note of the Open OnDemand credentials; you will need them to access the cluster through a browser ### Connecting RWX Volume -A ReadWriteMany (RWX) volume is required, if a named volume exists, set `nfs.claimName` in the `values.yaml` file to its name. If not, manifests to deploy a Rook NFS volume are provided in the `/nfs` directory. You can deploy this by running -```console -/nfs/deploy-nfs.sh -``` -and leaving `nfs.claimName` as the provided value. +A ReadWriteMany (RWX) volume is required for shared storage across cluster nodes. By default, the Rook NFS Helm chart is installed as a dependency of the Slurm cluster chart in order to provide an RWX-capable StorageClass for the required shared volume. If the target Kubernetes cluster has an existing storage class which should be used instead, then `storage.storageClassName` in `values.yaml` should be set to the name of this existing class and the RookNFS dependency should be disabled by setting `rooknfs.enabled = false`. In either case, the storage capacity of the provisioned RWX volume can be configured by setting the value of `storage.capacity`. + +See the separate RookNFS chart [values.yaml](./rooknfs/values.yaml) for further configuration options when using RookNFS to provide the shared storage volume. ### Supplying Public Keys To access the cluster via `ssh`, you will need to make your public keys available. All your public keys from localhost can be added by running ```console -./publish-keys.sh +./publish-keys.sh [<namespace>] ``` +where `<namespace>` is the namespace in which the Slurm cluster chart will be deployed (i.e. 
using `helm install -n ...`). This will create a Kubernetes Secret in the appropriate namespace for the Slurm cluster to use. Omitting the namespace arg will install the secrets in the default namespace. ### Deploying with Helm @@ -68,6 +65,12 @@ After configuring `kubectl` with the appropriate `kubeconfig` file, deploy the c ```console helm install <deployment-name> slurm-cluster-chart ``` + +NOTE: If using the RookNFS dependency, then the following must be run before installing the Slurm cluster chart +```console +helm dependency update slurm-cluster-chart +``` + Subsequent releases can be deployed using: ```console @@ -130,15 +133,33 @@ srun singularity exec docker://ghcr.io/stackhpc/mpitests-container:${MPI_CONTAIN ``` Note: The mpirun script assumes you are running as user 'rocky'. If you are running as root, you will need to include the --allow-run-as-root argument + ## Reconfiguring the Cluster ### Changes to config files -To guarantee changes to config files are propagated to the cluster, use +Changes to the Slurm configuration in `slurm-cluster-chart/files/slurm.conf` will be propagated (it may take a few seconds) to `/etc/slurm/slurm.conf` for all pods except the `slurmdbd` pod by running + ```console -kubectl rollout restart deployment +helm upgrade <deployment-name> slurm-cluster-chart/ ``` -Generally restarts to `slurmd`, `slurmctld`, `login` and `slurmdbd` will be required + +The new Slurm configuration can then be read by running `scontrol reconfigure` as root inside a Slurm pod. The [slurm.conf documentation](https://slurm.schedmd.com/slurm.conf.html) notes that some changes require a restart of all daemons, which here requires redeploying the Slurm pods as described below. + +Changes to other configuration files (e.g. the MUNGE key) require a redeploy of the appropriate pods. + +To redeploy pods use: +```console +kubectl rollout restart deployment +``` +for the `slurmdbd`, `login` and `mysql` pods and + +``` +kubectl rollout restart statefulset +``` +for the `slurmd` and `slurmctld` pods + +Generally, restarts of `slurmd`, `slurmctld`, `login` and `slurmdbd` will be required. ### Changes to secrets @@ -156,3 +177,5 @@ and then restart the other dependent deployments to propagate changes: ```console kubectl rollout restart deployment slurmd slurmctld login slurmdbd ``` + +# Known Issues diff --git a/generate-secrets.sh b/generate-secrets.sh deleted file mode 100755 index e98b97e..0000000 --- a/generate-secrets.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash - -kubectl create secret generic database-auth-secret \ ---dry-run=client \ ---from-literal=password=$(tr -dc 'A-Za-z0-9' /dev/null | base64 -w 0) \ --o yaml | \ -kubectl apply -f - - -mkdir -p ./temphostkeys/etc/ssh -ssh-keygen -A -f ./temphostkeys -kubectl create secret generic host-keys-secret \ ---dry-run=client \ ---from-file=./temphostkeys/etc/ssh \ --o yaml | \ -kubectl apply -f - -rm -rf ./temphostkeys - -OOD_PASS=$(tr -dc 'A-Za-z0-9' Copying MUNGE key ..." + cp /tmp/munge.key /etc/munge/munge.key + chown munge:munge /etc/munge/munge.key + + echo "---> Starting the MUNGE Authentication service (munged) ..." + gosu munge /usr/sbin/munged "$@" +} if [ "$1" = "slurmdbd" ] then - echo "---> Starting the MUNGE Authentication service (munged) ..." - gosu munge /usr/sbin/munged + + start_munge echo "---> Starting the Slurm Database Daemon (slurmdbd) ..." 
- cp /tempmounts/slurmdbd.conf /etc/slurm/slurmdbd.conf + cp /tmp/slurmdbd.conf /etc/slurm/slurmdbd.conf echo "StoragePass=${StoragePass}" >> /etc/slurm/slurmdbd.conf chown slurm:slurm /etc/slurm/slurmdbd.conf chmod 600 /etc/slurm/slurmdbd.conf @@ -29,13 +32,12 @@ then } echo "-- Database is now active ..." - exec gosu slurm /usr/sbin/slurmdbd -Dvvv -fi + exec gosu slurm /usr/sbin/slurmdbd -D "${@:2}" -if [ "$1" = "slurmctld" ] +elif [ "$1" = "slurmctld" ] then - echo "---> Starting the MUNGE Authentication service (munged) ..." - gosu munge /usr/sbin/munged + + start_munge echo "---> Waiting for slurmdbd to become active before starting slurmctld ..." @@ -47,17 +49,16 @@ then echo "-- slurmdbd is now active ..." echo "---> Setting permissions for state directory ..." - chown slurm:slurm /var/lib/slurmd + chown slurm:slurm /var/spool/slurmctld echo "---> Starting the Slurm Controller Daemon (slurmctld) ..." if /usr/sbin/slurmctld -V | grep -q '17.02' ; then - exec gosu slurm /usr/sbin/slurmctld -Dvvv + exec gosu slurm /usr/sbin/slurmctld -D "${@:2}" else - exec gosu slurm /usr/sbin/slurmctld -i -Dvvv + exec gosu slurm /usr/sbin/slurmctld -i -D "${@:2}" fi -fi -if [ "$1" = "slurmd" ] +elif [ "$1" = "slurmd" ] then echo "---> Set shell resource limits ..." ulimit -l unlimited @@ -65,8 +66,7 @@ then ulimit -n 131072 ulimit -a - echo "---> Starting the MUNGE Authentication service (munged) ..." - gosu munge /usr/sbin/munged + start_munge echo "---> Waiting for slurmctld to become active before starting slurmd..." @@ -78,24 +78,31 @@ then echo "-- slurmctld is now active ..." echo "---> Starting the Slurm Node Daemon (slurmd) ..." - exec /usr/sbin/slurmd -Z -Dvvv -fi + exec /usr/sbin/slurmd -D "${@:2}" -if [ "$1" = "login" ] +elif [ "$1" = "login" ] then + chown root:root /home + chmod 755 /home + + echo "---> Setting up ssh for user" + mkdir -p /home/rocky/.ssh - cp tempmounts/authorized_keys /home/rocky/.ssh/authorized_keys + cp /tmp/authorized_keys /home/rocky/.ssh/authorized_keys echo "---> Setting permissions for user home directories" - cd /home - for DIR in */; - do USER_TO_SET=$( echo $DIR | sed "s/.$//" ) && (chown -R $USER_TO_SET:$USER_TO_SET $USER_TO_SET || echo "Failed to take ownership of $USER_TO_SET") \ - && (chmod 700 /home/$USER_TO_SET/.ssh || echo "Couldn't set permissions for .ssh directory for $USER_TO_SET") \ - && (chmod 600 /home/$USER_TO_SET/.ssh/authorized_keys || echo "Couldn't set permissions for .ssh/authorized_keys for $USER_TO_SET"); + pushd /home > /dev/null + for DIR in * + do + chown -R $DIR:$DIR $DIR || echo "Failed to change ownership of $DIR" + chmod 700 $DIR/.ssh || echo "Couldn't set permissions for .ssh/ directory of $DIR" + chmod 600 $DIR/.ssh/authorized_keys || echo "Couldn't set permissions for .ssh/authorized_keys for $DIR" done + popd > /dev/null + echo "---> Complete" - echo "Starting sshd" + echo "---> Starting sshd" cp /tempmounts/etc/ssh/* /etc/ssh/ chmod 600 /etc/ssh/ssh_host_dsa_key chmod 600 /etc/ssh/ssh_host_ecdsa_key @@ -103,19 +110,25 @@ then chmod 600 /etc/ssh/ssh_host_rsa_key /usr/sbin/sshd - echo "---> Starting the MUNGE Authentication service (munged) ..." 
- gosu munge /usr/sbin/munged - echo "---> MUNGE Complete" + start_munge echo "---> Setting up self ssh capabilities for OOD" + + if [ -f /home/rocky/.ssh/id_rsa.pub ]; then + echo "ssh keys already found" + else + ssh-keygen -t rsa -f /home/rocky/.ssh/id_rsa -N "" + chown rocky:rocky /home/rocky/.ssh/id_rsa /home/rocky/.ssh/id_rsa.pub + fi + ssh-keyscan localhost > /etc/ssh/ssh_known_hosts echo "" >> /home/rocky/.ssh/authorized_keys #Adding newline to avoid breaking authorized_keys file cat /home/rocky/.ssh/id_rsa.pub >> /home/rocky/.ssh/authorized_keys echo "---> Starting Apache Server" - mkdir --parents /etc/ood/config/apps/shell - env > /etc/ood/config/apps/shell/env + # mkdir --parents /etc/ood/config/apps/shell + # env > /etc/ood/config/apps/shell/env /usr/libexec/httpd-ssl-gencerts /opt/ood/ood-portal-generator/sbin/update_ood_portal @@ -123,24 +136,36 @@ then /usr/bin/htdbm -cb /opt/rh/httpd24/root/etc/httpd/.htpasswd.dbm rocky $ROCKY_OOD_PASS /usr/sbin/httpd -k start -X -e debug -fi -if [ "$1" = "check-queue-hook" ] +elif [ "$1" = "check-queue-hook" ] then - echo "---> Starting the MUNGE Authentication service (munged) ..." - gosu munge /usr/sbin/munged - echo "---> MUNGE Complete" + start_munge - RUNNING_JOBS=$(squeue -t pd,r,cg -h -r | wc -l) + RUNNING_JOBS=$(squeue --states=RUNNING,COMPLETING,CONFIGURING,RESIZING,SIGNALING,STAGE_OUT,STOPPED,SUSPENDED --noheader --array | wc --lines) if [[ $RUNNING_JOBS -eq 0 ]] then - echo "No Slurm jobs in queue, can safely upgrade" exit 0 else - echo "Error: cannot upgrade chart - there are still Slurm jobs in the queue" exit 1 fi -fi -exec "$@" +elif [ "$1" = "generate-keys-hook" ] +then + mkdir -p ./temphostkeys/etc/ssh + ssh-keygen -A -f ./temphostkeys + kubectl create secret generic host-keys-secret \ + --dry-run=client \ + --from-file=./temphostkeys/etc/ssh \ + -o yaml | \ + kubectl apply -f - + + exit 0 + +elif [ "$1" = "debug" ] +then + start_munge --foreground + +else + exec "$@" +fi diff --git a/image/kubernetes.repo b/image/kubernetes.repo new file mode 100644 index 0000000..f4ae4ff --- /dev/null +++ b/image/kubernetes.repo @@ -0,0 +1,6 @@ +[kubernetes] +name=Kubernetes +baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-$basearch +enabled=1 +gpgcheck=1 +gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg diff --git a/nfs/deploy-nfs.sh b/nfs/deploy-nfs.sh deleted file mode 100755 index d46b50f..0000000 --- a/nfs/deploy-nfs.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -# Based on https://rook.io/docs/nfs/v1.7/quickstart.html -# Manifests listed explicitly here to guarantee ordering - -kubectl create -f crds.yaml -kubectl create -f operator.yaml -kubectl create -f rbac.yaml -kubectl create -f nfs.yaml -kubectl create -f sc.yaml -kubectl create -f pvc.yaml diff --git a/nfs/pvc.yaml b/nfs/pvc.yaml deleted file mode 100644 index 7f0a3d7..0000000 --- a/nfs/pvc.yaml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: rook-nfs-pv-claim -spec: - storageClassName: "rook-nfs-share1" - accessModes: - - ReadWriteMany - resources: - requests: - storage: 10Gi diff --git a/nfs/teardown-nfs.sh b/nfs/teardown-nfs.sh deleted file mode 100755 index 4dde364..0000000 --- a/nfs/teardown-nfs.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -kubectl delete -f web-service.yaml -kubectl delete -f web-rc.yaml -kubectl delete -f busybox-rc.yaml -kubectl delete -f pvc.yaml -kubectl delete -f pv.yaml -kubectl delete -f nfs.yaml 
-kubectl delete -f nfs-xfs.yaml -kubectl delete -f nfs-ceph.yaml -kubectl delete -f rbac.yaml -kubectl delete -f psp.yaml -kubectl delete -f scc.yaml # if deployed -kubectl delete -f operator.yaml -kubectl delete -f webhook.yaml # if deployed -kubectl delete -f crds.yaml diff --git a/publish-keys.sh b/publish-keys.sh index d293e81..bdd4e0f 100755 --- a/publish-keys.sh +++ b/publish-keys.sh @@ -1,3 +1,8 @@ -kubectl create configmap authorized-keys-configmap \ +NAMESPACE="$1" +if [[ -z $1 ]]; then + NAMESPACE=default +fi +echo Installing in namespace $NAMESPACE +kubectl -n $NAMESPACE create configmap authorized-keys-configmap \ "--from-literal=authorized_keys=$(cat ~/.ssh/*.pub)" --dry-run=client -o yaml | \ -kubectl apply -f - \ No newline at end of file +kubectl -n $NAMESPACE apply -f - \ No newline at end of file diff --git a/rooknfs/Chart.yaml b/rooknfs/Chart.yaml new file mode 100644 index 0000000..b8abd25 --- /dev/null +++ b/rooknfs/Chart.yaml @@ -0,0 +1,4 @@ +apiVersion: v2 +name: rooknfs +version: 0.0.1 +description: A packaged installation of Rook NFS for Kubernetes. \ No newline at end of file diff --git a/rooknfs/README.md b/rooknfs/README.md new file mode 100644 index 0000000..5b7ad6d --- /dev/null +++ b/rooknfs/README.md @@ -0,0 +1,3 @@ +# RookNFS Helm Chart + +See `values.yaml` for available config options. \ No newline at end of file diff --git a/nfs/crds.yaml b/rooknfs/crds/crds.yaml similarity index 100% rename from nfs/crds.yaml rename to rooknfs/crds/crds.yaml diff --git a/rooknfs/templates/hooks/pre-delete.yaml b/rooknfs/templates/hooks/pre-delete.yaml new file mode 100644 index 0000000..2c75c89 --- /dev/null +++ b/rooknfs/templates/hooks/pre-delete.yaml @@ -0,0 +1,50 @@ +# NOTE: The cleanup jobs defined here are required to ensure that things which +# Rook NFS is responsible for cleaning up are deleted before deleting the Rook +# pods which do the actual clean up of NFS resources. For example, the RWM PVC +# must be deleted before the Rook StorageClass and provisioner pod. However, +# the PVC cannot be deleted until the pods which are using it are deleted, so +# the various Slurm node pods must actually be the first resources deleted. 
+--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: rook-nfs-cleanup +--- +# TODO: Create a job-specific ClusterRole for the ServiceAccount +# instead of using the cluster-admin role here +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: rook-nfs-cleanup +subjects: +- kind: ServiceAccount + name: rook-nfs-cleanup + namespace: {{ .Release.Namespace }} +roleRef: + kind: ClusterRole + name: cluster-admin +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: rook-nfs-pre-delete-cleanup + annotations: + "helm.sh/hook": pre-delete + "helm.sh/hook-delete-policy": hook-succeeded + "helm.sh/hook-weight": "10" +spec: + template: + metadata: + name: rook-nfs-pre-delete-cleanup + spec: + serviceAccountName: rook-nfs-cleanup + containers: + - name: tester + image: bitnami/kubectl + command: + - "bin/bash" + - "-c" + - | + kubectl delete -n {{ .Values.serverNamespace }} nfsservers {{ .Values.serverName }} --wait + restartPolicy: Never +--- \ No newline at end of file diff --git a/nfs/nfs.yaml b/rooknfs/templates/nfs.yaml similarity index 56% rename from nfs/nfs.yaml rename to rooknfs/templates/nfs.yaml index 742fa34..cf7b1de 100644 --- a/nfs/nfs.yaml +++ b/rooknfs/templates/nfs.yaml @@ -3,30 +3,34 @@ apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: nfs-default-claim - namespace: rook-nfs + name: {{ .Values.claimName }} + namespace: {{ .Values.serverNamespace }} spec: + {{- if .Values.backingStorageClass }} + storageClassName: {{ .Values.backingStorageClass }} + {{- end }} accessModes: - ReadWriteMany resources: requests: - storage: 1Gi + storage: {{ .Values.storageCapacity }} --- apiVersion: nfs.rook.io/v1alpha1 kind: NFSServer metadata: - name: rook-nfs - namespace: rook-nfs + name: {{ .Values.serverName }} + namespace: {{ .Values.serverNamespace }} spec: replicas: 1 exports: - - name: share1 + - name: {{ .Values.shareName }} server: accessMode: ReadWrite squash: "none" # A Persistent Volume Claim must be created before creating NFS CRD instance. 
persistentVolumeClaim: - claimName: nfs-default-claim + claimName: {{ .Values.claimName }} # A key/value list of annotations annotations: rook: nfs +--- diff --git a/nfs/operator.yaml b/rooknfs/templates/operator.yaml similarity index 92% rename from nfs/operator.yaml rename to rooknfs/templates/operator.yaml index b289909..56318f6 100644 --- a/nfs/operator.yaml +++ b/rooknfs/templates/operator.yaml @@ -1,13 +1,14 @@ +--- apiVersion: v1 kind: Namespace metadata: - name: rook-nfs-system # namespace:operator + name: {{ .Values.systemNamespace }} --- apiVersion: v1 kind: ServiceAccount metadata: name: rook-nfs-operator - namespace: rook-nfs-system # namespace:operator + namespace: {{ .Values.systemNamespace }} --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 @@ -20,7 +21,7 @@ roleRef: subjects: - kind: ServiceAccount name: rook-nfs-operator - namespace: rook-nfs-system # namespace:operator + namespace: {{ .Values.systemNamespace }} --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole @@ -106,7 +107,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: rook-nfs-operator - namespace: rook-nfs-system # namespace:operator + namespace: {{ .Values.systemNamespace }} labels: app: rook-nfs-operator spec: @@ -134,3 +135,4 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace +--- diff --git a/nfs/rbac.yaml b/rooknfs/templates/rbac.yaml similarity index 90% rename from nfs/rbac.yaml rename to rooknfs/templates/rbac.yaml index 8e3d9f7..422a43b 100644 --- a/nfs/rbac.yaml +++ b/rooknfs/templates/rbac.yaml @@ -2,13 +2,13 @@ apiVersion: v1 kind: Namespace metadata: - name: rook-nfs + name: {{ .Values.serverNamespace }} --- apiVersion: v1 kind: ServiceAccount metadata: name: rook-nfs-server - namespace: rook-nfs + namespace: {{ .Values.serverNamespace }} --- kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 @@ -51,9 +51,9 @@ metadata: subjects: - kind: ServiceAccount name: rook-nfs-server - # replace with namespace where provisioner is deployed - namespace: rook-nfs + namespace: {{ .Values.serverNamespace }} roleRef: kind: ClusterRole name: rook-nfs-provisioner-runner apiGroup: rbac.authorization.k8s.io +--- \ No newline at end of file diff --git a/nfs/sc.yaml b/rooknfs/templates/sc.yaml similarity index 52% rename from nfs/sc.yaml rename to rooknfs/templates/sc.yaml index 6f9e3ae..505bd44 100644 --- a/nfs/sc.yaml +++ b/rooknfs/templates/sc.yaml @@ -1,13 +1,15 @@ +--- apiVersion: storage.k8s.io/v1 kind: StorageClass metadata: labels: app: rook-nfs - name: rook-nfs-share1 + name: {{ .Values.storageClassName }} parameters: - exportName: share1 - nfsServerName: rook-nfs - nfsServerNamespace: rook-nfs + exportName: {{ .Values.shareName }} + nfsServerName: {{ .Values.serverName }} + nfsServerNamespace: {{ .Values.serverNamespace }} provisioner: nfs.rook.io/rook-nfs-provisioner reclaimPolicy: Delete volumeBindingMode: Immediate +--- \ No newline at end of file diff --git a/rooknfs/values.yaml b/rooknfs/values.yaml new file mode 100644 index 0000000..4ada627 --- /dev/null +++ b/rooknfs/values.yaml @@ -0,0 +1,28 @@ + +# Name for the NFSServer resource created by rook +serverName: rook-nfs + +# Name for the created storage class +storageClassName: rook-nfs + +# Name for the Read-Write-Once backing PVC created by Rook +claimName: rook-nfs-backing-pv + +# Storage class to use for the Read-Write-Once backing PVC +backingStorageClass: + +# Name for the NFS share within the NFS Resource instance +shareName: share-1 + +# Size of the Read-Write-Once backing storage volume 
+storageCapacity: 10Gi + +# Image to use for the Rook NFS operator +operatorImage: rook/nfs:master + +# NOTE: For some reason deploying everything in the default +# namespace leads to R-W-M PVCs getting stuck in 'pending' +# state indefinitely, so here we separate out namespaces as +# of various components in the same way as the Rook docs +serverNamespace: rook-nfs +systemNamespace: rook-nfs-system diff --git a/slurm-cluster-chart/Chart.yaml b/slurm-cluster-chart/Chart.yaml index 9e592c0..e3d003c 100644 --- a/slurm-cluster-chart/Chart.yaml +++ b/slurm-cluster-chart/Chart.yaml @@ -21,4 +21,10 @@ version: 0.1.0 # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "1.16.0" \ No newline at end of file +appVersion: "1.16.0" + +dependencies: + - name: rooknfs + version: ">=0-0" + repository: file://../rooknfs + condition: rooknfs.enabled diff --git a/slurm-cluster-chart/files/httpd.conf b/slurm-cluster-chart/files/httpd.conf index 6d3783a..248afb2 100644 --- a/slurm-cluster-chart/files/httpd.conf +++ b/slurm-cluster-chart/files/httpd.conf @@ -1,4 +1,4 @@ -# +# Modified from file installed by httpd package # This is the main Apache HTTP server configuration file. It contains the # configuration directives that give the server its instructions. # See for detailed information. diff --git a/slurm-cluster-chart/files/ood_portal.yaml b/slurm-cluster-chart/files/ood_portal.yaml index 4eee040..d5227b2 100644 --- a/slurm-cluster-chart/files/ood_portal.yaml +++ b/slurm-cluster-chart/files/ood_portal.yaml @@ -1,3 +1,4 @@ +# Modified from file installed by ondemand package --- # # Portal configuration @@ -29,6 +30,9 @@ # - 'SSLCertificateKeyFile "/etc/pki/tls/private/www.example.com.key"' # Default: null (no SSL support) #ssl: null +ssl: +- 'SSLCertificateFile "/etc/pki/tls/certs/localhost.crt"' +- 'SSLCertificateKeyFile "/etc/pki/tls/private/localhost.key"' # Root directory of log files (can be relative ServerRoot) # Example: diff --git a/slurm-cluster-chart/files/slurm.conf b/slurm-cluster-chart/files/slurm.conf index eda002f..a10c12b 100644 --- a/slurm-cluster-chart/files/slurm.conf +++ b/slurm-cluster-chart/files/slurm.conf @@ -9,7 +9,7 @@ SlurmUser=slurm SlurmctldPort=6817 SlurmdPort=6818 AuthType=auth/munge -StateSaveLocation=/var/lib/slurmd +StateSaveLocation=/var/spool/slurmctld SlurmdSpoolDir=/var/spool/slurmd SwitchType=switch/none MpiDefault=pmix @@ -20,7 +20,7 @@ ReturnToService=2 # # TIMERS SlurmctldTimeout=300 -SlurmdTimeout=300 +SlurmdTimeout=30 InactiveLimit=0 MinJobAge=300 KillWait=30 @@ -47,10 +47,14 @@ AccountingStorageType=accounting_storage/slurmdbd AccountingStorageHost=slurmdbd AccountingStoragePort=6819 # -# +SlurmctldParameters=cloud_dns,cloud_reg_addrs +CommunicationParameters=NoAddrCache + +# NODES MaxNodeCount=10 +NodeName=slurmd-[0-9] State=FUTURE CPUs=4 + # PARTITIONS PartitionName=all Default=yes Nodes=ALL -TreeWidth=65533 PropagateResourceLimitsExcept=MEMLOCK diff --git a/slurm-cluster-chart/templates/database-auth-secret.yaml b/slurm-cluster-chart/templates/database-auth-secret.yaml new file mode 100644 index 0000000..1a1d6ea --- /dev/null +++ b/slurm-cluster-chart/templates/database-auth-secret.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Secret +metadata: + name: database-auth-secret + annotations: + helm.sh/hook: pre-install +data: + password: {{ randAlphaNum 32 | b64enc }} diff --git 
a/slurm-cluster-chart/templates/generate-keys-hook.yaml b/slurm-cluster-chart/templates/generate-keys-hook.yaml new file mode 100644 index 0000000..c05e7f2 --- /dev/null +++ b/slurm-cluster-chart/templates/generate-keys-hook.yaml @@ -0,0 +1,22 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: generate-keys-hook + annotations: + "helm.sh/hook": pre-install + "helm.sh/hook-delete-policy": hook-succeeded + "helm.sh/hook-weight": "3" +spec: + backoffLimit: 0 + ttlSecondsAfterFinished: 0 + template: + metadata: + name: generate-keys-hook + spec: + serviceAccountName: secret-generator-account + restartPolicy: Never + containers: + - name: generate-keys-hook + image: {{ .Values.slurmImage }} + args: + - generate-keys-hook diff --git a/slurm-cluster-chart/templates/helm-authorized-keys-configmap.yaml b/slurm-cluster-chart/templates/helm-authorized-keys-configmap.yaml new file mode 100644 index 0000000..75ad249 --- /dev/null +++ b/slurm-cluster-chart/templates/helm-authorized-keys-configmap.yaml @@ -0,0 +1,9 @@ +#Only applied if sshPublicKey provided in values.yaml, if not assumes you have run publish-keys.sh prior to helm release +{{ if .Values.sshPublicKey }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: helm-authorized-keys-configmap +data: + authorized_keys: {{ .Values.sshPublicKey }} +{{ end }} diff --git a/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml b/slurm-cluster-chart/templates/hooks/check-jobs-finished-hook.yaml similarity index 83% rename from slurm-cluster-chart/templates/check-jobs-finished-hook.yaml rename to slurm-cluster-chart/templates/hooks/check-jobs-finished-hook.yaml index be70975..79e93eb 100644 --- a/slurm-cluster-chart/templates/check-jobs-finished-hook.yaml +++ b/slurm-cluster-chart/templates/hooks/check-jobs-finished-hook.yaml @@ -15,20 +15,20 @@ spec: restartPolicy: Never containers: - name: check-jobs-finished-hook - image: {{ .Values.sdcImage }} + image: {{ .Values.slurmImage }} args: - check-queue-hook volumeMounts: - - mountPath: /tempmounts/munge.key + - mountPath: /tmp/munge.key name: munge-key-secret subPath: munge.key - - mountPath: /etc/slurm/slurm.conf + - mountPath: /etc/slurm/ name: slurm-config-volume - subPath: slurm.conf volumes: - name: munge-key-secret secret: secretName: {{ .Values.secrets.mungeKey }} + defaultMode: 0400 - name: slurm-config-volume configMap: name: {{ .Values.configmaps.slurmConf }} diff --git a/slurm-cluster-chart/templates/hooks/pre-delete.yaml b/slurm-cluster-chart/templates/hooks/pre-delete.yaml new file mode 100644 index 0000000..868cbbd --- /dev/null +++ b/slurm-cluster-chart/templates/hooks/pre-delete.yaml @@ -0,0 +1,55 @@ +{{- if .Values.rooknfs.enabled }} +# NOTE: The cleanup jobs defined here are required to ensure that things which +# Rook NFS is responsible for cleaning up are deleted before deleting the Rook +# pods which do the actual clean up of NFS resources. For example, the RWM PVC +# must be deleted before the Rook StorageClass and provisioner pod. However, +# the PVC cannot be deleted until the pods which are using it are deleted, so +# the various Slurm node pods must actually be the first resources deleted. 
+--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: slurm-k8s-cleanup +--- +# TODO: Create a job-specific ClusterRole for the ServiceAccount +# instead of using the cluster-admin role here +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: slurm-k8s-cleanup +subjects: +- kind: ServiceAccount + name: slurm-k8s-cleanup + namespace: {{ .Release.Namespace }} +roleRef: + kind: ClusterRole + name: cluster-admin +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: slurm-k8s-pre-delete-cleanup + annotations: + "helm.sh/hook": pre-delete + "helm.sh/hook-delete-policy": hook-succeeded + "helm.sh/hook-weight": "1" +spec: + template: + metadata: + name: slurm-k8s-pre-delete-cleanup + spec: + serviceAccountName: slurm-k8s-cleanup + containers: + - name: tester + image: bitnami/kubectl + command: + - "bin/bash" + - "-c" + - | + kubectl delete -n {{ .Release.Namespace }} deployment {{ .Values.login.name }} --wait --cascade=foreground + kubectl delete -n {{ .Release.Namespace }} statefulset {{ .Values.slurmctld.name }} --wait --cascade=foreground + kubectl delete -n {{ .Release.Namespace }} statefulset {{ .Values.slurmd.name }} --wait --cascade=foreground + kubectl delete -n {{ .Release.Namespace }} pvc {{ .Values.storage.claimName }} --wait + restartPolicy: Never +--- +{{- end }} diff --git a/slurm-cluster-chart/templates/login-service.yaml b/slurm-cluster-chart/templates/login-service.yaml index fee3480..df8892d 100644 --- a/slurm-cluster-chart/templates/login-service.yaml +++ b/slurm-cluster-chart/templates/login-service.yaml @@ -8,13 +8,17 @@ metadata: name: login spec: ports: - - name: "ssh" + - name: ssh port: 22 targetPort: 22 - - name: "apache" + - name: apache port: 80 targetPort: 80 protocol: TCP + - name: https + port: 443 + targetPort: 443 + protocol: TCP type: LoadBalancer selector: app.kubernetes.io/name: slurm diff --git a/slurm-cluster-chart/templates/login-deployment.yaml b/slurm-cluster-chart/templates/login.yaml similarity index 77% rename from slurm-cluster-chart/templates/login-deployment.yaml rename to slurm-cluster-chart/templates/login.yaml index 1c795ea..52cc83e 100644 --- a/slurm-cluster-chart/templates/login-deployment.yaml +++ b/slurm-cluster-chart/templates/login.yaml @@ -5,9 +5,9 @@ metadata: labels: app.kubernetes.io/name: slurm app.kubernetes.io/component: login - name: login + name: {{ .Values.login.name }} spec: - replicas: {{ .Values.replicas.login }} + replicas: {{ .Values.login.replicas }} selector: matchLabels: app.kubernetes.io/name: slurm @@ -24,31 +24,28 @@ spec: containers: - args: - login - image: {{ .Values.sdcImage }} + image: {{ .Values.slurmImage }} name: login env: - name: ROCKY_OOD_PASS - valueFrom: - secretKeyRef: - name: htdbm-secret - key: password + value: {{ .Values.openOnDemand.password }} ports: - containerPort: 22 - containerPort: 80 + - containerPort: 443 volumeMounts: - - mountPath: {{ .Values.nfs.mountPath }} + - mountPath: {{ .Values.storage.mountPath }} name: slurm-jobdir - - mountPath: /etc/slurm/slurm.conf + - mountPath: /etc/slurm/ name: slurm-config-volume - subPath: slurm.conf - - mountPath: /tempmounts/munge.key + - mountPath: /tmp/munge.key name: munge-key-secret subPath: munge.key - mountPath: /etc/ssh/sshd_config subPath: sshd_config name: sshd-config-configmap - name: authorized-keys - mountPath: /tempmounts/authorized_keys + mountPath: /tmp/authorized_keys subPath: authorized_keys - name: ood-portal mountPath: /etc/ood/config/ood_portal.yml @@ -66,11 +63,14 @@ spec: mountPath: 
/tempmounts/etc/ssh resources: {} hostname: login + dnsConfig: + searches: + - slurmd.{{ .Release.Namespace }}.svc.cluster.local restartPolicy: Always volumes: - name: slurm-jobdir persistentVolumeClaim: - claimName: {{ .Values.nfs.claimName }} + claimName: {{ .Values.storage.claimName }} - name: slurm-config-volume configMap: name: {{ .Values.configmaps.slurmConf }} @@ -80,9 +80,14 @@ spec: - name: munge-key-secret secret: secretName: {{ .Values.secrets.mungeKey }} + defaultMode: 0400 - name: authorized-keys configMap: - name: {{ .Values.configmaps.authorizedKeys }} + {{ if .Values.sshPublicKey }} + name: helm-authorized-keys-configmap + {{ else }} + name: authorized-keys-configmap + {{ end }} - name: cluster-config configMap: name: cluster-config diff --git a/slurm-cluster-chart/templates/munge-key-secret.yaml b/slurm-cluster-chart/templates/munge-key-secret.yaml new file mode 100644 index 0000000..df97e19 --- /dev/null +++ b/slurm-cluster-chart/templates/munge-key-secret.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Secret +metadata: + name: munge-key-secret + annotations: + helm.sh/hook: pre-install +data: + munge.key: {{ randAscii 128 | b64enc }} diff --git a/slurm-cluster-chart/templates/mysql-deployment.yaml b/slurm-cluster-chart/templates/mysql-deployment.yaml index 8ffd49e..96dc88f 100644 --- a/slurm-cluster-chart/templates/mysql-deployment.yaml +++ b/slurm-cluster-chart/templates/mysql-deployment.yaml @@ -34,7 +34,7 @@ spec: value: "yes" - name: MYSQL_USER value: "slurm" - image: {{ .Values.sqlImage }} + image: {{ .Values.database.image }} name: mysql ports: - containerPort: 3306 diff --git a/slurm-cluster-chart/templates/mysql-service.yaml b/slurm-cluster-chart/templates/mysql-service.yaml index 349dfee..a7d58cc 100644 --- a/slurm-cluster-chart/templates/mysql-service.yaml +++ b/slurm-cluster-chart/templates/mysql-service.yaml @@ -8,7 +8,7 @@ metadata: name: mysql spec: ports: - - name: "3306" + - name: mysql port: 3306 targetPort: 3306 selector: diff --git a/slurm-cluster-chart/templates/pvc.yaml b/slurm-cluster-chart/templates/pvc.yaml new file mode 100644 index 0000000..aab0856 --- /dev/null +++ b/slurm-cluster-chart/templates/pvc.yaml @@ -0,0 +1,12 @@ +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ .Values.storage.claimName }} +spec: + storageClassName: {{ .Values.storage.storageClassName }} + accessModes: + - ReadWriteMany + resources: + requests: + storage: {{ .Values.storage.capacity }} \ No newline at end of file diff --git a/slurm-cluster-chart/templates/secret-generator-role.yaml b/slurm-cluster-chart/templates/secret-generator-role.yaml new file mode 100644 index 0000000..da914be --- /dev/null +++ b/slurm-cluster-chart/templates/secret-generator-role.yaml @@ -0,0 +1,28 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: secret-generator-role + annotations: + "helm.sh/hook": pre-install + "helm.sh/hook-weight": "1" +rules: +- apiGroups: [""] # "" indicates the core API group + resources: ["secrets"] + verbs: ["get","apply","create", "patch"] + +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: secret-generator-rolebinding + annotations: + "helm.sh/hook": pre-install + "helm.sh/hook-weight": "2" +subjects: + - kind: ServiceAccount + name: secret-generator-account +roleRef: + kind: Role + name: secret-generator-role + apiGroup: rbac.authorization.k8s.io \ No newline at end of file diff --git a/slurm-cluster-chart/templates/secret-generator-serviceaccount.yaml 
b/slurm-cluster-chart/templates/secret-generator-serviceaccount.yaml new file mode 100644 index 0000000..ce860b0 --- /dev/null +++ b/slurm-cluster-chart/templates/secret-generator-serviceaccount.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: secret-generator-account + annotations: + "kubernetes.io/enforce-mountable-secrets": "true" + "helm.sh/hook": pre-install + "helm.sh/hook-weight": "0" +automountServiceAccountToken: True +secrets: + - name: host-keys-secret diff --git a/slurm-cluster-chart/templates/slurmctld-service.yaml b/slurm-cluster-chart/templates/slurmctld-service.yaml index 9bfc40b..001bcab 100644 --- a/slurm-cluster-chart/templates/slurmctld-service.yaml +++ b/slurm-cluster-chart/templates/slurmctld-service.yaml @@ -8,7 +8,7 @@ metadata: name: slurmctld-0 spec: ports: - - name: "6817" + - name: slurmctld port: 6817 targetPort: 6817 selector: diff --git a/slurm-cluster-chart/templates/slurmctld-deployment.yaml b/slurm-cluster-chart/templates/slurmctld.yaml similarity index 66% rename from slurm-cluster-chart/templates/slurmctld-deployment.yaml rename to slurm-cluster-chart/templates/slurmctld.yaml index d42e425..1644463 100644 --- a/slurm-cluster-chart/templates/slurmctld-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmctld.yaml @@ -5,7 +5,7 @@ metadata: labels: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmctld - name: slurmctld + name: {{ .Values.slurmctld.name }} spec: replicas: 1 selector: @@ -22,34 +22,37 @@ spec: containers: - args: - slurmctld - image: {{ .Values.sdcImage }} + - -vvv + image: {{ .Values.slurmImage }} name: slurmctld ports: - containerPort: 6817 resources: {} volumeMounts: - - mountPath: {{ .Values.nfs.mountPath }} + - mountPath: {{ .Values.storage.mountPath }} name: slurm-jobdir - - mountPath: /etc/slurm/slurm.conf + - mountPath: /etc/slurm/ name: slurm-config-volume - subPath: slurm.conf - - mountPath: /tempmounts/munge.key + - mountPath: /tmp/munge.key name: munge-key-secret subPath: munge.key - - mountPath: /var/lib/slurmd - name: var-lib-slurmd - hostname: slurmctld + - mountPath: /var/spool/slurmctld + name: slurmctld-state + dnsConfig: + searches: + - slurmd.{{ .Release.Namespace }}.svc.cluster.local restartPolicy: Always volumes: - name: slurm-jobdir persistentVolumeClaim: - claimName: {{ .Values.nfs.claimName }} - - name: var-lib-slurmd + claimName: {{ .Values.storage.claimName }} + - name: slurmctld-state persistentVolumeClaim: - claimName: var-lib-slurmd + claimName: var-spool-slurmctld - name: slurm-config-volume configMap: name: {{ .Values.configmaps.slurmConf }} - name: munge-key-secret secret: secretName: {{ .Values.secrets.mungeKey }} + defaultMode: 0400 diff --git a/slurm-cluster-chart/templates/slurmd-service.yaml b/slurm-cluster-chart/templates/slurmd-service.yaml index bec3d90..b5884fc 100644 --- a/slurm-cluster-chart/templates/slurmd-service.yaml +++ b/slurm-cluster-chart/templates/slurmd-service.yaml @@ -8,9 +8,10 @@ metadata: name: slurmd spec: ports: - - name: "6818" + - name: slurmd port: 6818 targetPort: 6818 selector: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmd + clusterIP: None diff --git a/slurm-cluster-chart/templates/slurmd-deployment.yaml b/slurm-cluster-chart/templates/slurmd.yaml similarity index 58% rename from slurm-cluster-chart/templates/slurmd-deployment.yaml rename to slurm-cluster-chart/templates/slurmd.yaml index 55f0a5e..bec55ce 100644 --- a/slurm-cluster-chart/templates/slurmd-deployment.yaml +++ 
b/slurm-cluster-chart/templates/slurmd.yaml @@ -1,19 +1,18 @@ apiVersion: apps/v1 -kind: Deployment +kind: StatefulSet metadata: creationTimestamp: null labels: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmd - name: slurmd + name: {{ .Values.slurmd.name }} spec: - replicas: {{ .Values.replicas.slurmd }} + replicas: {{ .Values.slurmd.replicas }} selector: matchLabels: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmd - strategy: - type: Recreate + serviceName: slurmd template: metadata: creationTimestamp: null @@ -21,41 +20,48 @@ spec: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmd spec: - topologySpreadConstraints: - - maxSkew: 1 - whenUnsatisfiable: ScheduleAnyway - topologyKey: kubernetes.io/hostname - labelSelector: - matchLabels: - app.kubernetes.io/name: slurm - app.kubernetes.io/component: slurmd containers: - args: - slurmd - image: {{ .Values.sdcImage }} + - -F + - -vvv + - -N + - "$(POD_NAME)" + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + image: {{ .Values.slurmImage }} name: slurmd ports: - containerPort: 6818 + hostPort: 6818 resources: {} volumeMounts: - - mountPath: /etc/slurm/slurm.conf + - mountPath: /etc/slurm/ name: slurm-config-volume - subPath: slurm.conf - - mountPath: {{ .Values.nfs.mountPath }} + - mountPath: {{ .Values.storage.mountPath }} name: slurm-jobdir - - mountPath: /tempmounts/munge.key + - mountPath: /tmp/munge.key name: munge-key-secret subPath: munge.key securityContext: privileged: true + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + dnsConfig: + searches: + - slurmd.{{ .Release.Namespace }}.svc.cluster.local restartPolicy: Always volumes: - name: slurm-jobdir persistentVolumeClaim: - claimName: {{ .Values.nfs.claimName }} + claimName: {{ .Values.storage.claimName }} - name: slurm-config-volume configMap: name: {{ .Values.configmaps.slurmConf }} - name: munge-key-secret secret: secretName: {{ .Values.secrets.mungeKey }} + defaultMode: 0400 diff --git a/slurm-cluster-chart/templates/slurmdbd-deployment.yaml b/slurm-cluster-chart/templates/slurmdbd-deployment.yaml index 2842de0..db6bdb5 100644 --- a/slurm-cluster-chart/templates/slurmdbd-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmdbd-deployment.yaml @@ -24,7 +24,8 @@ spec: containers: - args: - slurmdbd - image: {{ .Values.sdcImage }} + - -vvv + image: {{ .Values.slurmImage }} name: slurmdbd ports: - containerPort: 6819 @@ -33,10 +34,10 @@ spec: - mountPath: /etc/slurm/slurm.conf name: slurm-config-volume subPath: slurm.conf - - mountPath: /tempmounts/munge.key + - mountPath: /tmp/munge.key name: munge-key-secret subPath: munge.key - - mountPath: /tempmounts/slurmdbd.conf + - mountPath: /tmp/slurmdbd.conf name: dbd-config-volume subPath: slurmdbd.conf env: @@ -57,4 +58,4 @@ spec: - name: munge-key-secret secret: secretName: {{ .Values.secrets.mungeKey }} - \ No newline at end of file + defaultMode: 0400 diff --git a/slurm-cluster-chart/templates/slurmdbd-service.yaml b/slurm-cluster-chart/templates/slurmdbd-service.yaml index 400dcda..fc0ec8f 100644 --- a/slurm-cluster-chart/templates/slurmdbd-service.yaml +++ b/slurm-cluster-chart/templates/slurmdbd-service.yaml @@ -8,7 +8,7 @@ metadata: name: slurmdbd spec: ports: - - name: "6819" + - name: slurmdbd port: 6819 targetPort: 6819 selector: diff --git a/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml b/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml index 841bb0f..a5f4503 100644 --- 
a/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml +++ b/slurm-cluster-chart/templates/var-lib-mysql-persistentvolumeclaim.yaml @@ -11,4 +11,4 @@ spec: - ReadWriteOnce resources: requests: - storage: {{ .Values.databaseStorage }} + storage: {{ .Values.database.storage }} diff --git a/slurm-cluster-chart/templates/var-lib-slurmd-pvcclaim.yaml b/slurm-cluster-chart/templates/var-spool-slurmctld-pvcclaim.yaml similarity index 82% rename from slurm-cluster-chart/templates/var-lib-slurmd-pvcclaim.yaml rename to slurm-cluster-chart/templates/var-spool-slurmctld-pvcclaim.yaml index 5879b34..de733a0 100644 --- a/slurm-cluster-chart/templates/var-lib-slurmd-pvcclaim.yaml +++ b/slurm-cluster-chart/templates/var-spool-slurmctld-pvcclaim.yaml @@ -5,10 +5,10 @@ metadata: labels: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmctld - name: var-lib-slurmd + name: var-spool-slurmctld spec: accessModes: - ReadWriteOnce resources: requests: - storage: 1Gi + storage: 100Mi diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 93c606c..3d41248 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,24 +1,79 @@ -sdcImage: ghcr.io/stackhpc/slurm-docker-cluster:a89e584 +slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:d3daba4 -replicas: - slurmd: 2 - login: 1 +login: + # Deployment resource name + name: login + replicas: 1 -nfs: +slurmd: + # StatefulSet resource name + name: slurmd # NB this must match NodeName= in slurm-cluster-chart/files/slurm.conf + replicas: 2 + +slurmctld: + # StatefulSet resource name + name: slurmctld + # NOTE: We don't include a replicas field here because + # replicas > 1 for slurmctld needs extra Slurm config + +storage: mountPath: /home - claimName: rook-nfs-pv-claim + # The name of a Read-Write-Many StorageClass to use for + # the persistent volume which is shared across Slurm nodes + # Note: If using the default value then you must set + # rooknfs.enabled = true below to ensure that Rook NFS is + # installed on the cluster as a dependency of this Slurm + # chart. If you are using a separate RWM StorageClass, then + # set rooknfs.enabled = false + storageClassName: slurm-rook-nfs + # Name for the R-W-M volume to provision + claimName: slurm-shared-storage + # Capacity of the R-W-M volume + capacity: &capacity 10Gi # NB yaml anchor used so this value is also set for `rooknfs.storageCapacity` if necessary. + + +# Values to be passed to the rook-nfs sub-chart +# See rook-nfs sub-chart for full set of available config values +rooknfs: + enabled: true + # Name given to the RWM StorageClass created by Rook + # NB this must match storage.storageClassName when using Rook + storageClassName: slurm-rook-nfs + # Name for the NFSServer resource created by Rook + serverName: rook-nfs + # Capacity for the backing Read-Write-*Once* volume + # that Rook will create to provide the actual storage to + # the NFS server. 
Since we're using the Rook NFS in a + # slightly unconventional way here, we just want to anchor + # this value to the requested storage capacity for the RWM + # volume specified in storage.capacity + storageCapacity: *capacity + # Storage class to use for the Read-Write-Once backing PVC + # backingStorageClass: -sqlImage: mariadb:10.10 -databaseStorage: 100Mi +# Values for Slurm's database container +database: + # Database image to be used + image: mariadb:10.10 + # Storage requested by the var-lib-mysql volume backing the database + storage: 100Mi + +# Configmap resource names configmaps: - authorizedKeys: authorized-keys-configmap slurmConf: slurm-conf-configmap slurmdbdConf: slurmdbd-conf-configmap sshdConfig: sshd-config-configmap +# Public key used for ssh access to the login node +# If left undefined, assumes you have run the provided publish-keys.sh script to publish your public key prior to deployment +sshPublicKey: + +# Secret resource names secrets: - databaseAuth: database-auth-secret mungeKey: munge-key-secret - \ No newline at end of file + +openOnDemand: + # Password for default Open OnDemand user 'rocky' + password: password
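Since the chart now exposes these settings directly, a release can override them at install time instead of relying on `publish-keys.sh` or the default Open OnDemand password. A minimal sketch, where the release name, key file path and password placeholder are illustrative rather than part of the chart:

```console
# Pull in the rooknfs sub-chart declared in Chart.yaml, then install with overrides
helm dependency update slurm-cluster-chart
helm install <deployment-name> slurm-cluster-chart \
  --set openOnDemand.password='<a-strong-password>' \
  --set-file sshPublicKey=$HOME/.ssh/id_ed25519.pub
```

In particular, the default `password: password` should be overridden for anything beyond local testing.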