diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0ba5327 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +# Build artifacts from local helm install +slurm-cluster-chart/Chart.lock +slurm-cluster-chart/charts/ diff --git a/nfs/deploy-nfs.sh b/nfs/deploy-nfs.sh deleted file mode 100755 index b2d2f75..0000000 --- a/nfs/deploy-nfs.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -# Based on https://rook.io/docs/nfs/v1.7/quickstart.html -# Manifests listed explicitly here to guarantee ordering - -kubectl create -f nfs/crds.yaml -kubectl create -f nfs/operator.yaml -kubectl create -f nfs/rbac.yaml -kubectl create -f nfs/nfs.yaml -kubectl create -f nfs/sc.yaml -kubectl create -f nfs/pvc.yaml diff --git a/nfs/pvc.yaml b/nfs/pvc.yaml deleted file mode 100644 index 7f0a3d7..0000000 --- a/nfs/pvc.yaml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: rook-nfs-pv-claim -spec: - storageClassName: "rook-nfs-share1" - accessModes: - - ReadWriteMany - resources: - requests: - storage: 10Gi diff --git a/nfs/sc.yaml b/nfs/sc.yaml deleted file mode 100644 index 6f9e3ae..0000000 --- a/nfs/sc.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - labels: - app: rook-nfs - name: rook-nfs-share1 -parameters: - exportName: share1 - nfsServerName: rook-nfs - nfsServerNamespace: rook-nfs -provisioner: nfs.rook.io/rook-nfs-provisioner -reclaimPolicy: Delete -volumeBindingMode: Immediate diff --git a/nfs/teardown-nfs.sh b/nfs/teardown-nfs.sh deleted file mode 100755 index 4dde364..0000000 --- a/nfs/teardown-nfs.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -kubectl delete -f web-service.yaml -kubectl delete -f web-rc.yaml -kubectl delete -f busybox-rc.yaml -kubectl delete -f pvc.yaml -kubectl delete -f pv.yaml -kubectl delete -f nfs.yaml -kubectl delete -f nfs-xfs.yaml -kubectl delete -f nfs-ceph.yaml -kubectl delete -f rbac.yaml -kubectl delete -f psp.yaml -kubectl delete -f 
scc.yaml # if deployed -kubectl delete -f operator.yaml -kubectl delete -f webhook.yaml # if deployed -kubectl delete -f crds.yaml diff --git a/rooknfs/Chart.yaml b/rooknfs/Chart.yaml new file mode 100644 index 0000000..83a2a11 --- /dev/null +++ b/rooknfs/Chart.yaml @@ -0,0 +1,4 @@ +apiVersion: v2 +name: rooknfs +version: 0.0.1 +description: An packaged installation of Rook NFS for Kubernetes. \ No newline at end of file diff --git a/rooknfs/README.md b/rooknfs/README.md new file mode 100644 index 0000000..e69de29 diff --git a/nfs/crds.yaml b/rooknfs/crds/crds.yaml similarity index 100% rename from nfs/crds.yaml rename to rooknfs/crds/crds.yaml diff --git a/nfs/nfs.yaml b/rooknfs/templates/nfs.yaml similarity index 61% rename from nfs/nfs.yaml rename to rooknfs/templates/nfs.yaml index 742fa34..6fde553 100644 --- a/nfs/nfs.yaml +++ b/rooknfs/templates/nfs.yaml @@ -1,32 +1,36 @@ +{{- if .Values.enabled }} --- # A default storageclass must be present apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: nfs-default-claim - namespace: rook-nfs + name: {{ .Values.claimName}} + namespace: {{ .Values.serverNamespace }} spec: accessModes: - ReadWriteMany resources: requests: - storage: 1Gi + storage: {{ .Values.storageCapacity }} --- apiVersion: nfs.rook.io/v1alpha1 kind: NFSServer metadata: - name: rook-nfs - namespace: rook-nfs + name: {{ .Values.serverName }} + namespace: {{ .Values.serverNamespace }} spec: replicas: 1 exports: - - name: share1 + - name: {{ .Values.shareName }} server: accessMode: ReadWrite squash: "none" # A Persistent Volume Claim must be created before creating NFS CRD instance. 
persistentVolumeClaim: - claimName: nfs-default-claim + claimName: {{ .Values.claimName }} # A key/value list of annotations annotations: rook: nfs +--- +{{- end }} + diff --git a/nfs/operator.yaml b/rooknfs/templates/operator.yaml similarity index 91% rename from nfs/operator.yaml rename to rooknfs/templates/operator.yaml index b289909..4a1d542 100644 --- a/nfs/operator.yaml +++ b/rooknfs/templates/operator.yaml @@ -1,13 +1,15 @@ +{{- if .Values.enabled }} +--- apiVersion: v1 kind: Namespace metadata: - name: rook-nfs-system # namespace:operator + name: {{ .Values.systemNamespace }} --- apiVersion: v1 kind: ServiceAccount metadata: name: rook-nfs-operator - namespace: rook-nfs-system # namespace:operator + namespace: {{ .Values.systemNamespace }} --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 @@ -20,7 +22,7 @@ roleRef: subjects: - kind: ServiceAccount name: rook-nfs-operator - namespace: rook-nfs-system # namespace:operator + namespace: {{ .Values.systemNamespace }} --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole @@ -106,7 +108,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: rook-nfs-operator - namespace: rook-nfs-system # namespace:operator + namespace: {{ .Values.systemNamespace }} labels: app: rook-nfs-operator spec: @@ -134,3 +136,5 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace +--- +{{- end}} diff --git a/nfs/rbac.yaml b/rooknfs/templates/rbac.yaml similarity index 88% rename from nfs/rbac.yaml rename to rooknfs/templates/rbac.yaml index 8e3d9f7..b327740 100644 --- a/nfs/rbac.yaml +++ b/rooknfs/templates/rbac.yaml @@ -1,14 +1,15 @@ +{{- if .Values.enabled }} --- apiVersion: v1 kind: Namespace metadata: - name: rook-nfs + name: {{ .Values.serverNamespace }} --- apiVersion: v1 kind: ServiceAccount metadata: name: rook-nfs-server - namespace: rook-nfs + namespace: {{ .Values.serverNamespace }} --- kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 @@ -51,9 +52,10 @@ metadata: subjects: - kind: 
ServiceAccount name: rook-nfs-server - # replace with namespace where provisioner is deployed - namespace: rook-nfs + namespace: {{ .Values.serverNamespace }} roleRef: kind: ClusterRole name: rook-nfs-provisioner-runner apiGroup: rbac.authorization.k8s.io +--- +{{- end }} \ No newline at end of file diff --git a/rooknfs/templates/sc.yaml b/rooknfs/templates/sc.yaml new file mode 100644 index 0000000..0ad75fe --- /dev/null +++ b/rooknfs/templates/sc.yaml @@ -0,0 +1,17 @@ +{{- if .Values.enabled }} +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + labels: + app: rook-nfs + name: {{ .Values.storageClassName }} +parameters: + exportName: {{ .Values.shareName }} + nfsServerName: {{ .Values.serverName }} + nfsServerNamespace: {{ .Values.serverNamespace }} +provisioner: nfs.rook.io/rook-nfs-provisioner +reclaimPolicy: Delete +volumeBindingMode: Immediate +--- +{{- end }} \ No newline at end of file diff --git a/rooknfs/values.yaml b/rooknfs/values.yaml new file mode 100644 index 0000000..1961fa6 --- /dev/null +++ b/rooknfs/values.yaml @@ -0,0 +1,30 @@ +# Global flag for enabling/disabling all chart resources +# This is useful for allowing charts which use this chart +# as a dependency to toggle usage of this chart based on +# values in the parent chart +enabled: true + +# Name for the NFSServer resource created by rook +serverName: rook-nfs + +# Name for the created storage class +storageClassName: rook-nfs + +# Name for the Read-Write-Once backing PVC created by Rook +claimName: rook-nfs-backing-pv + +# Name for the NFS share within the NFS Resource instance +shareName: share-1 + +# Size of the Read-Write-Once backing storage volume +storageCapacity: 10Gi + +# Image to use for the Rook NFS operator +operatorImage: rook/nfs:master + +# NOTE: For some reason deploying everything in the default +# namespace leads to R-W-M PVCs getting stuck in 'pending' +# state indefinitely, so here we separate out namespaces as +# of various components in the same way 
as the Rook docs +serverNamespace: rook-nfs +systemNamespace: rook-nfs-system \ No newline at end of file diff --git a/slurm-cluster-chart/Chart.yaml b/slurm-cluster-chart/Chart.yaml index 9e592c0..4dad59b 100644 --- a/slurm-cluster-chart/Chart.yaml +++ b/slurm-cluster-chart/Chart.yaml @@ -21,4 +21,9 @@ version: 0.1.0 # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "1.16.0" \ No newline at end of file +appVersion: "1.16.0" + +dependencies: + - name: rooknfs + version: 0.0.1 + repository: file://../rooknfs \ No newline at end of file diff --git a/slurm-cluster-chart/templates/hooks/pre-delete.yaml b/slurm-cluster-chart/templates/hooks/pre-delete.yaml new file mode 100644 index 0000000..8cdb1f3 --- /dev/null +++ b/slurm-cluster-chart/templates/hooks/pre-delete.yaml @@ -0,0 +1,55 @@ +{{- if .Values.rooknfs.enabled }} +# NOTE: The cleanup jobs defined here are required to ensure that things which +# Rook NFS is responsible for cleaning up are deleted before deleting the Rook +# pods which do the actual clean up of NFS resources. For example, the RWM PVC +# must be deleted before the Rook StorageClass and provisioner pod. However, +# the PVC cannot be deleted until the pods which are using it are deleted, so +# the various Slurm node pods must actually be the first resources deleted. 
+ --- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: rook-nfs-cleanup +--- +# TODO: Create a job-specific ClusterRole for the ServiceAccount +# instead of using the cluster-admin role here +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: rook-nfs-cleanup +subjects: +- kind: ServiceAccount + name: rook-nfs-cleanup + namespace: {{ .Release.Namespace }} +roleRef: + kind: ClusterRole + name: cluster-admin +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: rook-nfs-pre-delete-cleanup + annotations: + "helm.sh/hook": pre-delete + "helm.sh/hook-delete-policy": hook-succeeded +spec: + template: + metadata: + name: rook-nfs-pre-delete-cleanup + spec: + serviceAccountName: rook-nfs-cleanup + containers: + - name: tester + image: bitnami/kubectl + command: + - "/bin/bash" + - "-c" + - | + kubectl delete -n {{ .Release.Namespace }} deployment {{ .Values.login.name }} --wait --cascade=foreground + kubectl delete -n {{ .Release.Namespace }} statefulset {{ .Values.slurmctld.name }} --wait --cascade=foreground + kubectl delete -n {{ .Release.Namespace }} deployment {{ .Values.slurmd.name }} --wait --cascade=foreground + kubectl delete -n {{ .Release.Namespace }} pvc {{ .Values.storage.claimName }} --wait + kubectl delete -n {{ .Values.rooknfs.serverNamespace }} nfsservers {{ .Values.rooknfs.serverName }} --wait + restartPolicy: Never +--- +{{- end }} diff --git a/slurm-cluster-chart/templates/login-deployment.yaml b/slurm-cluster-chart/templates/login.yaml similarity index 90% rename from slurm-cluster-chart/templates/login-deployment.yaml rename to slurm-cluster-chart/templates/login.yaml index 48f8f17..ca63392 100644 --- a/slurm-cluster-chart/templates/login-deployment.yaml +++ b/slurm-cluster-chart/templates/login.yaml @@ -5,9 +5,9 @@ metadata: labels: app.kubernetes.io/name: slurm app.kubernetes.io/component: login - name: login + name: {{ .Values.login.name }} spec: - replicas: {{ .Values.replicas.login }} + replicas: {{ 
.Values.login.replicas }} selector: matchLabels: app.kubernetes.io/name: slurm @@ -29,7 +29,7 @@ spec: ports: - containerPort: 22 volumeMounts: - - mountPath: {{ .Values.nfs.mountPath }} + - mountPath: {{ .Values.storage.mountPath }} name: slurm-jobdir - mountPath: /etc/slurm/ name: slurm-config-volume @@ -51,7 +51,7 @@ spec: volumes: - name: slurm-jobdir persistentVolumeClaim: - claimName: {{ .Values.nfs.claimName }} + claimName: {{ .Values.storage.claimName }} - name: slurm-config-volume configMap: name: {{ .Values.configmaps.slurmConf }} diff --git a/slurm-cluster-chart/templates/pvc.yaml b/slurm-cluster-chart/templates/pvc.yaml new file mode 100644 index 0000000..c5d5955 --- /dev/null +++ b/slurm-cluster-chart/templates/pvc.yaml @@ -0,0 +1,14 @@ +{{- if .Values.rooknfs.enabled }} +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ .Values.storage.claimName }} +spec: + storageClassName: {{ .Values.storage.storageClassName }} + accessModes: + - ReadWriteMany + resources: + requests: + storage: {{ .Values.storage.capacity }} +{{- end }} \ No newline at end of file diff --git a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml b/slurm-cluster-chart/templates/slurmctld.yaml similarity index 91% rename from slurm-cluster-chart/templates/slurmctld-statefulset.yaml rename to slurm-cluster-chart/templates/slurmctld.yaml index dc0bf90..f919c5f 100644 --- a/slurm-cluster-chart/templates/slurmctld-statefulset.yaml +++ b/slurm-cluster-chart/templates/slurmctld.yaml @@ -5,7 +5,7 @@ metadata: labels: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmctld - name: slurmctld + name: {{ .Values.slurmctld.name }} spec: replicas: 1 selector: @@ -29,7 +29,7 @@ spec: - containerPort: 6817 resources: {} volumeMounts: - - mountPath: {{ .Values.nfs.mountPath }} + - mountPath: {{ .Values.storage.mountPath }} name: slurm-jobdir - mountPath: /etc/slurm/ name: slurm-config-volume @@ -45,7 +45,7 @@ spec: volumes: - name: slurm-jobdir persistentVolumeClaim: - 
claimName: {{ .Values.nfs.claimName }} + claimName: {{ .Values.storage.claimName }} - name: slurmctld-state persistentVolumeClaim: claimName: var-spool-slurmctld diff --git a/slurm-cluster-chart/templates/slurmd-deployment.yaml b/slurm-cluster-chart/templates/slurmd.yaml similarity index 88% rename from slurm-cluster-chart/templates/slurmd-deployment.yaml rename to slurm-cluster-chart/templates/slurmd.yaml index 4c2396e..4775748 100644 --- a/slurm-cluster-chart/templates/slurmd-deployment.yaml +++ b/slurm-cluster-chart/templates/slurmd.yaml @@ -5,9 +5,9 @@ metadata: labels: app.kubernetes.io/name: slurm app.kubernetes.io/component: slurmd - name: slurmd + name: {{ .Values.slurmd.name }} spec: - replicas: {{ .Values.replicas.slurmd }} + replicas: {{ .Values.slurmd.replicas }} selector: matchLabels: app.kubernetes.io/name: slurm @@ -41,7 +41,8 @@ spec: volumeMounts: - mountPath: /etc/slurm/ name: slurm-config-volume - - mountPath: {{ .Values.nfs.mountPath }} + subPath: slurm.conf + - mountPath: {{ .Values.storage.mountPath }} name: slurm-jobdir - mountPath: /tmp/munge.key name: munge-key-secret @@ -55,7 +56,7 @@ spec: volumes: - name: slurm-jobdir persistentVolumeClaim: - claimName: {{ .Values.nfs.claimName }} + claimName: {{ .Values.storage.claimName }} - name: slurm-config-volume configMap: name: {{ .Values.configmaps.slurmConf }} diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml index 7873e5c..eb9501c 100644 --- a/slurm-cluster-chart/values.yaml +++ b/slurm-cluster-chart/values.yaml @@ -1,12 +1,52 @@ slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:7f4d64e -replicas: - slurmd: 2 - login: 1 +login: + # Deployment resource name + name: login + replicas: 1 -nfs: +slurmd: + # Deployment resource name + name: slurmd + replicas: 2 + +slurmctld: + # StatefulSet resource name + name: slurmctld + # NOTE: We don't include a replicas field here because + # replicas > 1 for slurmctld needs extra Slurm config + +storage: mountPath: /home - 
claimName: rook-nfs-pv-claim + # The name of a Read-Write-Many StorageClass to use for + # the persistent volume which is shared across Slurm nodes + # Note: If using the default value then you must set + # rooknfs.enabled = true below to ensure that Rook NFS is + # installed on the cluster as a dependency of this Slurm + # chart. If you are using a separate RWM StorageClass, then + # set rooknfs.enabled = false + storageClassName: &storageclassname slurm-rook-nfs + # Name for the R-W-M volume to provision + claimName: slurm-shared-storage + # Capacity of the R-W-M volume + capacity: &capacity 10Gi + + +# Values to be passed to the rook-nfs sub-chart +# See rook-nfs sub-chart for full set of available config values +rooknfs: + enabled: true + storageClassName: *storageclassname + # Name for the NFSServer resource created by Rook + serverName: rook-nfs + # Capacity for the backing Read-Write-*Once* volume + # that Rook will create to provide the actual storage to + # the NFS server. Since we're using the Rook NFS in a + # slightly unconventional way here, we just want to anchor + # this value to the requested storage capacity for the RWM + # volume specified in storage.capacity + storageCapacity: *capacity + sqlImage: mariadb:10.10