2i2c-org · sgibson91 · Dec 19, 2024 · Dec 17, 2024 · Dec 18, 2024 · Dec 18, 2024
diff --git a/config/clusters/nasa-veda/support.values.yaml b/config/clusters/nasa-veda/support.values.yaml
@@ -32,6 +32,18 @@ redirects:
       to: staging.hub.openveda.cloud
 
 prometheus:
+  alertmanager:
+    enabled: true
+    config:
+      route:
+        group_wait: 10s
+        group_interval: 5m
+        receiver: pagerduty
+        repeat_interval: 3h
+        routes:
+          - receiver: pagerduty
+            match:
+              channel: pagerduty
   server:
     ingress:
       enabled: true
@@ -41,6 +53,20 @@ prometheus:
         - secretName: prometheus-tls
           hosts:
             - prometheus.nasa-veda.2i2c.cloud
+  serverFiles:
+    alerting_rules.yml:
+      groups:
+        - name: NASA VEDA jupyterhub-home-nfs EBS volume full
+          rules:
+            - alert: jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} < 0.1
+              for: 15m
+              labels:
+                severity: critical
+                channel: pagerduty
+                cluster: nasa-veda
+              annotations:
+                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
 
 aws-ce-grafana-backend:
   enabled: true

diff --git a/docs/howto/features/storage-quota.md b/docs/howto/features/storage-quota.md
@@ -138,6 +138,62 @@ deployer deploy <cluster_name> <hub_name>
 
 Once this is deployed, the hub will automatically enforce the storage quota for each user. If a user's home directory exceeds the quota, the user's pod may not be able to start successfully.
 
+## Enabling alerting through Prometheus Alertmanager
+
+Once we have enabled storage quotas, we want to be alerted when the disk usage of the NFS server exceeds a certain threshold so that we can take appropriate action.
+
+To do this, we create a Prometheus alerting rule that fires when the disk usage of the NFS server exceeds a certain threshold, and use Alertmanager to route the resulting alert to us.
+
+First, we need to enable Alertmanager in the cluster's support values file (for example, [here's the one for the `nasa-veda` cluster](https://github.com/2i2c-org/infrastructure/blob/main/config/clusters/nasa-veda/support.values.yaml)).
+
+```yaml
+prometheus:
+  alertmanager:
+    enabled: true
+```
+
+Then, we need to create the Prometheus alerting rule itself. For example, to alert us when the disk usage of the NFS server has exceeded 90% of the total disk size (that is, less than 10% of the disk is still available) for 15 minutes, we would add the following to the cluster's support values file:
+
+```yaml
+prometheus:
+  serverFiles:
+    alerting_rules.yml:
+      groups:
+        - name: <cluster_name> jupyterhub-home-nfs EBS volume full
+          rules:
+            - alert: jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} < 0.1
+              for: 15m
+              labels:
+                severity: critical
+                channel: pagerduty
+                cluster: <cluster_name>
+              annotations:
+                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
+```
+
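+And finally, we need to configure Alertmanager to route these alerts to PagerDuty. The routes below only refer to a receiver by name; the receivers themselves, including the PagerDuty service key, are defined in the encrypted `helm-charts/support/enc-support.secret.values.yaml` file.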
+
+```yaml
+prometheus:
+  alertmanager:
+    enabled: true
+    config:
+      route:
+        group_wait: 10s
+        group_interval: 5m
+        receiver: pagerduty
+        repeat_interval: 3h
+        routes:
+          - receiver: pagerduty
+            match:
+              channel: pagerduty
+```
+
+## Increasing the size of the volume used by the NFS server
+
+If the volume used by the NFS server is close to being full, we may need to increase the size of the volume. This can be done by following the instructions in the [Increase the size of an AWS EBS volume](howto:increase-size-aws-ebs) guide.
+
 ## Troubleshooting
 
 ### Checking the NFS server is running properly

diff --git a/docs/howto/filesystem-management/increase-size-aws-ebs.md b/docs/howto/filesystem-management/increase-size-aws-ebs.md
@@ -0,0 +1,18 @@
+(howto:increase-size-aws-ebs)=
+# Increase the size of an AWS EBS volume
+
+To increase the size of an AWS EBS volume, we need to increase its `size` parameter in the [tfvars file of the cluster](https://github.com/2i2c-org/infrastructure/tree/main/terraform/aws/projects).
+
+For example, to increase the size of the EBS volume used by `jupyterhub-home-nfs` for the `staging` hub in the `nasa-veda` cluster, we would increase the `size` parameter in the `ebs_volumes` block for the `staging` hub in the [tfvars file for the `nasa-veda` cluster](https://github.com/2i2c-org/infrastructure/blob/main/terraform/aws/projects/nasa-veda.tfvars).
+
+After updating the tfvars file, we need to plan and apply the changes using terraform:
+
+   ```bash
+   cd terraform/aws
+   terraform plan -var-file=projects/$CLUSTER_NAME.tfvars
+   terraform apply -var-file=projects/$CLUSTER_NAME.tfvars
+   ```
+
+```{note}
+The size of an EBS volume can only be increased, not decreased.
+```
diff --git a/docs/howto/filesystem-management/index.md b/docs/howto/filesystem-management/index.md
@@ -7,4 +7,5 @@ This documentation covers tasks related to managing filesystems.
 :maxdepth: 2
 filesystem-backups/index
 decrease-size-gcp-filestore
+increase-size-aws-ebs
 ```
diff --git a/helm-charts/support/enc-support.secret.values.yaml b/helm-charts/support/enc-support.secret.values.yaml
@@ -1,20 +1,24 @@
 grafana:
-  adminPassword: ENC[AES256_GCM,data:B+8/JqM4p6FytAp+8Ec4qzVKANZTVGuaHKMm7vyamhA2Wid5VBiRfIzghPXRxOakPtp8iTXqdO0I/AVu+HQDuA==,iv:ojVQ9u+cCGJeo/e8MbX4LKfKejXq/kjB2wtpjpNsCHA=,tag:Uggw1CS1+fmQdN0GsLhN9w==,type:str]
-pagerduty-prometheus-integration:
-  integrationName: ENC[AES256_GCM,data:REEthqjPY3MEnA==,iv:KTals1+rPcGMAlT8zR5f6Y5RbhWuR9+QkxFQ59L7u/U=,tag:PynUx7lYUgcEmsLsnP+Z3g==,type:str]
-  integrationKey: ENC[AES256_GCM,data:RRwGXmQghrayoyHGnuPgnH7mkTA68m10ZHyCOn6fJMw=,iv:gLrxOnT6hvAKHSpC6x2mZxfHL/8O00QJU+BBMFo77S0=,tag:bE0YoHvelMLFDWUwSDDPvA==,type:str]
-  integrationUrl: ENC[AES256_GCM,data:+7kaZ0QGYlyrUpjBBDyZVvnZPIbyjBOUP8FP8DhwhVcSpIdOJn4Koq8NvVJxuTMBgbJiIn3JuVB1bkJCUIVqfW0=,iv:L5QgYp3IvKmwd1GZuv7n0nIXwcrtKeGx7wY+XSDvjjw=,tag:zFhdkN4XXmnaV8xyuW5+fQ==,type:str]
+  adminPassword: ENC[AES256_GCM,data:oNCAulbNDA7g4jJ3G9j7I5Uqd/XaKo5MvatryHVfat+2bjcqFOlpdHfiZsVZNgNP8kxMNwAlkeLJ7BGlFqYA2g==,iv:lJlTLyO9bvDp0zsE+dZQh1thfE7IfnABhAeynKLDUhA=,tag:jw3PN8OVr7+zl4ThdDlAAw==,type:str]
+prometheus:
+  alertmanager:
+    config:
+      receivers:
+        - name: ENC[AES256_GCM,data:PuB35BjALacz,iv:39j9vTvzB1IB2pEZi+psoAv9FDMikOjrxps6+yxpLEQ=,tag:9apUjoupQkG6W9hsaZ6QHg==,type:str]
+          pagerduty_configs:
+            - ENC[AES256_GCM,data:X91fQd0gulxQkTtfMk48RK4EQ0gHJXrBaUeEGZmNYiGZWuGFm7Hf7Cj+yO4Fg51vzqMDwGIVhP3LH80PQbzjo96qJjzVJwAF7SK30eTj9iEcOg==,iv:ymurqszkJ2xB5dj66EAuytARi9mS2oKMdxfbnRzwgP4=,tag:4fYp5OKFiLm2usIqPNN1lQ==,type:comment]
+            - service_key: ENC[AES256_GCM,data:bCMB2VURBRRPvrV542HQMUGHs9kicvwcD0maVqxuH2I=,iv:NTewUsy4xZNsA9xWVr1Yd62Z2ubtOVMe86erwstXgu4=,tag:DfPqZP1fR/sTNJfVuaCSwA==,type:str]
 sops:
   kms: []
   gcp_kms:
     - resource_id: projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs
-      created_at: "2024-12-05T10:40:02Z"
-      enc: CiUA4OM7eJ4G+2WdZ9oThEMrrlJSek0IrnRDykmPQYXW99zmb8HQEkkAnGhyNt6qUGehjrI1ovZKXU0p+cr5JwiFBA8PQs/3FR4Rdcy1ZX45Ed0sModB4VvWQlG9WgMC+75fzStuUUZUDtGsV9xvZeM0
+      created_at: "2024-12-18T10:09:21Z"
+      enc: CiUA4OM7eOThrN3NqDXYd4belMxUdYfF8XpeUSYfiefKkYutN2KoEkkAnGhyNqzzrXyUXTvtS4xl4IjlJpo6hwm9FVbQAyh2Vw9dcXH6h1+NNUHGPYj6KQJvvWZzgXHFLiWdqOfUgTJME0YDx/DYgXgm
   azure_kv: []
   hc_vault: []
   age: []
-  lastmodified: "2024-12-05T10:40:02Z"
-  mac: ENC[AES256_GCM,data:6rssn9qvZh2ekypkcMmGhoNwGjhGqs85Ld1R1zkIpwx670rTubJlvmFsyslGuqU4JJJ1aJ8A/KKtlMJMjhyGpLGA1krKWf9tgV0E3G/GAk2kR6h/lbdCfPhR+DsHdOAlWuHZTFz2bJuBKboi1znoNoOSKI+32WU6kFNnkcEqgTI=,iv:EAfU8mGbWegtCqu0M4DAjy29cEOJBMgM8/32vRAZf4Y=,tag:E0FtNKt80cRcw9kRXMjW7w==,type:str]
+  lastmodified: "2024-12-18T10:09:21Z"
+  mac: ENC[AES256_GCM,data:FLVIB1iZCSfuCwAiL0wBVHik2kO2xfR9iV0TgW2skPWMNJA+RWPhyHk8Ma3SsH/7iSOR8acwrFZ0b6DBhAtTtQEV1DGdSm9LvdHE2hkG7oFZzkdXYaC2YcwwkVJzfEmwZIn7OECXTL2W2I/5ZUEcs0bUW/5YlKmMzphtt9DL5o4=,iv:NdMEKBn90DyGgkrNYG8VKhhe4S+H57XM6AgAqJWA1q4=,tag:0lLWwN5M9jXmV8EzP/Y7nw==,type:str]
   pgp: []
   unencrypted_suffix: _unencrypted
   version: 3.9.1