From 90502dfb9ceb2ba47523484a9b4f94d4c2b17521 Mon Sep 17 00:00:00 2001 From: Tarashish Mishra Date: Tue, 17 Dec 2024 17:17:57 +0530 Subject: [PATCH 1/7] Prometheus Alertmanager config for PagerDuty Send alerts to PagerDuty when jupyterhub-home-nfs disk usage is above 90% --- config/clusters/nasa-veda/support.values.yaml | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/config/clusters/nasa-veda/support.values.yaml b/config/clusters/nasa-veda/support.values.yaml index b43e07e4d..0e8c9baf6 100644 --- a/config/clusters/nasa-veda/support.values.yaml +++ b/config/clusters/nasa-veda/support.values.yaml @@ -32,6 +32,18 @@ redirects: to: staging.hub.openveda.cloud prometheus: + alertmanager: + enabled: true + config: + route: + group_wait: 10s + group_interval: 5m + receiver: pagerduty + repeat_interval: 3h + routes: + - receiver: pagerduty + match: + channel: pagerduty server: ingress: enabled: true @@ -41,6 +53,19 @@ prometheus: - secretName: prometheus-tls hosts: - prometheus.nasa-veda.2i2c.cloud + serverFiles: + alerting_rules.yml: + groups: + - name: NASA VEDA jupyterhub-home-nfs EBS volume full + rules: + - alert: jupyterhub-home-nfs-ebs-full + expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} < 0.1 + for: 15m + labels: + severity: critical + channel: pagerduty + annotations: + summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}" aws-ce-grafana-backend: enabled: true From cd84c51f678ec88213a04aaa03f27b0e36e5820c Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Wed, 18 Dec 2024 10:10:19 +0000 Subject: [PATCH 2/7] Update secret prometheus alertmanager config --- .../support/enc-support.secret.values.yaml | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/helm-charts/support/enc-support.secret.values.yaml 
b/helm-charts/support/enc-support.secret.values.yaml index 25f716d99..b00e7c3b8 100644 --- a/helm-charts/support/enc-support.secret.values.yaml +++ b/helm-charts/support/enc-support.secret.values.yaml @@ -1,20 +1,24 @@ grafana: - adminPassword: ENC[AES256_GCM,data:B+8/JqM4p6FytAp+8Ec4qzVKANZTVGuaHKMm7vyamhA2Wid5VBiRfIzghPXRxOakPtp8iTXqdO0I/AVu+HQDuA==,iv:ojVQ9u+cCGJeo/e8MbX4LKfKejXq/kjB2wtpjpNsCHA=,tag:Uggw1CS1+fmQdN0GsLhN9w==,type:str] -pagerduty-prometheus-integration: - integrationName: ENC[AES256_GCM,data:REEthqjPY3MEnA==,iv:KTals1+rPcGMAlT8zR5f6Y5RbhWuR9+QkxFQ59L7u/U=,tag:PynUx7lYUgcEmsLsnP+Z3g==,type:str] - integrationKey: ENC[AES256_GCM,data:RRwGXmQghrayoyHGnuPgnH7mkTA68m10ZHyCOn6fJMw=,iv:gLrxOnT6hvAKHSpC6x2mZxfHL/8O00QJU+BBMFo77S0=,tag:bE0YoHvelMLFDWUwSDDPvA==,type:str] - integrationUrl: ENC[AES256_GCM,data:+7kaZ0QGYlyrUpjBBDyZVvnZPIbyjBOUP8FP8DhwhVcSpIdOJn4Koq8NvVJxuTMBgbJiIn3JuVB1bkJCUIVqfW0=,iv:L5QgYp3IvKmwd1GZuv7n0nIXwcrtKeGx7wY+XSDvjjw=,tag:zFhdkN4XXmnaV8xyuW5+fQ==,type:str] + adminPassword: ENC[AES256_GCM,data:oNCAulbNDA7g4jJ3G9j7I5Uqd/XaKo5MvatryHVfat+2bjcqFOlpdHfiZsVZNgNP8kxMNwAlkeLJ7BGlFqYA2g==,iv:lJlTLyO9bvDp0zsE+dZQh1thfE7IfnABhAeynKLDUhA=,tag:jw3PN8OVr7+zl4ThdDlAAw==,type:str] +prometheus: + alertmanager: + config: + receivers: + - name: ENC[AES256_GCM,data:PuB35BjALacz,iv:39j9vTvzB1IB2pEZi+psoAv9FDMikOjrxps6+yxpLEQ=,tag:9apUjoupQkG6W9hsaZ6QHg==,type:str] + pagerduty_configs: + - ENC[AES256_GCM,data:X91fQd0gulxQkTtfMk48RK4EQ0gHJXrBaUeEGZmNYiGZWuGFm7Hf7Cj+yO4Fg51vzqMDwGIVhP3LH80PQbzjo96qJjzVJwAF7SK30eTj9iEcOg==,iv:ymurqszkJ2xB5dj66EAuytARi9mS2oKMdxfbnRzwgP4=,tag:4fYp5OKFiLm2usIqPNN1lQ==,type:comment] + - service_key: ENC[AES256_GCM,data:bCMB2VURBRRPvrV542HQMUGHs9kicvwcD0maVqxuH2I=,iv:NTewUsy4xZNsA9xWVr1Yd62Z2ubtOVMe86erwstXgu4=,tag:DfPqZP1fR/sTNJfVuaCSwA==,type:str] sops: kms: [] gcp_kms: - resource_id: projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs - created_at: "2024-12-05T10:40:02Z" - enc: 
CiUA4OM7eJ4G+2WdZ9oThEMrrlJSek0IrnRDykmPQYXW99zmb8HQEkkAnGhyNt6qUGehjrI1ovZKXU0p+cr5JwiFBA8PQs/3FR4Rdcy1ZX45Ed0sModB4VvWQlG9WgMC+75fzStuUUZUDtGsV9xvZeM0 + created_at: "2024-12-18T10:09:21Z" + enc: CiUA4OM7eOThrN3NqDXYd4belMxUdYfF8XpeUSYfiefKkYutN2KoEkkAnGhyNqzzrXyUXTvtS4xl4IjlJpo6hwm9FVbQAyh2Vw9dcXH6h1+NNUHGPYj6KQJvvWZzgXHFLiWdqOfUgTJME0YDx/DYgXgm azure_kv: [] hc_vault: [] age: [] - lastmodified: "2024-12-05T10:40:02Z" - mac: ENC[AES256_GCM,data:6rssn9qvZh2ekypkcMmGhoNwGjhGqs85Ld1R1zkIpwx670rTubJlvmFsyslGuqU4JJJ1aJ8A/KKtlMJMjhyGpLGA1krKWf9tgV0E3G/GAk2kR6h/lbdCfPhR+DsHdOAlWuHZTFz2bJuBKboi1znoNoOSKI+32WU6kFNnkcEqgTI=,iv:EAfU8mGbWegtCqu0M4DAjy29cEOJBMgM8/32vRAZf4Y=,tag:E0FtNKt80cRcw9kRXMjW7w==,type:str] + lastmodified: "2024-12-18T10:09:21Z" + mac: ENC[AES256_GCM,data:FLVIB1iZCSfuCwAiL0wBVHik2kO2xfR9iV0TgW2skPWMNJA+RWPhyHk8Ma3SsH/7iSOR8acwrFZ0b6DBhAtTtQEV1DGdSm9LvdHE2hkG7oFZzkdXYaC2YcwwkVJzfEmwZIn7OECXTL2W2I/5ZUEcs0bUW/5YlKmMzphtt9DL5o4=,iv:NdMEKBn90DyGgkrNYG8VKhhe4S+H57XM6AgAqJWA1q4=,tag:0lLWwN5M9jXmV8EzP/Y7nw==,type:str] pgp: [] unencrypted_suffix: _unencrypted version: 3.9.1 From 2ae29990fbcb027a0848551f44c13a2413716d81 Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Wed, 18 Dec 2024 10:33:04 +0000 Subject: [PATCH 3/7] Add cluster name as a label so this infor gets passed to PagerDuty The name of the rulegroup doesn't get sent to PagerDuty unfortunately --- config/clusters/nasa-veda/support.values.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/config/clusters/nasa-veda/support.values.yaml b/config/clusters/nasa-veda/support.values.yaml index 0e8c9baf6..7591a19b0 100644 --- a/config/clusters/nasa-veda/support.values.yaml +++ b/config/clusters/nasa-veda/support.values.yaml @@ -64,6 +64,7 @@ prometheus: labels: severity: critical channel: pagerduty + cluster: nasa-veda annotations: summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}" From 1a081dbd406e2fc42d7af52fd1e9de78bf8e6b06 Mon Sep 17 00:00:00 2001 From: Tarashish Mishra 
Date: Thu, 19 Dec 2024 12:48:39 +0530 Subject: [PATCH 4/7] document how to enable prometheus alerts --- docs/howto/features/storage-quota.md | 53 ++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/docs/howto/features/storage-quota.md b/docs/howto/features/storage-quota.md index 88454ad5e..c12748467 100644 --- a/docs/howto/features/storage-quota.md +++ b/docs/howto/features/storage-quota.md @@ -138,6 +138,59 @@ deployer deploy Once this is deployed, the hub will automatically enforce the storage quota for each user. If a user's home directory exceeds the quota, the user's pod may not be able to start successfully. +## Enabling alerting through Prometheus Alertmanager + +Once we have enabled storage quotas, we want to be alerted when the disk usage of the NFS server exceeds a certain threshold so that we can take appropriate action. + +To do this, we need to create a Prometheus rule that will alert us when the disk usage of the NFS server exceeds a certain threshold using Alertmanager. + +First, we need to enable Alertmanager in the hub's support values file (for example, [here's the one for the `nasa-veda` cluster](https://github.com/2i2c-org/infrastructure/blob/main/config/clusters/nasa-veda/support.values.yaml)). + +```yaml +prometheus: + alertmanager: + enabled: true +``` + +Then, we need to create a Prometheus rule that will alert us when the disk usage of the NFS server exceeds a certain threshold. 
For example, to alert us when the disk usage of the NFS server exceeds 90% of the total disk size, we would add the following to the hub's support values file: + +```yaml +prometheus: + serverFiles: + alerting_rules.yml: + groups: + - name: jupyterhub-home-nfs EBS volume full + rules: + - alert: jupyterhub-home-nfs-ebs-full + expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} < 0.1 + for: 15m + labels: + severity: critical + channel: pagerduty + cluster: <cluster-name> + annotations: + summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}" +``` + +And finally, we need to configure Alertmanager to send alerts to PagerDuty. + +```yaml +prometheus: + alertmanager: + enabled: true + config: + route: + group_wait: 10s + group_interval: 5m + receiver: pagerduty + repeat_interval: 3h + routes: + - receiver: pagerduty + match: + channel: pagerduty +``` + + ## Troubleshooting ### Checking the NFS server is running properly From 14d6e112966ecf3c6e2b66860c087987a0e1afd4 Mon Sep 17 00:00:00 2001 From: Tarashish Mishra Date: Thu, 19 Dec 2024 12:51:54 +0530 Subject: [PATCH 5/7] document how to increase AWS EBS volume size --- docs/howto/features/storage-quota.md | 3 +++ .../filesystem-management/increase-size-aws-ebs.md | 12 ++++++++++++ 2 files changed, 15 insertions(+) create mode 100644 docs/howto/filesystem-management/increase-size-aws-ebs.md diff --git a/docs/howto/features/storage-quota.md b/docs/howto/features/storage-quota.md index c12748467..d9b0da3dc 100644 --- a/docs/howto/features/storage-quota.md +++ b/docs/howto/features/storage-quota.md @@ -190,6 +190,9 @@ prometheus: channel: pagerduty ``` +## Increasing the size of the volume used by the NFS server + +If the volume used by the NFS server is close to being full, we may need to increase the size of the volume. 
This can be done by following the instructions in the [Increase the size of an AWS EBS volume](howto:increase-size-aws-ebs) guide. ## Troubleshooting diff --git a/docs/howto/filesystem-management/increase-size-aws-ebs.md b/docs/howto/filesystem-management/increase-size-aws-ebs.md new file mode 100644 index 000000000..35697cd1a --- /dev/null +++ b/docs/howto/filesystem-management/increase-size-aws-ebs.md @@ -0,0 +1,12 @@ +(howto:increase-size-aws-ebs)= +# Increase the size of an AWS EBS volume + +To increase the size of an AWS EBS volume, we need to increase the size of the EBS volume in the [tfvars file of the hub](https://github.com/2i2c-org/infrastructure/tree/main/terraform/aws/projects): + +For example, to increase the size of the EBS volume used by `jupyterhub-home-nfs` for the `staging` hub in the `nasa-veda` cluster, we would increase the `size` parameter in the `ebs_volumes` block for the `staging` hub in the [tfvars file for the `nasa-veda` cluster](https://github.com/2i2c-org/infrastructure/blob/main/terraform/aws/projects/nasa-veda.tfvars). + +The EBS volume will be resized automatically when the hub is deployed. + +```{note} +The size of an EBS volume can only be increased, not decreased. +``` From 7eb2f3d74722df0c3bcdcb96d602d0b04031f8e2 Mon Sep 17 00:00:00 2001 From: Tarashish Mishra Date: Thu, 19 Dec 2024 13:00:52 +0530 Subject: [PATCH 6/7] include new docs page in index --- docs/howto/filesystem-management/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/howto/filesystem-management/index.md b/docs/howto/filesystem-management/index.md index 412b4d929..49ac69d6e 100644 --- a/docs/howto/filesystem-management/index.md +++ b/docs/howto/filesystem-management/index.md @@ -7,4 +7,5 @@ This documentation covers tasks related to managing filesystems. 
:maxdepth: 2 filesystem-backups/index decrease-size-gcp-filestore +increase-size-aws-ebs ``` From 08d9d996bdfe603e7dbcd2de36a5f4e4f6845484 Mon Sep 17 00:00:00 2001 From: Tarashish Mishra Date: Thu, 19 Dec 2024 17:03:05 +0530 Subject: [PATCH 7/7] Update the docs to specify that terraform changes should be applied locally --- docs/howto/filesystem-management/increase-size-aws-ebs.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/howto/filesystem-management/increase-size-aws-ebs.md b/docs/howto/filesystem-management/increase-size-aws-ebs.md index 35697cd1a..c6b0a514e 100644 --- a/docs/howto/filesystem-management/increase-size-aws-ebs.md +++ b/docs/howto/filesystem-management/increase-size-aws-ebs.md @@ -5,7 +5,13 @@ To increase the size of an AWS EBS volume, we need to increase the size of the E For example, to increase the size of the EBS volume used by `jupyterhub-home-nfs` for the `staging` hub in the `nasa-veda` cluster, we would increase the `size` parameter in the `ebs_volumes` block for the `staging` hub in the [tfvars file for the `nasa-veda` cluster](https://github.com/2i2c-org/infrastructure/blob/main/terraform/aws/projects/nasa-veda.tfvars). -The EBS volume will be resized automatically when the hub is deployed. +After updating the tfvars file, we need to plan and apply the changes using terraform: + + ```bash + cd terraform/aws + terraform plan -var-file=projects/$CLUSTER_NAME.tfvars + terraform apply -var-file=projects/$CLUSTER_NAME.tfvars + ``` ```{note} The size of an EBS volume can only be increased, not decreased.