From d381587a9e5f6ba7d1e168ffa7b3198ac9dc90a1 Mon Sep 17 00:00:00 2001 From: Michael Walker Date: Wed, 31 Jul 2024 16:34:06 +0100 Subject: [PATCH] Add an alert and a runbook for low disk space I've picked 10% remaining free space as the threshold. --- docs/src/SUMMARY.md | 1 + docs/src/runbooks/alerts/diskspacelow.md | 35 ++++++++++++++++++++++++ shared/default.nix | 24 +++++++++------- 3 files changed, 50 insertions(+), 10 deletions(-) create mode 100644 docs/src/runbooks/alerts/diskspacelow.md diff --git a/docs/src/SUMMARY.md b/docs/src/SUMMARY.md index dd65912a..6ebc5b33 100644 --- a/docs/src/SUMMARY.md +++ b/docs/src/SUMMARY.md @@ -10,6 +10,7 @@ # Alert Runbooks +- [DiskSpaceLow](./runbooks/alerts/diskspacelow.md) - [ZPoolStatusDegraded](./runbooks/alerts/zpoolstatusdegraded.md) # "How to" Runbooks diff --git a/docs/src/runbooks/alerts/diskspacelow.md b/docs/src/runbooks/alerts/diskspacelow.md new file mode 100644 index 00000000..aae75e04 --- /dev/null +++ b/docs/src/runbooks/alerts/diskspacelow.md @@ -0,0 +1,35 @@ +DiskSpaceLow +============ + +This alert fires when a partition has under 10% free space remaining. + +The alert will say which partitions are affected; `df -h` also has the +information: + +``` +$ df -h +Filesystem Size Used Avail Use% Mounted on +devtmpfs 1.6G 0 1.6G 0% /dev +tmpfs 16G 112K 16G 1% /dev/shm +tmpfs 7.8G 9.8M 7.8G 1% /run +tmpfs 16G 1.1M 16G 1% /run/wrappers +local/volatile/root 1.7T 1.8G 1.7T 1% / +local/persistent/nix 1.7T 5.1G 1.7T 1% /nix +local/persistent/persist 1.7T 2.0G 1.7T 1% /persist +local/persistent/var-log 1.7T 540M 1.7T 1% /var/log +efivarfs 128K 40K 84K 33% /sys/firmware/efi/efivars +local/persistent/home 1.7T 32G 1.7T 2% /home +/dev/nvme0n1p2 487M 56M 431M 12% /boot +data/nas 33T 22T 11T 68% /mnt/nas +tmpfs 3.2G 12K 3.2G 1% /run/user/1000 +``` + +Note all ZFS datasets in the same pool (`local/*` and `data/*` in the example +above) share the underlying storage. 
+ +Debugging steps: + +- See the `node_filesystem_avail_bytes` metric for how quickly disk space is + being consumed +- Use `ncdu -x` to work out where the space is going +- Buy more storage if need be diff --git a/shared/default.nix b/shared/default.nix index 2573a146..0db6771f 100644 --- a/shared/default.nix +++ b/shared/default.nix @@ -146,16 +146,6 @@ in services.zfs.autoSnapshot.enable = thereAreZfsFilesystems; services.zfs.autoSnapshot.monthly = 3; - services.prometheus.rules = mkIf thereAreZfsFilesystems [ - '' - groups: - - name: zfs - rules: - - alert: ZPoolStatusDegraded - expr: node_zfs_zpool_state{state!="online"} > 0 - '' - ]; - # Actually panic when ZFS "panics" # https://utcc.utoronto.ca/~cks/space/blog/linux/ZFSPanicsNotKernelPanics boot.extraModprobeConfig = mkIf thereAreZfsFilesystems '' @@ -273,6 +263,20 @@ in }; }; + services.prometheus.rules = [ + '' + groups: + - name: disk + rules: + - alert: DiskSpaceLow + expr: node_filesystem_avail_bytes{fstype!~"(ramfs|tmpfs)"} / node_filesystem_size_bytes < 0.1 + - name: zfs + rules: + - alert: ZPoolStatusDegraded + expr: node_zfs_zpool_state{state!="online"} > 0 + '' + ]; + # Host metrics services.prometheus.exporters.node.enable = promcfg.enable;