diff --git a/src/prometheus_alert_rules/host_health.rules b/src/prometheus_alert_rules/host_health.rules new file mode 100644 index 0000000..803e2e5 --- /dev/null +++ b/src/prometheus_alert_rules/host_health.rules @@ -0,0 +1,25 @@ +groups: +- name: HostHealth + rules: + - alert: HostDown + expr: up < 1 + for: 5m + labels: + severity: critical + annotations: + summary: Host '{{labels.instance}}' is down. + description: >- + Host '{{ $labels.instance }}' is down. + VALUE = {{ $value }} + LABELS = {{ $labels }} + - alert: HostUnavailable + expr: absent(up) + for: 5m + labels: + severity: critical + annotations: + summary: Metrics not received from host '{{labels.instance}}'. + description: >- + The metrics endpoint for host '{{ $labels.instance }}' is unreachable. + VALUE = {{ $value }} + LABELS = {{ $labels }}