From 4ce71bed14666083a942d4e95efd9a3d02d38417 Mon Sep 17 00:00:00 2001
From: jaskaransarkaria
Date: Wed, 28 Feb 2024 12:35:01 +0000
Subject: [PATCH] =?UTF-8?q?ci:=20=F0=9F=8E=A1=20add=20ingress=20alerts?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (text between the "---" marker and the first diff header is not
part of the commit):
  * IngressControllerIsOOMKilled now uses "for: 0m" instead of "for: 15m".
    The increase(...[5m]) term is only above 0 while a restart is inside the
    5m window, so a 15m pending period could never be satisfied by a single
    OOM kill.
  * Explanatory comments were added to both files; hunk sizes and the
    diffstat below reflect the new line counts.
  * NOTE(review): indentation of the context and added lines was rebuilt
    using conventional 2-space style - confirm against the target tree, or
    apply with --ignore-whitespace.
  * NOTE(review): the blob ids on the "index" lines are those of the original
    patch and were not recomputed.

 main.tf               | 12 ++++++++++++
 resources/alerts.yaml | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)
 create mode 100644 resources/alerts.yaml

diff --git a/main.tf b/main.tf
index a02f93c..6039752 100644
--- a/main.tf
+++ b/main.tf
@@ -101,3 +101,15 @@ resource "kubectl_manifest" "nginx_ingress_default_certificate" {
     kubernetes_namespace.ingress_controllers
   ]
 }
+
+#########################
+# prometheus rule alert #
+#########################
+# Applies the PrometheusRule manifest from resources/alerts.yaml. It is only
+# created when controller_name is "default", and only after the nginx_ingress
+# Helm release.
+resource "kubectl_manifest" "prometheus_rule_alert" {
+  count      = var.controller_name == "default" ? 1 : 0
+  depends_on = [helm_release.nginx_ingress]
+  yaml_body  = file("${path.module}/resources/alerts.yaml")
+}
diff --git a/resources/alerts.yaml b/resources/alerts.yaml
new file mode 100644
index 0000000..b70d7e2
--- /dev/null
+++ b/resources/alerts.yaml
@@ -0,0 +1,35 @@
+---
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: ingress-controller-errors
+  namespace: ingress-controllers
+  labels:
+    prometheus: cloud-platform
+spec:
+  groups:
+  - name: ingress-controllers
+    rules:
+    # Fires when the restart counter in this namespace has been rising (15m rate) for 10m.
+    - alert: IngressControllerIsCrashLoopBackoffing
+      expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics",namespace="ingress-controllers"}[15m]) * 60 * 15 > 0
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        message: An Ingress Controller pod is CrashLoopBackOff'ing
+    # Fires when a "controller" container whose last termination reason is
+    # OOMKilled has restarted within the last 5m. "for" is 0m because the
+    # increase(...[5m]) term is only above 0 while a restart is inside that
+    # window, so a longer "for" would hide one-off OOM kills.
+    - alert: IngressControllerIsOOMKilled
+      expr: |-
+        kube_pod_container_status_last_terminated_reason{container="controller",namespace="ingress-controllers",reason="OOMKilled"} == 1
+        and on(container, namespace, pod) increase(kube_pod_container_status_restarts_total{container="controller",namespace="ingress-controllers"}[5m]) > 0
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        message: |
+          An Ingress Controller pod has restarted because of OOMKilled. This alert works by watching for a pod that has been restarted within 5 minutes and the last termination status is OOMKilled.
+