From 0ded266cb0bf11395990a341c0fb27b91cfe3733 Mon Sep 17 00:00:00 2001
From: Jaskaran Sarkaria
Date: Wed, 28 Feb 2024 15:54:03 +0000
Subject: [PATCH] =?UTF-8?q?ci:=20=F0=9F=8E=A1=20add=20ingress=20alerts=20(?=
 =?UTF-8?q?#78)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* ci: 🎡 add ingress alerts

* terraform-docs: automated action

---------

Co-authored-by: github-actions[bot]
---
Review note: IngressControllerIsOOMKilled now uses "for: 1m" instead of 15m.
Its expression is only true while the restart is still inside the 5m
increase() window, so a hold time longer than that window could not be
satisfied by a single OOM kill and the alert would stay pending forever.
The blob id on the resources/alerts.yaml index line predates this change.

 README.md             |  1 +
 main.tf               |  9 +++++++++
 resources/alerts.yaml | 30 ++++++++++++++++++++++++++++++
 3 files changed, 40 insertions(+)
 create mode 100644 resources/alerts.yaml

diff --git a/README.md b/README.md
index 52b30ac..996b938 100644
--- a/README.md
+++ b/README.md
@@ -44,6 +44,7 @@ No modules.
 |------|------|
 | [helm_release.nginx_ingress](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
 | [kubectl_manifest.nginx_ingress_default_certificate](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource |
+| [kubectl_manifest.prometheus_rule_alert](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource |
 | [kubernetes_config_map.fluent-bit-config](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/config_map) | resource |
 | [kubernetes_config_map.fluent_bit_lua_script](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/config_map) | resource |
 | [kubernetes_config_map.logrotate_config](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/config_map) | resource |
diff --git a/main.tf b/main.tf
index a02f93c..6039752 100644
--- a/main.tf
+++ b/main.tf
@@ -101,3 +101,12 @@ resource "kubectl_manifest" "nginx_ingress_default_certificate" {
     kubernetes_namespace.ingress_controllers
   ]
 }
+
+#########################
+# prometheus rule alert #
+#########################
+resource "kubectl_manifest" "prometheus_rule_alert" {
+  count      = var.controller_name == "default" ? 1 : 0
+  depends_on = [helm_release.nginx_ingress]
+  yaml_body  = file("${path.module}/resources/alerts.yaml")
+}
diff --git a/resources/alerts.yaml b/resources/alerts.yaml
new file mode 100644
index 0000000..b70d7e2
--- /dev/null
+++ b/resources/alerts.yaml
@@ -0,0 +1,30 @@
+---
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: ingress-controller-errors
+  namespace: ingress-controllers
+  labels:
+    prometheus: cloud-platform
+spec:
+  groups:
+  - name: ingress-controllers
+    rules:
+    - alert: IngressControllerIsCrashLoopBackoffing
+      expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics",namespace="ingress-controllers"}[15m]) * 60 * 15 > 0
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        message: An Ingress Controller pod is CrashLoopBackOff'ing
+    - alert: IngressControllerIsOOMKilled
+      expr: |-
+        kube_pod_container_status_last_terminated_reason{container="controller",namespace="ingress-controllers",reason="OOMKilled"} == 1
+        and on(container, namespace, pod) increase(kube_pod_container_status_restarts_total{container="controller",namespace="ingress-controllers"}[5m]) > 0
+      for: 1m
+      labels:
+        severity: warning
+      annotations:
+        message: |
+          An Ingress Controller pod has restarted because of OOMKilled. This alert works by watching for a pod that has been restarted within 5 minutes and the last termination status is OOMKilled.
+