From 85fde54385ce5613c59a4126bcf7150bbe85917e Mon Sep 17 00:00:00 2001 From: santhosh-tg <93243580+santhosh-tg@users.noreply.github.com> Date: Mon, 10 Jun 2024 10:25:05 +0530 Subject: [PATCH] Fix monitoring alerts (#4009) --- .../monitoring/alertrules/templates/custom_promrules_k8s.yml | 2 +- .../monitoring/alertrules/templates/promrulesNode.yml | 4 ++-- .../templates/prometheus/rules-1.14/general.rules.yaml | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kubernetes/helm_charts/monitoring/alertrules/templates/custom_promrules_k8s.yml b/kubernetes/helm_charts/monitoring/alertrules/templates/custom_promrules_k8s.yml index 9ac3fabd67..f6eaf4fd01 100644 --- a/kubernetes/helm_charts/monitoring/alertrules/templates/custom_promrules_k8s.yml +++ b/kubernetes/helm_charts/monitoring/alertrules/templates/custom_promrules_k8s.yml @@ -15,7 +15,7 @@ spec: - alert: TargetDown annotations: message: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.' - expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10 + expr: 100 * (count(up == 0) BY (namespace, service) / count(up) BY (namespace, service)) > 10 for: 10m labels: severity: critical diff --git a/kubernetes/helm_charts/monitoring/alertrules/templates/promrulesNode.yml b/kubernetes/helm_charts/monitoring/alertrules/templates/promrulesNode.yml index d36e7d6823..309502a509 100644 --- a/kubernetes/helm_charts/monitoring/alertrules/templates/promrulesNode.yml +++ b/kubernetes/helm_charts/monitoring/alertrules/templates/promrulesNode.yml @@ -85,12 +85,12 @@ spec: message: {{`'{{ $labels.nodename }} ({{ $labels.host }}) has a high load average. 
Load average is {{ $value }}%.'`}} summary: {{`'HIGH LOAD AVERAGE warning ON {{ $labels.nodename }}'`}} - alert: node_exporter_down_critical - expr: up == 0 + expr: up{job="vm-node-exporter"} == 0 for: 1m labels: severity: critical annotations: - message: {{`The node exporter '{{ $labels.job }}' is down.`}} + message: {{`'The node exporter ''{{ $labels.job }}'' is down.'`}} summary: {{`'NODE EXPORTER SERVICE critical: NODE ''{{ $labels.host }}'''`}} - alert: node_running_out_of_disk_space_warning expr: sum by(nodename) (((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) * 100 / node_filesystem_size_bytes{mountpoint="/"} * on(instance) group_left(nodename) node_uname_info) >= {{ .Values.node_disk_usage_percentage_threshold_Warning }} and ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) * 100 / node_filesystem_size_bytes{mountpoint="/"} * on(instance) group_left(nodename) node_uname_info) < {{ .Values.node_disk_usage_percentage_threshold_Critical }} ) diff --git a/kubernetes/helm_charts/monitoring/prometheus-operator/templates/prometheus/rules-1.14/general.rules.yaml b/kubernetes/helm_charts/monitoring/prometheus-operator/templates/prometheus/rules-1.14/general.rules.yaml index 8139fcaed9..ce8dbd1795 100644 --- a/kubernetes/helm_charts/monitoring/prometheus-operator/templates/prometheus/rules-1.14/general.rules.yaml +++ b/kubernetes/helm_charts/monitoring/prometheus-operator/templates/prometheus/rules-1.14/general.rules.yaml @@ -36,7 +36,7 @@ spec: description: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.' runbook_url: {{ .Values.defaultRules.runbookUrl }}/general/targetdown summary: One or more targets are unreachable. 
- expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job, namespace, service)) > 10 + expr: 100 * (count(up == 0) BY (cluster,namespace, service) / count(up) BY (cluster,namespace, service)) > 10 for: 10m {{- with .Values.defaultRules.keepFiringFor }} keep_firing_for: "{{ . }}" @@ -122,4 +122,4 @@ spec: {{- end }} {{- end }} {{- end }} -{{- end }} \ No newline at end of file +{{- end }}