Skip to content

Commit

Permalink
Fix monitoring alerts (#4009)
Browse files Browse the repository at this point in the history
  • Loading branch information
santhosh-tg authored Jun 10, 2024
1 parent 543bc9c commit 85fde54
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ spec:
- alert: TargetDown
annotations:
message: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.'
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10
expr: 100 * (count(up == 0) BY (namespace, service) / count(up) BY (namespace, service)) > 10
for: 10m
labels:
severity: critical
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,12 @@ spec:
message: {{`'{{ $labels.nodename }} ({{ $labels.host }}) has a high load average. Load average is {{ $value }}%.'`}}
summary: {{`'HIGH LOAD AVERAGE warning ON {{ $labels.nodename }}'`}}
# Critical alert: fires when the selected scrape target has reported up == 0 for 1m.
- alert: node_exporter_down_critical
# Removed side of the diff: expression matched every scrape job.
expr: up == 0
# Added side of the diff: expression restricted to the vm-node-exporter job.
expr: up{job="vm-node-exporter"} == 0
for: 1m
labels:
severity: critical
annotations:
# Removed side of the diff: message as a plain (unquoted) YAML scalar.
message: {{`The node exporter '{{ $labels.job }}' is down.`}}
# Added side of the diff: message as a single-quoted YAML scalar. Inner single quotes
# must be doubled ('') as the summary line does; a lone quote would end the scalar
# early and make the rendered manifest unparseable.
message: {{`'The node exporter ''{{ $labels.job }}'' is down.'`}}
summary: {{`'NODE EXPORTER SERVICE critical: NODE ''{{ $labels.host }}'''`}}
- alert: node_running_out_of_disk_space_warning
expr: sum by(nodename) (((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) * 100 / node_filesystem_size_bytes{mountpoint="/"} * on(instance) group_left(nodename) node_uname_info) >= {{ .Values.node_disk_usage_percentage_threshold_Warning }} and ((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) * 100 / node_filesystem_size_bytes{mountpoint="/"} * on(instance) group_left(nodename) node_uname_info) < {{ .Values.node_disk_usage_percentage_threshold_Critical }} )
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ spec:
description: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.'
runbook_url: {{ .Values.defaultRules.runbookUrl }}/general/targetdown
summary: One or more targets are unreachable.
expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job, namespace, service)) > 10
expr: 100 * (count(up == 0) BY (cluster,namespace, service) / count(up) BY (cluster,namespace, service)) > 10
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
Expand Down Expand Up @@ -122,4 +122,4 @@ spec:
{{- end }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}

0 comments on commit 85fde54

Please sign in to comment.