# alerterator.yml
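# Template variables in single braces ({{app}}, {{team}}, {{namespace}}, {{slack-channel}},
# {{slack-notify-type}}) are assumed to be substituted at deploy time; the backslash-escaped
# \{{ $labels.* }} expressions are left intact so Alertmanager can expand them per alert.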
apiVersion: nais.io/v1
kind: Alert
metadata:
  name: {{app}}-alerts
  labels:
    team: {{team}}
  namespace: {{namespace}}
spec:
  receivers:
    slack:
      channel: {{slack-channel}}
      prependText: "{{{slack-notify-type}}}"
  alerts:
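    # Fires when the deployment has had zero available replicas for 2 minutes.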
    - alert: Applikasjon nede
      severity: danger
      expr: kube_deployment_status_replicas_available{deployment="{{app}}"} == 0
      for: 2m
      description: "App \{{ $labels.app }} er nede i namespace \{{ $labels.kubernetes_namespace }}"
      action: "`kubectl describe pod \{{ $labels.kubernetes_pod_name }} -n \{{ $labels.kubernetes_namespace }}` for events, og `kubectl logs \{{ $labels.kubernetes_pod_name }} -n \{{ $labels.kubernetes_namespace }}` for logger"
    - alert: høy andel error i logger
      severity: danger
      expr: (100 * sum by (log_app, log_namespace) (rate(logd_messages_total{log_app="{{app}}",log_level=~"Error"}[3m])) / sum by (log_app, log_namespace) (rate(logd_messages_total{log_app="{{app}}"}[3m]))) > 0
      for: 3m
      action: "Sjekk loggene til app \{{ $labels.log_app }} i namespace \{{ $labels.log_namespace }}, for å se hvorfor det er så mye feil"
    - alert: høy andel warning i logger
      severity: warning
      expr: (100 * sum by (log_app, log_namespace) (rate(logd_messages_total{log_app="{{app}}",log_level=~"Warning"}[3m])) / sum by (log_app, log_namespace) (rate(logd_messages_total{log_app="{{app}}"}[3m]))) > 10
      for: 3m
      action: "Sjekk loggene til app \{{ $labels.log_app }} i namespace \{{ $labels.log_namespace }}, for å se hvorfor det er så mye warnings"
    - alert: Høy andel HTTP serverfeil (5xx responser)
      severity: danger
      expr: floor(increase(ktor_http_server_requests_seconds_count{result="failure", status=~"5.*", app="{{app}}"}[3m])) > 1
      for: 1m
      description: "Følgende request feilet: `Status \{{ $labels.status }} - \{{ $labels.method }} \{{ $labels.route }}`.\n
        Grunn:\n ```\{{ $labels.problem_details }}```\n
        Sjekk loggene for å se hvorfor dette feiler."
      action: "`kubectl logs \{{ $labels.kubernetes_pod_name }} -n \{{ $labels.kubernetes_namespace }}`"
    - alert: Høy andel HTTP klientfeil (4xx responser)
      severity: danger
      expr: floor(increase(ktor_http_server_requests_seconds_count{result="failure", status=~"4.*", status!~"404|401|403", app="{{app}}"}[3m])) > 0
      for: 1m
      description: "Følgende request feilet: `Status \{{ $labels.status }} - \{{ $labels.method }} \{{ $labels.route }}`.\n
        Grunn:\n ```\{{ $labels.problem_details }}```\n
        Sjekk loggene for å se hvorfor dette feiler"
      action: "`kubectl logs \{{ $labels.kubernetes_pod_name }} -n \{{ $labels.kubernetes_namespace }}`"
    - alert: Helsesjekk feiler
      severity: warning
      expr: health_check_status{app="{{app}}"} > 0
      for: 2m
      description: "Sjekk loggene for å se hvorfor helsesjekken feiler."
      action: "`kubectl logs \{{ $labels.kubernetes_pod_name }} -n \{{ $labels.kubernetes_namespace }}`"