From 1eb467283c274f58cbc93fadcf991a0ea7657653 Mon Sep 17 00:00:00 2001
From: kaliraja <34502260+Kaali09@users.noreply.github.com>
Date: Fri, 16 Apr 2021 10:12:45 +0530
Subject: [PATCH] TG-910: feat: custom email template for alerts (#2400)
* Issue #added custom subject feature
* Issue #corrected the indentation
* Issue #added the custom html file
* Issue #added the template config
* Issue #corrected the template error
* Issue #corrected the template error
* Issue # removed the slack title link
* Issue # added the api response latency alert rules.
* Issue # added the api response latency alert rules.
* Issue # added the api response latency alert rules.
* Issue # added the api response latency alert rules.
* Issue # added the api response variable
* Issue # TG-910 corrected the data type error
* Issue # TG-910 corrected the slack title
* Issue # TG-910 removed the unwanted templates
* Issue # TG-910 removed the nodename and instance name from template
* Issue # TG-910 updated the var
* Issue # TG-910 updated ansible var
* Issue # TG-910 added the description for vars.
---
.../sunbird-monitoring/defaults/main.yml | 3 +
.../files/sunbird_alert_html.tmpl | 124 ++++++++++++++++++
.../templates/alertrules.yaml | 89 ++++++++++++-
.../templates/prometheus-operator.yaml | 34 +++--
.../templates/promrulesApiResponse.yml | 29 ++++
5 files changed, 265 insertions(+), 14 deletions(-)
create mode 100644 kubernetes/ansible/roles/sunbird-monitoring/files/sunbird_alert_html.tmpl
create mode 100644 kubernetes/helm_charts/monitoring/alertrules/templates/promrulesApiResponse.yml
diff --git a/kubernetes/ansible/roles/sunbird-monitoring/defaults/main.yml b/kubernetes/ansible/roles/sunbird-monitoring/defaults/main.yml
index 0726c36c29..f96b771f02 100644
--- a/kubernetes/ansible/roles/sunbird-monitoring/defaults/main.yml
+++ b/kubernetes/ansible/roles/sunbird-monitoring/defaults/main.yml
@@ -240,6 +240,7 @@ ignore_alert_list: #### This is the var being used to skip the list of alerts.
- KubeSchedulerDown
- KubeAPIDown
- KubeStateMetricsDown
+ - PrometheusDuplicateTimestamps
igonore_alert_list_custom: [] #### This var can be used for custom related. For example if an adopter want to add any alerts to be skipped then those list can be added here instead of taking entire list and overriding the same by adding in the private repo.
# for example refer the below snippet:
@@ -266,3 +267,5 @@ service_health_checks:
targets: "http://lms-service.{{ namespace }}.svc.cluster.local:9000/health"
- service_name: 'learner'
targets: "http://learner-service.{{ namespace }}.svc.cluster.local:9000/health"
+
+api_response_upward_trend_threshold: 0.3 #### Threshold used by the API response alert rules; an alert triggers when the rate of increase in an API's response time exceeds this value.
diff --git a/kubernetes/ansible/roles/sunbird-monitoring/files/sunbird_alert_html.tmpl b/kubernetes/ansible/roles/sunbird-monitoring/files/sunbird_alert_html.tmpl
new file mode 100644
index 0000000000..1c01b9bdd8
--- /dev/null
+++ b/kubernetes/ansible/roles/sunbird-monitoring/files/sunbird_alert_html.tmpl
@@ -0,0 +1,124 @@
+{{ define "email.sunbird.html" }}{{/* Sunbird HTML email body for Alertmanager: prints the alert count with the group labels, a link to Alertmanager, then the labels and annotations of each firing alert followed by each resolved alert. */}}
+
+
+
+
+
+
+{{ template "__subject" . }}
+
+
+
+
+
+
+
+ |
+
+
+
+
+
+ {{ .Alerts | len }} alert{{ if gt (len .Alerts) 1 }}s{{ end }} for {{ range .GroupLabels.SortedPairs }}
+ {{ .Name }}={{ .Value }}
+ {{ end }}
+ |
+
+
+
+
+
+
+ View in {{ template "__alertmanager" . }}
+ |
+
+ {{ if gt (len .Alerts.Firing) 0 }}
+
+
+ |
+
+ {{ end }}
+ {{ range .Alerts.Firing }}
+
+
+ Labels
+ {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }} {{ end }}
+ {{ if gt (len .Annotations) 0 }}Annotations {{ end }}
+ {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }} {{ end }}
+ |
+
+ {{ end }}
+
+ {{ if gt (len .Alerts.Resolved) 0 }}
+ {{ if gt (len .Alerts.Firing) 0 }}
+
+
+
+
+
+ |
+
+ {{ end }}
+
+
+ [{{ .Alerts.Resolved | len }}] Resolved
+ |
+
+ {{ end }}
+ {{ range .Alerts.Resolved }}
+
+
+ Labels
+ {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }} {{ end }}
+ {{ if gt (len .Annotations) 0 }}Annotations {{ end }}
+ {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }} {{ end }}
+ Source
+ |
+
+ {{ end }}
+
+ |
+
+
+
+
+ |
+ |
+
+
+
+
+
+
+{{ end }}
diff --git a/kubernetes/ansible/roles/sunbird-monitoring/templates/alertrules.yaml b/kubernetes/ansible/roles/sunbird-monitoring/templates/alertrules.yaml
index ba74c9e9b6..4b82c47210 100644
--- a/kubernetes/ansible/roles/sunbird-monitoring/templates/alertrules.yaml
+++ b/kubernetes/ansible/roles/sunbird-monitoring/templates/alertrules.yaml
@@ -85,5 +85,92 @@ secor_process_names:
redis_process_names:
{{ redis_process_names | to_yaml | indent( width=2) }}
-service_health_checks:
+service_health_checks: # Used to check each service's health; an alert is sent if any service's health check is false.
{{ service_health_checks | to_yaml | indent( width=2) }}
+
+# Used by the alert rules: an alert is triggered when the rate of increase in any API's response time exceeds this threshold.
+api_response_upward_trend_threshold: "{{ api_response_upward_trend_threshold }}"
+api_names: # API names covered by the API response alert rules (presumably one rule per entry; confirm against the alertrules chart).
+ - getUserProfileV2
+ - collectionSummaryAgg
+ - echo
+ - sendEmailNotification
+ - listReports
+ - userExistenceApiV2
+ - uploadTOC
+ - submitDataExhaustRequest
+ - downloadTOC
+ - getReport
+ - contentImportAPI
+ - publishContent
+ - channelSpecificTelemetryExhaust
+ - listUserCourseEnrollments
+ - readCertTemplate
+ - createLock
+ - retireContent
+ - downloadRegCertificateV2
+ - createUserLearnerV3
+ - updateDesktopApp
+ - createUserVersion3
+ - createContent
+ - deviceProfile
+ - assemblePage
+ - createUserVersion4
+ - groupActivityAgg
+ - courseBatchAddCertificateTemplate
+ - readFramework
+ - listDataExhaustRequest
+ - readChannel
+ - listCourseEnrollments
+ - compositeSearch
+ - searchContent
+ - getCourseHierarchy
+ - dialAssemble
+ - retireLock
+ - readForm
+ - downloadRegCertificate
+ - updateUser
+ - updateGroup
+ - readGroup
+ - courseUnEnrolment
+ - readContentState
+ - courseEnrolment
+ - getUserProfile
+ - updateContentState
+ - PrivateContentReadAPIs
+ - getUserProfileV3
+ - createGroup
+ - updateUserDeclarations
+ - readContent
+ - generateOtp
+ - updateUserConsent
+ - listLock
+ - mergeUserAccounts
+ - registerMobileDevice
+ - getUserByKey
+ - acceptTermsAndCondition
+ - verifyOtp
+ - searchUser
+ - searchRegCertificate
+ - searchCourseBatches
+ - readUserConsent
+ - listGroup
+ - getReportSummary
+ - masterLocationSearch
+ - updateGroupMembership
+ - searchManagedUser
+ - userExistenceApi
+ - searchOrgExtended
+ - userFeed
+ - validateRegCertificate
+ - readRoleMapping
+ - searchOrg
+ - telemetry
+ - readOrg
+ - getBatch
+ - registerMobileDevicev2
+ - refreshToken
+ - registerDesktopDevicev2
+ - deviceRegister
+ - getSystemSettings
+ - telemetryErrorLogging
diff --git a/kubernetes/ansible/roles/sunbird-monitoring/templates/prometheus-operator.yaml b/kubernetes/ansible/roles/sunbird-monitoring/templates/prometheus-operator.yaml
index 197aa9d4ee..a1559e26ae 100644
--- a/kubernetes/ansible/roles/sunbird-monitoring/templates/prometheus-operator.yaml
+++ b/kubernetes/ansible/roles/sunbird-monitoring/templates/prometheus-operator.yaml
@@ -135,6 +135,8 @@ alertmanager:
api_url: "{{ monitor_alerts_slack_url }}"
username: 'Monitor - Alerter'
channel: "{{ monitor_alerts_slack_channel }}"
+ title_link: ""
+ title: '{% raw %}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]{% endraw %}'
text: |-
{{ "{{" }} range .Alerts {{ "}}" }}
*Alert:* {{ "{{" }} .Annotations.message {{ "}}" }} - `{{ "{{" }} .Labels.severity {{ "}}" }}`
@@ -165,9 +167,9 @@ alertmanager:
email_configs:
- send_resolved: true
to: '{{ default_mailing_list }}'
- html: '{% raw %}{{ template "email.default.html" . }}{% endraw %}'
+ html: '{% raw %}{{ template "email.sunbird.html" . }}{% endraw %}'
headers:
- subject: '[{{ kubernetes_cluster_name }}] {% raw %}{{ .GroupLabels.alertname }}{% endraw %}'
+ subject: '{% raw %}{{ template "email.sunbird.subject" . }}{% endraw %}'
- name: 'dp-lag_slack_warning'
slack_configs:
@@ -190,9 +192,9 @@ alertmanager:
email_configs:
- send_resolved: true
to: '{{ default_mailing_list }}'
- html: '{% raw %}{{ template "email.default.html" . }}{% endraw %}'
+ html: '{% raw %}{{ template "email.sunbird.html" . }}{% endraw %}'
headers:
- subject: '[{{ kubernetes_cluster_name }}] {% raw %}{{ .GroupLabels.alertname }}{% endraw %}'
+ subject: '{% raw %}{{ template "email.sunbird.subject" . }}{% endraw %}'
- name: 'dp-lag_slack_critical'
slack_configs:
@@ -215,9 +217,9 @@ alertmanager:
email_configs:
- send_resolved: true
to: '{{ default_mailing_list }}'
- html: '{% raw %}{{ template "email.default.html" . }}{% endraw %}'
+ html: '{% raw %}{{ template "email.sunbird.html" . }}{% endraw %}'
headers:
- subject: '[{{ kubernetes_cluster_name }}] {% raw %}{{ .GroupLabels.alertname }}{% endraw %}'
+ subject: '{% raw %}{{ template "email.sunbird.subject" . }}{% endraw %}'
- name: 'null'
{% for item in alert_teams %}
@@ -226,9 +228,9 @@ alertmanager:
email_configs:
- send_resolved: true
to: '{{ item.alerts_mailing_list }}'
- html: '{% raw %}{{ template "email.default.html" . }}{% endraw %}'
+ html: '{% raw %}{{ template "email.sunbird.html" . }}{% endraw %}'
headers:
- subject: '[{{ kubernetes_cluster_name }}] {% raw %}{{ .GroupLabels.alertname }}{% endraw %}'
+ subject: '{% raw %}{{ template "email.sunbird.subject" . }}{% endraw %}'
{% if item.severity_mailing_filter is defined and item.severity_mailing_filter|length %}
{% for filter in item.severity_mailing_filter %}
# Comment to ensure proper indentation while templating
@@ -236,9 +238,9 @@ alertmanager:
email_configs:
- send_resolved: true
to: '{{ filter.alerts_mailing_list }}'
- html: '{% raw %}{{ template "email.default.html" . }}{% endraw %}'
+ html: '{% raw %}{{ template "email.sunbird.html" . }}{% endraw %}'
headers:
- subject: '[{{ kubernetes_cluster_name }}] {% raw %}{{ .GroupLabels.alertname }}{% endraw %}'
+ subject: '{% raw %}{{ template "email.sunbird.subject" . }}{% endraw %}'
{% endfor %}
{% endif %}
{% endfor %}
@@ -247,10 +249,16 @@ alertmanager:
email_configs:
- send_resolved: true
to: '{{ default_mailing_list }}'
- html: '{% raw %}{{ template "email.default.html" . }}{% endraw %}'
+ html: '{% raw %}{{ template "email.sunbird.html" . }}{% endraw %}'
headers:
- subject: '[{{ kubernetes_cluster_name }}] {% raw %}{{ .GroupLabels.alertname }}{% endraw %}'
-
+ subject: '{% raw %}{{ template "email.sunbird.subject" . }}{% endraw %}'
+ templates: # Path glob for the .tmpl template files listed under templateFiles below.
+ - '/etc/alertmanager/config/*.tmpl'
+ templateFiles: # The subject template is defined inline; the HTML template is read from files/sunbird_alert_html.tmpl.
+ sunbird_alert_subject.tmpl: |-
+{% raw %} {{ define "email.sunbird.subject" }}[{{ .Status | toUpper }}] [{{ .CommonLabels.severity }}] [{{ .CommonLabels.env }}] [{{ .CommonLabels.alertname }}] {{ end }}{% endraw %}{''}
+ sunbird_alert_html.tmpl: |-
+ "{{ lookup('file', '../files/sunbird_alert_html.tmpl') | indent( width=8) }}"
# Adding alertmanager custom spec overrides
# Refrencing alert manager yaml anchor
<<: *alertmanager_spec_overrides
diff --git a/kubernetes/helm_charts/monitoring/alertrules/templates/promrulesApiResponse.yml b/kubernetes/helm_charts/monitoring/alertrules/templates/promrulesApiResponse.yml
new file mode 100644
index 0000000000..a9475212d2
--- /dev/null
+++ b/kubernetes/helm_charts/monitoring/alertrules/templates/promrulesApiResponse.yml
@@ -0,0 +1,29 @@
+## description: Alert rules that trigger when an API's average response time shows an upward trend.
+## NOTE(review): expr takes irate() of the latency sum rather than sum/count; confirm this matches the intended "avg response time".
+---
+{{ $api_response_upward_trend_threshold := .Values.api_response_upward_trend_threshold -}}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    role: alert-rules
+    app: {{ .Values.prometheus_rule_selector_app }}
+    release: {{ .Values.prometheus_rule_selector_release }}
+  name: {{ .Values.fullnameOverride }}-api-response-rules
+  namespace: {{ .Values.namespace }}
+spec:
+  groups:
+  - name: alertrules.api.response
+    rules:
+    {{- if .Values.api_names }}
+    ## One alert rule is generated per entry in .Values.api_names.
+    {{- range .Values.api_names }}
+    - alert: api_response_upward_trend {{ . }}
+      expr: irate(kong_upstream_latency_time_sum{api={{ . | quote }} }[5m]) > {{ $api_response_upward_trend_threshold }}
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        message: There is an upward trend in this api {{ . }} avg response time.
+    {{- end }}
+    {{- end }}