Skip to content

Commit

Permalink
Dp monitoring fixes (#1785)
Browse files Browse the repository at this point in the history
* Issue #0000 feat: moving dp alerts to flink channel

* Issue #0000 feat: updated the alertrules

* Issue #0000 feat: corrected the syntax error
  • Loading branch information
Kaali09 authored Aug 7, 2020
1 parent 144d8d2 commit 1bd2969
Show file tree
Hide file tree
Showing 18 changed files with 617 additions and 237 deletions.
8 changes: 6 additions & 2 deletions kubernetes/ansible/roles/sunbird-monitoring/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
fullnameOverride: sunbird-monitoring
monitoring_stack:
- prometheus-operator
- kafka-lag-exporter
- cassandra-jmx-exporter
- elasticsearch-exporter
- logger
Expand All @@ -25,6 +24,8 @@ monitor_alerts_slack_url:
# Slack channel for the general monitoring alerts; no default value is set here.
monitor_alerts_slack_channel:
env: dev
kubernetes_cluster_name: kubernetes-1
# Slack channel and webhook URL referenced by the 'dp-slack' Alertmanager receiver.
# The channel name is the value of env_name followed by "_flink_alerts".
# NOTE(review): `env` is defined just above, but the channel is built from
# `env_name` - confirm env_name is defined elsewhere (inventory / group vars).
dp_monitor_alerts_slack_channel: "{{ env_name }}_flink_alerts"
# NOTE(review): dp_vault_monitor_alerts_slack_url is not defined in this view;
# presumably supplied from a secrets/vault vars file - confirm.
dp_monitor_alerts_slack_url: "{{ dp_vault_monitor_alerts_slack_url }}"

default_critical_mailing_list: "{{ default_mailing_list }}"
default_fatal_mailing_list: "{{ default_mailing_list }}"
Expand Down Expand Up @@ -169,4 +170,7 @@ kafka_telemetry_duplicate_backup_threshold: 1000

### kafka exporters related vars
# Each variable expands to the list of hosts in the named inventory group,
# with 'localhost' removed and the service port appended to every entry
# (2181 for zookeeper, 9092 for kafka).
processing_cluster_zookeeper: "{{ groups['processing-cluster-zookeepers'] | difference(['localhost']) | map('regex_replace', '^(.*)$', '\\1:2181') | list}}"
# Defined exactly once: a repeated key is invalid YAML and most parsers
# silently keep only the last occurrence.
processing_cluster_kafka: "{{ groups['processing-cluster-kafka'] | difference(['localhost']) | map('regex_replace', '^(.*)$', '\\1:9092') | list}}"

# NOTE(review): this group is 'ingestion-cluster-zookeeper' (singular) while the
# processing one above is 'processing-cluster-zookeepers' (plural) - confirm the
# group name against the inventory; an undefined group fails the lookup.
ingestion_cluster_zookeeper: "{{ groups['ingestion-cluster-zookeeper'] | difference(['localhost']) | map('regex_replace', '^(.*)$', '\\1:2181') | list}}"
ingestion_cluster_kafka: "{{ groups['ingestion-cluster-kafka'] | difference(['localhost']) | map('regex_replace', '^(.*)$', '\\1:9092') | list}}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Helm values for the kafka exporter that watches the ingestion cluster.
# This file is a Jinja template: the server lists are filled in from the
# ingestion_cluster_zookeeper / ingestion_cluster_kafka variables.
kafkaExporter:
  zookeeper:
    # Joining with '","' inside [" ... "] renders a YAML flow sequence of
    # quoted host:port strings.
    servers: ["{{ ingestion_cluster_zookeeper | join('","') }}"]
  kafka:
    servers: ["{{ ingestion_cluster_kafka | join('","') }}"]
  # Extra command-line flags handed to the exporter.
  # NOTE(review): presumably makes the exporter read consumer lag for groups
  # stored in zookeeper - confirm against the exporter's flag reference.
  additionalFlags:
    - --use.consumelag.zookeeper

prometheus:
  serviceMonitor:
    enabled: true
    namespace: monitoring
    # Durations are quoted so they are always read as strings; the scrape
    # timeout (90s) is kept below the scrape interval (120s).
    interval: "120s"
    scrapeTimeout: "90s"
    # NOTE(review): presumably these labels let the prometheus-operator release
    # select this ServiceMonitor - confirm against its serviceMonitorSelector.
    additionalLabels:
      app: prometheus-operator
      release: prometheus-operator
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ alertmanager:
routes:
- receiver: slack
continue: true
# Send alerts carrying the label module=dp to the 'dp-slack' receiver.
# No `continue` is set on this route, so matching stops here for those alerts
# and the team routes generated below are not evaluated for them.
# NOTE(review): the catch-all 'slack' route just above has `continue: true`,
# so dp alerts are presumably still delivered to that receiver as well -
# confirm this duplication is intended.
- match:
module: dp
receiver: dp-slack
{% for item in alert_teams %}
# Comment to ensure proper indentation while templating
- match_re:
Expand Down Expand Up @@ -82,6 +85,21 @@ alertmanager:
{{ "{{" }} end {{ "}}" }}
{{ "{{" }} end {{ "}}" }}
icon_emoji: ':dart:'
# Slack receiver for data-pipeline alerts (those routed on module=dp).
# The webhook URL and channel come from the dp_monitor_alerts_slack_* variables.
# The quoted-brace Jinja expressions in `text` emit literal Go-template
# delimiters, so Alertmanager (not Ansible) evaluates the message template.
# The alert title prefers the rule's `summary` annotation and falls back to
# `message` when no summary is set, so title and description no longer repeat
# the same text for rules that define both annotations.
- name: 'dp-slack'
  slack_configs:
    - send_resolved: true
      api_url: "{{ dp_monitor_alerts_slack_url }}"
      username: 'Monitor - Alerter'
      channel: "{{ dp_monitor_alerts_slack_channel }}"
      text: |-
        {{ "{{" }} range .Alerts {{ "}}" }}
        *Alert:* {{ "{{" }} or .Annotations.summary .Annotations.message {{ "}}" }} - `{{ "{{" }} .Labels.severity {{ "}}" }}`
        *Description:* {{ "{{" }} .Annotations.message {{ "}}" }}
        *Details:*
        {{ "{{" }} range .Labels.SortedPairs {{ "}}" }} • *{{ "{{" }} .Name {{ "}}" }}:* `{{ "{{" }} .Value {{ "}}" }}`
        {{ "{{" }} end {{ "}}" }}
        {{ "{{" }} end {{ "}}" }}
      icon_emoji: ':dart:'
{% for item in alert_teams %}
# Comment to ensure proper indentation while templating
- name: "{{ item.team }}"
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ spec:
for: 1m
labels:
severity: critical
module: dp
annotations:
message: {{`'Number of running processes are: {{$value}}'`}}
summary: More than one process running
Expand All @@ -57,6 +58,7 @@ spec:
for: 1m
labels:
severity: critical
module: dp
annotations:
message: {{`'Number of running processes are: {{$value}}'`}}
summary: Secor process is not running
Expand All @@ -65,6 +67,7 @@ spec:
for: 1m
labels:
severity: critical
module: dp
annotations:
message: {{`'Number of running processes are: {{$value}}'`}}
summary: Secor process is not running
Expand All @@ -73,6 +76,7 @@ spec:
for: 1m
labels:
severity: fatal
module: dp
annotations:
message: {{`'Number of running processes are: {{$value}}'`}}
summary: Zookeeper process is not running
Expand Down Expand Up @@ -121,6 +125,7 @@ spec:
for: 1m
labels:
severity: fatal
module: dp
annotations:
message: {{`'Number of running processes are: {{$value}}'`}}
summary: Druid zookeeper is not running
Expand All @@ -129,6 +134,7 @@ spec:
for: 1m
labels:
severity: fatal
module: dp
annotations:
message: {{`'Number of running processes are: {{$value}}'`}}
summary: Druid postgres is not running
Expand All @@ -137,6 +143,7 @@ spec:
for: 1m
labels:
severity: fatal
module: dp
annotations:
message: {{`'Number of running processes are: {{$value}}'`}}
summary: Druid overlord process is not running
Expand All @@ -145,6 +152,7 @@ spec:
for: 1m
labels:
severity: fatal
module: dp
annotations:
message: {{`'Number of running processes are: {{$value}}'`}}
summary: Druid coordinator process is not running
Expand All @@ -153,6 +161,7 @@ spec:
for: 1m
labels:
severity: fatal
module: dp
annotations:
message: {{`'Number of running processes are: {{$value}}'`}}
summary: Druid historical process is not running
Expand All @@ -161,6 +170,7 @@ spec:
for: 1m
labels:
severity: fatal
module: dp
annotations:
message: {{`'Number of running processes are: {{$value}}'`}}
summary: Druid broker process is not running
Expand All @@ -169,6 +179,7 @@ spec:
for: 1m
labels:
severity: fatal
module: dp
annotations:
message: {{`'Number of running processes are: {{$value}}'`}}
summary: Druid middleManager process is not running
Expand All @@ -177,6 +188,7 @@ spec:
for: 1m
labels:
severity: fatal
module: dp
annotations:
message: {{`'Number of running processes are: {{$value}}'`}}
summary: redis-server process is not running
Expand All @@ -193,6 +205,7 @@ spec:
for: 1m
labels:
severity: fatal
module: dp
annotations:
{{`message: 'Druid health for Datasource is < {{ $value }}'`}}
summary: {{`'Druid health for Datasource is < {{ $value }}'`}}
Expand Down
Loading

0 comments on commit 1bd2969

Please sign in to comment.