Skip to content

Commit

Permalink
Issue #0000 fix: updated the secor backup alert rules vars (#1743)
Browse files Browse the repository at this point in the history
* Issue #0000 fix: updated the secor backup alert rules vars

* Issue #0000 fix: renamed the chart name
  • Loading branch information
Kaali09 authored Jul 21, 2020
1 parent 9d08bcb commit 7c69af3
Show file tree
Hide file tree
Showing 16 changed files with 484 additions and 8 deletions.
8 changes: 6 additions & 2 deletions kubernetes/ansible/roles/sunbird-monitoring/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,12 @@ kafka_graph_events_backup_threshold: 500
kafka_telemetry_raw_backup_threshold: 10000
kafka_telemetry_unique_backup_threshold: 10000
kafka_learning_failed_events_backup: 10000
kafka_telemetry_denorm_backup: 10000
kafka_telemetry_denorm_backup_threshold: 10000
kafka_telemetry_pipeline_metrics_backup: 1000
kafka_telemetry_extractor_failed_backup: 1000
kafka_telemetry_extractor_failed_backup_threshold: 1000
kafka_telemetry_assess_backup: 1000
kafka_telemetry_assess_raw_backup_threshold: 1000
kafka_druid_events_summary_backup_threshold: 10000
kafka_telemetry_extractor_duplicate_backup_threshold: 1000
kafka_telemetry_duplicate_backup_threshold: 1000

Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,11 @@ kafka_graph_events_backup_threshold: "{{ kafka_graph_events_backup_threshold }}"
kafka_telemetry_raw_backup_threshold: "{{ kafka_telemetry_raw_backup_threshold }}"
kafka_telemetry_unique_backup_threshold: "{{ kafka_telemetry_unique_backup_threshold }}"
kafka_learning_failed_events_backup: "{{ kafka_learning_failed_events_backup }}"
kafka_telemetry_denorm_backup: "{{ kafka_telemetry_denorm_backup }}"
kafka_telemetry_denorm_backup_threshold: "{{ kafka_telemetry_denorm_backup_threshold }}"
kafka_telemetry_pipeline_metrics_backup: "{{ kafka_telemetry_pipeline_metrics_backup }}"
kafka_telemetry_extractor_failed_backup: "{{ kafka_telemetry_extractor_failed_backup }}"
kafka_telemetry_extractor_failed_backup_threshold: "{{ kafka_telemetry_extractor_failed_backup_threshold }}"
kafka_telemetry_assess_backup: "{{ kafka_telemetry_assess_backup }}"
kafka_telemetry_assess_raw_backup_threshold: "{{ kafka_telemetry_assess_raw_backup_threshold }}"
kafka_druid_events_summary_backup_threshold: "{{ kafka_druid_events_summary_backup_threshold }}"
kafka_telemetry_extractor_duplicate_backup_threshold: "{{ kafka_telemetry_extractor_duplicate_backup_threshold }}"
kafka_telemetry_duplicate_backup_threshold: "{{ kafka_telemetry_duplicate_backup_threshold }}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Helm values override for the processing-kafka-exporter chart.
# This file is a Jinja2 template: Ansible renders it to text before Helm reads it.
kafkaExporter:
  zookeeper:
    # Every host of the inventory group except localhost, suffixed with the
    # ZooKeeper port. to_json emits a valid YAML flow sequence and renders an
    # empty group as [] instead of a list holding one empty string.
    servers: {{ groups['processing-cluster-zookeepers'] | difference(['localhost']) | map('regex_replace', '^(.*)$', '\\1:2181') | list | to_json }}
  kafka:
    # Same construction for the Kafka brokers, suffixed with the broker port.
    servers: {{ groups['processing-cluster-kafka'] | difference(['localhost']) | map('regex_replace', '^(.*)$', '\\1:9092') | list | to_json }}
  # Extra command-line flags appended verbatim to the exporter's arguments.
  additionalFlags:
    - --use.consumelag.zookeeper

prometheus:
  serviceMonitor:
    enabled: true
    namespace: monitoring
    interval: "120s"
    scrapeTimeout: "90s"
    # Labels added to the monitoring resources the chart creates.
    # NOTE(review): presumably these must match the prometheus-operator
    # selectors in the target cluster - confirm.
    additionalLabels:
      app: prometheus-operator
      release: prometheus-operator
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ spec:
summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.learning.failed.events.backup

- alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.denorm.backup consumer group lag
expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.denorm.backup"} > {{ .Values.kafka_telemetry_denorm_backup }}
expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.denorm.backup"} > {{ .Values.kafka_telemetry_denorm_backup_threshold }}
for: 5m
labels:
severity: critical
Expand All @@ -112,7 +112,7 @@ spec:
summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.pipeline_metrics

- alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.extractor.failed consumer group lag
expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.extractor.failed"} > {{ .Values.kafka_telemetry_extractor_failed_backup }}
expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.extractor.failed"} > {{ .Values.kafka_telemetry_extractor_failed_backup_threshold }}
for: 5m
labels:
severity: critical
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
---
# PrometheusRule holding the secor consumer-group lag alerts.
# Every rule below compares kafka_consumergroupzookeeper_lag_zookeeper for one
# consumer group (restricted to job="processing-kafka-exporter") against a
# threshold read from .Values; the condition must hold for 5m and the alert is
# labelled severity: critical.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
role: alert-rules
app: {{ .Values.prometheus_rule_selector_app }}
release: {{ .Values.prometheus_rule_selector_release }}
name: {{ .Values.fullnameOverride }}-kafkalag-rules
namespace: {{ .Values.namespace }}
spec:
groups:
- name: alertrules.kafkalag
rules:
# NOTE(review): the first two alert names end in "group lag" while all later ones end in "consumer group lag" - confirm whether the difference is intentional.
- alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.assess.raw group lag
expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.assess.raw", job="processing-kafka-exporter"} > {{ .Values.kafka_telemetry_assess_raw_backup_threshold }}
for: 5m
labels:
severity: critical
annotations:
# The backtick-quoted raw strings keep the Prometheus value/label placeholders out of Helm's own rendering; only the topic prefix is substituted by Helm. The same pattern is used by every message below.
message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.assess.raw consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}}
summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.assess.raw

- alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.denorm.backup group lag
expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.denorm.backup", job="processing-kafka-exporter"} > {{ .Values.kafka_telemetry_denorm_backup_threshold }}
for: 5m
labels:
severity: critical
annotations:
message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.denorm.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}}
summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.denorm.backup

# NOTE(review): kafka_telemetry_derived_backup_threshold is not among the values touched by this change - confirm it is defined in values.yaml.
- alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.derived.unique.backup consumer group lag
expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.derived.unique.backup", job="processing-kafka-exporter"} > {{ .Values.kafka_telemetry_derived_backup_threshold }}
for: 5m
labels:
severity: critical
annotations:
message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.derived.unique.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}}
summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.derived.unique.backup

- alert: secor {{ .Values.kafka_topic_prefix }}.druid.events.summary consumer group lag
expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.druid.events.summary", job="processing-kafka-exporter"} > {{ .Values.kafka_druid_events_summary_backup_threshold }}
for: 5m
labels:
severity: critical
annotations:
message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.druid.events.summary consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}}
summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.druid.events.summary

- alert: secor {{ .Values.kafka_topic_prefix }}.extractor.duplicate.backup consumer group lag
expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.extractor.duplicate.backup", job="processing-kafka-exporter"} > {{ .Values.kafka_telemetry_extractor_duplicate_backup_threshold }}
for: 5m
labels:
severity: critical
annotations:
message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.extractor.duplicate.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}}
summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.extractor.duplicate.backup

- alert: secor {{ .Values.kafka_topic_prefix }}.extractor.failed.backup consumer group lag
expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.extractor.failed.backup", job="processing-kafka-exporter"} > {{ .Values.kafka_telemetry_extractor_failed_backup_threshold }}
for: 5m
labels:
severity: critical
annotations:
message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.extractor.failed.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}}
summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.extractor.failed.backup

# NOTE(review): kafka_telemetry_failed_backup_threshold is not among the values touched by this change - confirm it is defined in values.yaml.
- alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.failed.backup consumer group lag
expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.failed.backup", job="processing-kafka-exporter"} > {{ .Values.kafka_telemetry_failed_backup_threshold }}
for: 5m
labels:
severity: critical
annotations:
message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.failed.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}}
summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.failed.backup

- alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.raw.backup consumer group lag
expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.raw.backup", job="processing-kafka-exporter"} > {{ .Values.kafka_telemetry_raw_backup_threshold }}
for: 5m
labels:
severity: critical
annotations:
message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.raw.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}}
summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.raw.backup

- alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.duplicate.backup consumer group lag
expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.duplicate.backup", job="processing-kafka-exporter"} > {{ .Values.kafka_telemetry_duplicate_backup_threshold }}
for: 5m
labels:
severity: critical
annotations:
message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.duplicate.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}}
summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.duplicate.backup

- alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.unique.backup consumer group lag
expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.unique.backup", job="processing-kafka-exporter"} > {{ .Values.kafka_telemetry_unique_backup_threshold }}
for: 5m
labels:
severity: critical
annotations:
message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.unique.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}}
summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.unique.backup
8 changes: 6 additions & 2 deletions kubernetes/helm_charts/monitoring/alertrules/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,14 @@ kafka_graph_events_backup_threshold: 500
kafka_telemetry_raw_backup_threshold: 10000
kafka_telemetry_unique_backup_threshold: 10000
kafka_learning_failed_events_backup: 10000
kafka_telemetry_denorm_backup: 10000
kafka_telemetry_denorm_backup_threshold: 10000
kafka_telemetry_pipeline_metrics_backup: 1000
kafka_telemetry_extractor_failed_backup: 1000
kafka_telemetry_extractor_failed_backup_threshold: 1000
kafka_telemetry_assess_backup: 1000
kafka_telemetry_assess_raw_backup_threshold: 1000
kafka_druid_events_summary_backup_threshold: 10000
kafka_telemetry_extractor_duplicate_backup_threshold: 1000
kafka_telemetry_duplicate_backup_threshold: 1000

# Node Exporter vars
node_cpu_usage_percentage_threshold_Warning: 75
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Chart metadata for the processing-kafka-exporter Helm chart.
apiVersion: v1
appVersion: "1.0"
description: A Helm chart for Kubernetes
name: processing-kafka-exporter
version: 1.0.0
home: https://github.com/abhishekjiitr/kafka-exporter-helm
maintainers:
- name: abhishekjiitr
# NOTE(review): the value below reads as a redaction placeholder rather than a real e-mail address - restore the maintainer's address or drop the field.
email: [email protected]
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{{- /*
Post-install notes. For a ClusterIP service, print a port-forward command that
targets the release namespace, so the hint also works when that namespace is
not the caller's current kubectl namespace.
*/ -}}
1. To see the metrics:
{{- if contains "ClusterIP" .Values.service.type }}
  kubectl --namespace {{ .Release.Namespace }} port-forward svc/{{ include "kafka-exporter.fullname" . }} {{ .Values.service.port }}
  echo "Visit http://127.0.0.1:{{ .Values.service.port }} to use your application"
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{{/* vim: set filetype=mustache: */}}
{{/*
Short name of the chart: .Values.nameOverride when it is set, otherwise
.Chart.Name. The result is cut to at most 63 characters and a trailing "-"
is removed.
*/}}
{{- define "kafka-exporter.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}

{{/*
Fully qualified app name, resolved in this order:
  1. .Values.fullnameOverride, when set.
  2. .Release.Name alone, when it already contains the chart name
     (nameOverride, falling back to .Chart.Name).
  3. "<release name>-<chart name>" otherwise.
Every branch cuts the result to 63 characters, because some Kubernetes name
fields are limited to this by the DNS naming spec, and removes a trailing "-".
*/}}
{{- define "kafka-exporter.fullname" -}}
{{- if .Values.fullnameOverride -}}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- if contains $name .Release.Name -}}
{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- end -}}
{{- end -}}

{{/*
"<chart name>-<chart version>" string used as the chart label value.
Every "+" in the result is replaced by "_" (presumably because "+" is not
accepted in a label value - confirm), then the string is cut to 63 characters
and a trailing "-" is removed.
*/}}
{{- define "kafka-exporter.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
{{- end -}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
---
# PrometheusRule shipped with the kafka-exporter chart: one recording rule and
# three critical alerts on topic replication, partition leadership and broker
# availability. Internal topics (__consumer_offsets and everything starting
# with __samza_) are excluded with a regex matcher; an exact-match "!=" against
# a pattern containing "*" would exclude nothing.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "kafka-exporter.fullname" . }}
  {{- if .Values.prometheus.serviceMonitor.namespace }}
  namespace: {{ .Values.prometheus.serviceMonitor.namespace }}
  {{- end }}
  labels:
    app.kubernetes.io/name: {{ include "kafka-exporter.name" . }}
    helm.sh/chart: {{ include "kafka-exporter.chart" . }}
    app.kubernetes.io/instance: {{ .Release.Name }}
    app.kubernetes.io/managed-by: {{ .Release.Service }}
    {{- with .Values.prometheus.serviceMonitor.additionalLabels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
spec:
  groups:
    - name: alertrules.kafkaExporter
      rules:
        # In-sync replicas minus configured replicas, per topic partition.
        - record: isr_partition_difference
          expr: kafka_topic_partition_in_sync_replica{topic!~"__consumer_offsets|__samza_.*"} - kafka_topic_partition_replicas{topic!~"__consumer_offsets|__samza_.*"}

        - alert: ISR_not_equal_to_partition
          annotations:
            # The topic label must sit inside the Prometheus template braces to
            # be expanded; the wording matches the "!= 0" condition below.
            message: {{`"ISR and replication difference for {{ $labels.topic }} is non-zero, current value: {{ humanize $value }}"`}}
            summary: {{`"ISR and Replica Mismatch for {{$labels.topic}}"`}}
          expr: isr_partition_difference != 0
          for: 5m
          labels:
            severity: critical

        - alert: Kafka_partition_leader_change_rapidly
          annotations:
            message: {{`"Kafka Partition leader changing rapidly for {{ $labels.topic }}"`}}
            summary: {{`"Kafka Partition leader changing rapidly for {{ $labels.topic }}"`}}
          # NOTE(review): kafka_topic_partition_leader is presumably a gauge
          # holding the leader broker id - confirm. changes() counts value
          # changes on a gauge, whereas rate() is only meaningful for counters.
          expr: changes(kafka_topic_partition_leader{topic!~"__consumer_offsets|__samza_.*"}[5m]) > 0
          for: 5m
          labels:
            severity: critical

        - alert: kafka_broker_unavailable
          annotations:
            summary: {{`"Kafka brokers unavailable"`}}
            message: "There are only {{`{{humanize $value}}`}} kafka brokers available; Expected count: {{len .Values.kafkaExporter.kafka.servers}}"
          # The expected broker count is the number of configured kafka servers.
          expr: kafka_brokers < {{ len .Values.kafkaExporter.kafka.servers }}
          for: 5m
          labels:
            severity: critical
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
{{- /*
Deployment running the kafka-exporter container.

The exporter's command-line arguments are assembled from .Values.kafkaExporter.
Each sub-map (kafka, zookeeper, sasl, tls) is resolved once into a local
variable that falls back to an empty dict, so a missing section renders no
flags instead of aborting the render with a nil-pointer error.
*/ -}}
{{- $exporter := default (dict) .Values.kafkaExporter -}}
{{- $kafka := default (dict) $exporter.kafka -}}
{{- $zookeeper := default (dict) $exporter.zookeeper -}}
{{- $sasl := default (dict) $exporter.sasl -}}
{{- $tls := default (dict) $exporter.tls -}}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "kafka-exporter.fullname" . }}
  labels:
    app.kubernetes.io/name: {{ include "kafka-exporter.name" . }}
    helm.sh/chart: {{ include "kafka-exporter.chart" . }}
    app.kubernetes.io/instance: {{ .Release.Name }}
    app.kubernetes.io/managed-by: {{ .Release.Service }}
spec:
  replicas: {{ .Values.replicaCount }}
  selector:
    matchLabels:
      app.kubernetes.io/name: {{ include "kafka-exporter.name" . }}
      app.kubernetes.io/instance: {{ .Release.Name }}
  template:
    metadata:
      labels:
        app.kubernetes.io/name: {{ include "kafka-exporter.name" . }}
        app.kubernetes.io/instance: {{ .Release.Name }}
    spec:
      containers:
        - name: {{ .Chart.Name }}
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          args:
            {{- /* One flag per configured broker / ZooKeeper server. */}}
            {{- range $kafka.servers }}
            - "--kafka.server={{ . }}"
            {{- end }}
            {{- range $zookeeper.servers }}
            - "--zookeeper.server={{ . }}"
            {{- end }}
            {{- range $exporter.additionalFlags }}
            - "{{ . }}"
            {{- end }}
            {{- if $kafka.version }}
            - --kafka.version={{ $kafka.version }}
            {{- end }}
            {{- if $sasl.enabled }}
            - --sasl.enabled
            {{- if not $sasl.handshake }}
            - --sasl.handshake=false
            {{- end }}
            {{- /*
            Credentials are quoted so special characters cannot break the
            rendered YAML. NOTE(review): they still end up in the pod spec and
            the process argument list - consider a secret-backed mechanism.
            */}}
            - {{ printf "--sasl.username=%s" $sasl.username | quote }}
            - {{ printf "--sasl.password=%s" $sasl.password | quote }}
            {{- end }}
            {{- if $tls.enabled }}
            - --tls.enabled
            - --tls.ca-file=/etc/tls-certs/ca-file
            - --tls.cert-file=/etc/tls-certs/cert-file
            - --tls.key-file=/etc/tls-certs/key-file
            {{- end }}
            {{- with $exporter.log }}
            - --log.level={{ .level }}
            {{- if .enableSarama }}
            - --log.enable-sarama
            {{- end }}
            {{- end }}
          ports:
            - name: metrics
              containerPort: 9308
              protocol: TCP
          {{- if $tls.enabled }}
          {{- /* Mount point matches the /etc/tls-certs paths passed in args. */}}
          volumeMounts:
            - name: tls-certs
              mountPath: "/etc/tls-certs/"
              readOnly: true
          {{- end }}
          resources:
            {{- toYaml .Values.resources | nindent 12 }}
      {{- with .Values.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- if $tls.enabled }}
      {{- /* TLS material is read from a secret named after the release fullname. */}}
      volumes:
        - name: tls-certs
          secret:
            secretName: {{ include "kafka-exporter.fullname" . }}
      {{- end }}
Loading

0 comments on commit 7c69af3

Please sign in to comment.