diff --git a/kubernetes/ansible/roles/sunbird-monitoring/defaults/main.yml b/kubernetes/ansible/roles/sunbird-monitoring/defaults/main.yml index 44045d49ca..e457355990 100644 --- a/kubernetes/ansible/roles/sunbird-monitoring/defaults/main.yml +++ b/kubernetes/ansible/roles/sunbird-monitoring/defaults/main.yml @@ -3,7 +3,6 @@ fullnameOverride: sunbird-monitoring monitoring_stack: - prometheus-operator - - kafka-lag-exporter - cassandra-jmx-exporter - elasticsearch-exporter - logger @@ -25,6 +24,8 @@ monitor_alerts_slack_url: monitor_alerts_slack_channel: env: dev kubernetes_cluster_name: kubernetes-1 +dp_monitor_alerts_slack_channel: "{{ env_name }}_flink_alerts" +dp_monitor_alerts_slack_url: "{{ dp_vault_monitor_alerts_slack_url }}" default_critical_mailing_list: "{{ default_mailing_list }}" default_fatal_mailing_list: "{{ default_mailing_list }}" @@ -169,4 +170,7 @@ kafka_telemetry_duplicate_backup_threshold: 1000 ### kafka exporters related vars processing_cluster_zookeeper: "{{ groups['processing-cluster-zookeepers'] | difference(['localhost']) | map('regex_replace', '^(.*)$', '\\1:2181') | list}}" -processing_cluster_kafka: "{{ groups['processing-cluster-kafka'] | difference(['localhost']) | map('regex_replace', '^(.*)$', '\\1:9092') | list}}" +processing_cluster_kafka: "{{ groups['processing-cluster-kafka'] | difference(['localhost']) | map('regex_replace', '^(.*)$', '\\1:9092') | list}}" + +ingestion_cluster_zookeeper: "{{ groups['ingestion-cluster-zookeeper'] | difference(['localhost']) | map('regex_replace', '^(.*)$', '\\1:2181') | list}}" +ingestion_cluster_kafka: "{{ groups['ingestion-cluster-kafka'] | difference(['localhost']) | map('regex_replace', '^(.*)$', '\\1:9092') | list}}" diff --git a/kubernetes/ansible/roles/sunbird-monitoring/templates/ingestion-kafka-exporter.yaml b/kubernetes/ansible/roles/sunbird-monitoring/templates/ingestion-kafka-exporter.yaml new file mode 100644 index 0000000000..7ddffd74ee --- /dev/null +++ 
b/kubernetes/ansible/roles/sunbird-monitoring/templates/ingestion-kafka-exporter.yaml @@ -0,0 +1,17 @@ +kafkaExporter: + zookeeper: + servers: ["{{ ingestion_cluster_zookeeper | join('","') }}"] + kafka: + servers: ["{{ ingestion_cluster_kafka | join('","') }}"] + additionalFlags: + - --use.consumelag.zookeeper + +prometheus: + serviceMonitor: + enabled: true + namespace: monitoring + interval: "120s" + scrapeTimeout: "90s" + additionalLabels: + app: prometheus-operator + release: prometheus-operator diff --git a/kubernetes/ansible/roles/sunbird-monitoring/templates/prometheus-operator.yaml b/kubernetes/ansible/roles/sunbird-monitoring/templates/prometheus-operator.yaml index 9c30eda7d7..39867a5a85 100644 --- a/kubernetes/ansible/roles/sunbird-monitoring/templates/prometheus-operator.yaml +++ b/kubernetes/ansible/roles/sunbird-monitoring/templates/prometheus-operator.yaml @@ -45,6 +45,11 @@ alertmanager: routes: - receiver: slack continue: true + - match: + module: dp + receiver: dp-slack + # Without continue, alerts labelled module=dp stop at this route and never reach the team routes rendered below + continue: true {% for item in alert_teams %} # Comment to ensure proper indentation while templating - match_re: @@ -82,6 +87,21 @@ alertmanager: {{ "{{" }} end {{ "}}" }} {{ "{{" }} end {{ "}}" }} icon_emoji: ':dart:' + - name: 'dp-slack' + slack_configs: + - send_resolved: true + api_url: "{{ dp_monitor_alerts_slack_url }}" + username: 'Monitor - Alerter' + channel: "{{ dp_monitor_alerts_slack_channel }}" + text: |- + {{ "{{" }} range .Alerts {{ "}}" }} + *Alert:* {{ "{{" }} .Annotations.message {{ "}}" }} - `{{ "{{" }} .Labels.severity {{ "}}" }}` + *Description:* {{ "{{" }} .Annotations.message {{ "}}" }} + *Details:* + {{ "{{" }} range .Labels.SortedPairs {{ "}}" }} • *{{ "{{" }} .Name {{ "}}" }}:* `{{ "{{" }} .Value {{ "}}" }}` + {{ "{{" }} end {{ "}}" }} + {{ "{{" }} end {{ "}}" }} + icon_emoji: ':dart:' {% for item in alert_teams %} # Comment to ensure proper indentation while templating - name: "{{ item.team }}" diff --git 
a/kubernetes/helm_charts/monitoring/alertrules/templates/promrulesKafkaLag.yml b/kubernetes/helm_charts/monitoring/alertrules/templates/promrulesKafkaLag.yml deleted file mode 100644 index 9ebc7c6c83..0000000000 --- a/kubernetes/helm_charts/monitoring/alertrules/templates/promrulesKafkaLag.yml +++ /dev/null @@ -1,130 +0,0 @@ ---- -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - role: alert-rules - app: {{ .Values.prometheus_rule_selector_app }} - release: {{ .Values.prometheus_rule_selector_release }} - name: {{ .Values.fullnameOverride }}-kafkalag-rules - namespace: {{ .Values.namespace }} -spec: - groups: - - name: alertrules.kafkalag - rules: - - alert: secor {{ .Values.kafka_topic_prefix }}.events.deviceprofile.backup group lag - expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.events.deviceprofile.backup"} > {{ .Values.kafka_events_deviceprofile_backup_threshold }} - for: 5m - labels: - severity: critical - annotations: - message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.events.deviceprofile.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.events.deviceprofile.backup - - - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.channel.backup group lag - expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.channel.backup"} > {{ .Values.kafka_events_channel_backup_threshold }} - for: 5m - labels: - severity: critical - annotations: - message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.channel.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.channel.backup - - - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.derived.backup group lag - expr: 
kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.derived.backup"} > {{ .Values.kafka_telemetry_derived_backup_threshold }} - for: 5m - labels: - severity: critical - annotations: - message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.derived.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.derived.backup - - - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.failed.backup group lag - expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.failed.backup"} > {{ .Values.kafka_telemetry_failed_backup_threshold }} - for: 5m - labels: - severity: critical - annotations: - message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.failed.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.failed.backup - - - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.ingestion.backup consumer group lag - expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.ingestion.backup"} > {{ .Values.kafka_telemetry_ingestion_backup_threshold }} - for: 5m - labels: - severity: critical - annotations: - message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.ingestion.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.ingestion.backup - - - alert: secor {{ .Values.kafka_topic_prefix }}.graph.events.backup consumer group lag - expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.graph.events.backup"} > {{ .Values.kafka_graph_events_backup_threshold }} - for: 5m - labels: - severity: critical - annotations: 
- message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.graph.events.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.graph.events.backup - - - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.raw.backup consumer group lag - expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.raw.backup"} > {{ .Values.kafka_telemetry_raw_backup_threshold }} - for: 5m - labels: - severity: critical - annotations: - message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.raw.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.raw.backup - - - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.unique.backup consumer group lag - expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.unique.backup"} > {{ .Values.kafka_telemetry_unique_backup_threshold }} - for: 5m - labels: - severity: critical - annotations: - message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.unique.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.unique.backup - - - alert: secor {{ .Values.kafka_topic_prefix }}.learning.failed.events.backup consumer group lag - expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.learning.failed.events.backup"} > {{ .Values.kafka_learning_failed_events_backup }} - for: 5m - labels: - severity: critical - annotations: - message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.learning.failed.events.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix 
}}.learning.failed.events.backup - - - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.denorm.backup consumer group lag - expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.denorm.backup"} > {{ .Values.kafka_telemetry_denorm_backup_threshold }} - for: 5m - labels: - severity: critical - annotations: - message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.denorm.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.denorm.backup - - - alert: secor {{ .Values.kafka_topic_prefix }}.pipeline_metrics consumer group lag - expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.pipeline_metrics"} > {{ .Values.kafka_telemetry_pipeline_metrics_backup }} - for: 5m - labels: - severity: critical - annotations: - message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.pipeline_metrics consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.pipeline_metrics - - - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.extractor.failed consumer group lag - expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.extractor.failed"} > {{ .Values.kafka_telemetry_extractor_failed_backup_threshold }} - for: 5m - labels: - severity: critical - annotations: - message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.extractor.failed consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.extractor.failed - - - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.assess consumer group lag - expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix 
}}.telemetry.assess"} > {{ .Values.kafka_telemetry_assess_backup }} - for: 5m - labels: - severity: critical - annotations: - message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.assess consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.assess diff --git a/kubernetes/helm_charts/monitoring/alertrules/templates/promrulesProcess.yml b/kubernetes/helm_charts/monitoring/alertrules/templates/promrulesProcess.yml index e849c4abd7..de525de026 100644 --- a/kubernetes/helm_charts/monitoring/alertrules/templates/promrulesProcess.yml +++ b/kubernetes/helm_charts/monitoring/alertrules/templates/promrulesProcess.yml @@ -49,6 +49,7 @@ spec: for: 1m labels: severity: critical + module: dp annotations: message: {{`'Number of running processes are: {{$value}}'`}} summary: More than one process running @@ -57,6 +58,7 @@ spec: for: 1m labels: severity: critical + module: dp annotations: message: {{`'Number of running processes are: {{$value}}'`}} summary: Secor process is not running @@ -65,6 +67,7 @@ spec: for: 1m labels: severity: critical + module: dp annotations: message: {{`'Number of running processes are: {{$value}}'`}} summary: Secor process is not running @@ -73,6 +76,7 @@ spec: for: 1m labels: severity: fatal + module: dp annotations: message: {{`'Number of running processes are: {{$value}}'`}} summary: Zookeeper process is not running @@ -121,6 +125,7 @@ spec: for: 1m labels: severity: fatal + module: dp annotations: message: {{`'Number of running processes are: {{$value}}'`}} summary: Druid zookeeper is not running @@ -129,6 +134,7 @@ spec: for: 1m labels: severity: fatal + module: dp annotations: message: {{`'Number of running processes are: {{$value}}'`}} summary: Druid postgres is not running @@ -137,6 +143,7 @@ spec: for: 1m labels: severity: fatal + module: dp annotations: message: {{`'Number of running processes are: {{$value}}'`}} summary: 
Druid overlord process is not running @@ -145,6 +152,7 @@ spec: for: 1m labels: severity: fatal + module: dp annotations: message: {{`'Number of running processes are: {{$value}}'`}} summary: Druid coordinator process is not running @@ -153,6 +161,7 @@ spec: for: 1m labels: severity: fatal + module: dp annotations: message: {{`'Number of running processes are: {{$value}}'`}} summary: Druid historical process is not running @@ -161,6 +170,7 @@ spec: for: 1m labels: severity: fatal + module: dp annotations: message: {{`'Number of running processes are: {{$value}}'`}} summary: Druid broker process is not running @@ -169,6 +179,7 @@ spec: for: 1m labels: severity: fatal + module: dp annotations: message: {{`'Number of running processes are: {{$value}}'`}} summary: Druid middleManager process is not running @@ -177,6 +188,7 @@ spec: for: 1m labels: severity: fatal + module: dp annotations: message: {{`'Number of running processes are: {{$value}}'`}} summary: redis-server process is not running @@ -193,6 +205,7 @@ spec: for: 1m labels: severity: fatal + module: dp annotations: {{`message: 'Druid health for Datasource is < {{ $value }}'`}} summary: {{`'Druid health for Datasource is < {{ $value }}'`}} diff --git a/kubernetes/helm_charts/monitoring/alertrules/templates/promrulesProcessingKafkaLag.yml b/kubernetes/helm_charts/monitoring/alertrules/templates/promrulesProcessingKafkaLag.yml deleted file mode 100644 index ea99aeb73b..0000000000 --- a/kubernetes/helm_charts/monitoring/alertrules/templates/promrulesProcessingKafkaLag.yml +++ /dev/null @@ -1,103 +0,0 @@ ---- -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - role: alert-rules - app: {{ .Values.prometheus_rule_selector_app }} - release: {{ .Values.prometheus_rule_selector_release }} - name: {{ .Values.fullnameOverride }}-kafkalag-rules - namespace: {{ .Values.namespace }} -spec: - groups: - - name: alertrules.kafkalag - rules: - - alert: secor {{ .Values.kafka_topic_prefix 
}}.telemetry.assess.raw group lag - expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.assess.raw", job="processing-kafka-exporter"} > {{ .Values.kafka_telemetry_assess_raw_backup_threshold }} - for: 5m - labels: - severity: critical - annotations: - message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.assess.raw consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.assess.raw - - - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.denorm.backup group lag - expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.denorm.backup", job="processing-kafka-exporter"} > {{ .Values.kafka_telemetry_denorm_backup_threshold }} - for: 5m - labels: - severity: critical - annotations: - message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.denorm.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.denorm.backup - - - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.derived.unique.backup consumer group lag - expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.derived.unique.backup", job="processing-kafka-exporter"} > {{ .Values.kafka_telemetry_derived_backup_threshold }} - for: 5m - labels: - severity: critical - annotations: - message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.derived.unique.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.derived.unique.backup - - - alert: secor {{ .Values.kafka_topic_prefix }}.druid.events.summary consumer group lag - expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ 
.Values.kafka_topic_prefix }}.druid.events.summary", job="processing-kafka-exporter"} > {{ .Values.kafka_druid_events_summary_backup_threshold }} - for: 5m - labels: - severity: critical - annotations: - message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.druid.events.summary consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.druid.events.summary - - - alert: secor {{ .Values.kafka_topic_prefix }}.extractor.duplicate.backup consumer group lag - expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.extractor.duplicate.backup", job="processing-kafka-exporter"} > {{ .Values.kafka_telemetry_extractor_duplicate_backup_threshold }} - for: 5m - labels: - severity: critical - annotations: - message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.extractor.duplicate.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.extractor.duplicate.backup - - - alert: secor {{ .Values.kafka_topic_prefix }}.extractor.failed.backup consumer group lag - expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.extractor.failed.backup", job="processing-kafka-exporter"} > {{ .Values.kafka_telemetry_extractor_failed_backup_threshold }} - for: 5m - labels: - severity: critical - annotations: - message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.extractor.failed.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.extractor.failed.backup - - - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.failed.backup consumer group lag - expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.failed.backup", job="processing-kafka-exporter"} > {{ 
.Values.kafka_telemetry_failed_backup_threshold }} - for: 5m - labels: - severity: critical - annotations: - message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.failed.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.failed.backup - - - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.raw.backup consumer group lag - expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.raw.backup", job="processing-kafka-exporter"} > {{ .Values.kafka_telemetry_raw_backup_threshold }} - for: 5m - labels: - severity: critical - annotations: - message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.raw.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.raw.backup - - - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.duplicate.backup consumer group lag - expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.duplicate.backup", job="processing-kafka-exporter"} > {{ .Values.kafka_telemetry_duplicate_backup_threshold }} - for: 5m - labels: - severity: critical - annotations: - message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.duplicate.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.duplicate.backup - - - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.unique.backup consumer group lag - expr: kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.unique.backup", job="processing-kafka-exporter"} > {{ .Values.kafka_telemetry_unique_backup_threshold }} - for: 5m - labels: - severity: critical - annotations: - message: {{`"`}}{{ 
.Values.kafka_topic_prefix }}{{`.telemetry.unique.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} - summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.unique.backup diff --git a/kubernetes/helm_charts/monitoring/alertrules/templates/promrulesProcessingSecorLag.yml b/kubernetes/helm_charts/monitoring/alertrules/templates/promrulesProcessingSecorLag.yml new file mode 100644 index 0000000000..1acf8b43b2 --- /dev/null +++ b/kubernetes/helm_charts/monitoring/alertrules/templates/promrulesProcessingSecorLag.yml @@ -0,0 +1,214 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + role: alert-rules + app: {{ .Values.prometheus_rule_selector_app }} + release: {{ .Values.prometheus_rule_selector_release }} + name: {{ .Values.fullnameOverride }}-secor-lag-rules + namespace: {{ .Values.namespace }} +spec: + groups: + - name: alertrules.kafkalag + rules: + - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.assess.raw group lag + expr: sum without(partition) (kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.assess.raw", job="processing-kafka-exporter"}) > {{ .Values.kafka_telemetry_assess_raw_backup_threshold }} + for: 5m + labels: + severity: critical + module: dp + annotations: + message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.assess.raw consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} + summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.assess.raw + + - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.denorm.backup group lag + expr: sum without(partition) (kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.denorm.backup", job="processing-kafka-exporter"}) > {{ .Values.kafka_telemetry_denorm_backup_threshold }} + for: 5m + labels: + severity: critical + module: dp + 
annotations: + message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.denorm.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} + summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.denorm.backup + + - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.derived.unique.backup consumer group lag + expr: sum without(partition) (kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.derived.unique.backup", job="processing-kafka-exporter"}) > {{ .Values.kafka_telemetry_derived_backup_threshold }} + for: 5m + labels: + severity: critical + module: dp + annotations: + message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.derived.unique.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} + summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.derived.unique.backup + + - alert: secor {{ .Values.kafka_topic_prefix }}.extractor.duplicate.backup consumer group lag + expr: sum without(partition) (kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.extractor.duplicate.backup", job="processing-kafka-exporter"}) > {{ .Values.kafka_telemetry_extractor_duplicate_backup_threshold }} + for: 5m + labels: + severity: critical + module: dp + annotations: + message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.extractor.duplicate.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} + summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.extractor.duplicate.backup + + - alert: secor {{ .Values.kafka_topic_prefix }}.extractor.failed.backup consumer group lag + expr: sum without(partition) (kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.extractor.failed.backup", job="processing-kafka-exporter"}) > {{ .Values.kafka_telemetry_extractor_failed_backup_threshold }} + 
for: 5m + labels: + severity: critical + module: dp + annotations: + message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.extractor.failed.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} + summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.extractor.failed.backup + + - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.failed.backup consumer group lag + expr: sum without(partition) (kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.failed.backup", job="processing-kafka-exporter"}) > {{ .Values.kafka_telemetry_failed_backup_threshold }} + for: 5m + labels: + severity: critical + module: dp + annotations: + message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.failed.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} + summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.failed.backup + + - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.raw.backup consumer group lag + expr: sum without(partition) (kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.raw.backup", job="processing-kafka-exporter"}) > {{ .Values.kafka_telemetry_raw_backup_threshold }} + for: 5m + labels: + severity: critical + module: dp + annotations: + message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.raw.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} + summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.raw.backup + + - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.duplicate.backup consumer group lag + expr: sum without(partition) (kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.duplicate.backup", job="processing-kafka-exporter"}) > {{ .Values.kafka_telemetry_duplicate_backup_threshold }} + for: 5m + labels: + 
severity: critical + module: dp + annotations: + message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.duplicate.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} + summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.duplicate.backup + + - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.unique.backup consumer group lag + expr: sum without(partition) (kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.unique.backup", job="processing-kafka-exporter"}) > {{ .Values.kafka_telemetry_unique_backup_threshold }} + for: 5m + labels: + severity: critical + module: dp + annotations: + message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.unique.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} + summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.unique.backup + + - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.ingest.backup consumer group lag + expr: sum without(partition) (kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.ingest.backup", job="processing-kafka-exporter"}) > {{ .Values.kafka_telemetry_ingestion_backup_threshold }} + for: 5m + labels: + severity: critical + module: dp + annotations: + message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.ingest.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} + summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.ingest.backup + + + - alert: secor {{ .Values.kafka_topic_prefix }}.events.device.profile.backup group lag + expr: sum without(partition) (kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.events.device.profile.backup", job="processing-kafka-exporter"}) > {{ .Values.kafka_events_deviceprofile_backup_threshold }} + for: 5m + labels: 
+ severity: critical + module: dp + annotations: + message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.events.device.profile.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} + summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.events.device.profile.backup + + - alert: secor {{ .Values.kafka_topic_prefix }}.failed.learning.events.backup consumer group lag + expr: sum without(partition) (kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.failed.learning.events.backup", job="processing-kafka-exporter"}) > {{ .Values.kafka_learning_failed_events_backup }} + for: 5m + labels: + severity: critical + module: dp + annotations: + message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.failed.learning.events.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} + summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.failed.learning.events.backup + + - alert: secor {{ .Values.kafka_topic_prefix }}.learning.graph.events.backup consumer group lag + expr: sum without(partition) (kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.learning.graph.events.backup", job="processing-kafka-exporter"}) > {{ .Values.kafka_graph_events_backup_threshold }} + for: 5m + labels: + severity: critical + module: dp + annotations: + message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.learning.graph.events.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} + summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.learning.graph.events.backup + + - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.assess.events.backup consumer group lag + expr: sum without(partition) (kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.assess.events.backup", job="processing-kafka-exporter"}) > {{ 
.Values.kafka_telemetry_assess_backup }} + for: 5m + labels: + severity: critical + module: dp + annotations: + message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.assess consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} + summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.assess.events.backup + + - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.ingestion.events.backup consumer group lag + expr: sum without(partition) (kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.ingestion.events.backup", job="processing-kafka-exporter"}) > {{ .Values.kafka_telemetry_ingestion_backup_threshold }} + for: 5m + labels: + severity: critical + module: dp + annotations: + message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.ingestion.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} + summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.ingestion.events.backup + + - alert: secor {{ .Values.kafka_topic_prefix }}.events.device.profile.backup group lag + expr: sum without(partition) (kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.events.device.profile.backup", job="ingestion-kafka-exporter"}) > {{ .Values.kafka_events_deviceprofile_backup_threshold }} + for: 5m + labels: + severity: critical + module: dp + annotations: + message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.events.device.profile.backup consumer group lag is {{$value}} for partition: {{ $labels.partition }}"`}} + summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.events.device.profile.backup + + - alert: secor {{ .Values.kafka_topic_prefix }}.failed.learning.events.backup consumer group lag + expr: sum without(partition) (kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.failed.learning.events.backup", 
job="ingestion-kafka-exporter"}) > {{ .Values.kafka_learning_failed_events_backup }} + for: 5m + labels: + severity: critical + module: dp + annotations: # expr sums lag across partitions, so the message reports the total (no partition label survives the aggregation) + message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.failed.learning.events.backup consumer group lag is {{$value}}"`}} + summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.failed.learning.events.backup + + - alert: secor {{ .Values.kafka_topic_prefix }}.learning.graph.events.backup consumer group lag + expr: sum without(partition) (kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.learning.graph.events.backup", job="ingestion-kafka-exporter"}) > {{ .Values.kafka_graph_events_backup_threshold }} + for: 5m + labels: + severity: critical + module: dp + annotations: # expr sums lag across partitions, so the message reports the total (no partition label survives the aggregation) + message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.learning.graph.events.backup consumer group lag is {{$value}}"`}} + summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.learning.graph.events.backup + + - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.assess.events.backup consumer group lag + expr: sum without(partition) (kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix }}.telemetry.assess.events.backup", job="ingestion-kafka-exporter"}) > {{ .Values.kafka_telemetry_assess_backup }} + for: 5m + labels: + severity: critical + module: dp + annotations: # expr sums lag across partitions, so the message reports the total (no partition label survives the aggregation) + message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.assess.events.backup consumer group lag is {{$value}}"`}} + summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.assess.events.backup + + - alert: secor {{ .Values.kafka_topic_prefix }}.telemetry.ingestion.events.backup consumer group lag + expr: sum without(partition) (kafka_consumergroupzookeeper_lag_zookeeper{consumergroup="{{ .Values.kafka_topic_prefix 
}}.telemetry.ingestion.events.backup", job="ingestion-kafka-exporter"}) > {{ .Values.kafka_telemetry_ingestion_backup_threshold }} + for: 5m + labels: + severity: critical + module: dp + annotations: # expr sums lag across partitions, so the message reports the total (no partition label survives the aggregation) + message: {{`"`}}{{ .Values.kafka_topic_prefix }}{{`.telemetry.ingestion.events.backup consumer group lag is {{$value}}"`}} + summary: secor consumer group lag is more for {{ .Values.kafka_topic_prefix }}.telemetry.ingestion.events.backup diff --git a/kubernetes/helm_charts/monitoring/alertrules/values.yaml b/kubernetes/helm_charts/monitoring/alertrules/values.yaml index 67d9576a8d..d5507fbbc2 100644 --- a/kubernetes/helm_charts/monitoring/alertrules/values.yaml +++ b/kubernetes/helm_charts/monitoring/alertrules/values.yaml @@ -44,5 +44,5 @@ node_disk_usage_percentage_threshold_Critical: 85 node_disk_usage_percentage_threshold_Fatal: 95 #secor job count -secor_job_count: 13 -processingsecor_job_count: 11 +secor_job_count: 15 +processingsecor_job_count: 15 diff --git a/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/.helmignore b/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/.helmignore new file mode 100644 index 0000000000..50af031725 --- /dev/null +++ b/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/.helmignore @@ -0,0 +1,22 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/Chart.yaml b/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/Chart.yaml new file mode 100644 index 0000000000..58c57ab2ca --- /dev/null +++ b/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/Chart.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +appVersion: "1.0" +description: A Helm chart that deploys kafka-exporter for the ingestion Kafka cluster +name: ingestion-kafka-exporter +version: 1.0.0 +home: https://github.com/abhishekjiitr/kafka-exporter-helm +maintainers: + - name: abhishekjiitr + email: abhi2254015@gmail.com diff --git a/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/templates/NOTES.txt b/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/templates/NOTES.txt new file mode 100644 index 0000000000..d9eb9e809b --- /dev/null +++ b/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/templates/NOTES.txt @@ -0,0 +1,5 @@ +1. To see the metrics: +{{- if contains "ClusterIP" .Values.service.type }} + kubectl port-forward svc/{{ include "kafka-exporter.fullname" . }} {{ .Values.service.port }} + echo "Visit http://127.0.0.1:{{ .Values.service.port }} to use your application" +{{- end }} diff --git a/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/templates/_helpers.tpl b/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/templates/_helpers.tpl new file mode 100644 index 0000000000..bc51bbfcd5 --- /dev/null +++ b/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/templates/_helpers.tpl @@ -0,0 +1,32 @@ +{{/* vim: set filetype=mustache: */}} +{{/* +Expand the name of the chart. +*/}} +{{- define "kafka-exporter.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. 
+We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +A non-empty fullnameOverride wins; otherwise the release name alone is used when it already contains the chart name (or nameOverride), else "<release name>-<name>". +*/}} +{{- define "kafka-exporter.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* +Build the "<chart name>-<chart version>" string used for the helm.sh/chart label; any "+" is replaced with "_" and the result is truncated to 63 chars. +*/}} +{{- define "kafka-exporter.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} diff --git a/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/templates/alertRules.yaml b/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/templates/alertRules.yaml new file mode 100644 index 0000000000..f96d83b235 --- /dev/null +++ b/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/templates/alertRules.yaml @@ -0,0 +1,52 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ include "kafka-exporter.fullname" . }} + {{- if .Values.prometheus.serviceMonitor.namespace }} + namespace: {{ .Values.prometheus.serviceMonitor.namespace }} + {{- end }} + labels: + app.kubernetes.io/name: {{ include "kafka-exporter.name" . }} + helm.sh/chart: {{ include "kafka-exporter.chart" . 
}} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + {{- if .Values.prometheus.serviceMonitor.additionalLabels }} +{{ toYaml .Values.prometheus.serviceMonitor.additionalLabels | indent 4 -}} + {{- end }} +spec: + groups: + - name: alertrules.kafkaExporter + rules: + - record: isr_partition_difference # in-sync replicas minus assigned replicas per partition; __consumer_offsets and __samza_ prefixed topics are excluded (prefix exclusion needs the regex matcher) + expr: kafka_topic_partition_in_sync_replica{topic!="__consumer_offsets",topic!~"__samza_.*"} - kafka_topic_partition_replicas{topic!="__consumer_offsets",topic!~"__samza_.*"} + + - alert: ISR_not_equal_to_partition # fires when the recorded difference stays non-zero for 5m + annotations: + message: {{`"ISR and replication difference for {{ $labels.topic }} is > 0 current value: {{ humanize $value }}"`}} + summary: {{`"ISR and Replica Mismatch for {{$labels.topic}}"`}} + expr: isr_partition_difference != 0 + for: 5m + labels: + severity: critical + module: dp + + - alert: Kafka_partition_leader_change_rapidly # the leader metric is a gauge holding a broker id, so value changes are counted instead of taking a counter rate + annotations: + message: {{`"Kafka Partition leader changing rapidly for {{ $labels.topic }}"`}} + summary: {{`"Kafka Partition leader changing rapidly for {{ $labels.topic }}"`}} + expr: changes(kafka_topic_partition_leader{topic!="__consumer_offsets",topic!~"__samza_.*"}[5m]) > 0 + for: 5m + labels: + severity: critical + module: dp + + - alert: kafka_broker_unavailable # expected broker count is the number of configured kafka servers at render time + annotations: + summary: {{`"Kafka brokers unavailable"`}} + message: "There are only {{`{{humanize $value}}`}} kafka brokers available; Expected count: {{len .Values.kafkaExporter.kafka.servers}}" + expr: kafka_brokers < {{ len .Values.kafkaExporter.kafka.servers }} + for: 5m + labels: + severity: critical + module: dp diff --git a/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/templates/deployment.yaml b/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/templates/deployment.yaml new file mode 100644 index 0000000000..d79b57b019 --- /dev/null +++ b/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/templates/deployment.yaml @@ -0,0 +1,91 @@ +apiVersion: apps/v1 +kind: 
Deployment +metadata: + name: {{ include "kafka-exporter.fullname" . }} + labels: + app.kubernetes.io/name: {{ include "kafka-exporter.name" . }} + helm.sh/chart: {{ include "kafka-exporter.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} +spec: + replicas: {{ .Values.replicaCount }} + selector: # selects pods by the same name/instance label pair that the pod template below sets + matchLabels: + app.kubernetes.io/name: {{ include "kafka-exporter.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + template: + metadata: + labels: + app.kubernetes.io/name: {{ include "kafka-exporter.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + spec: + containers: # single exporter container; its command-line flags are assembled from .Values.kafkaExporter + - name: {{ .Chart.Name }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + args: + {{- if .Values.kafkaExporter}} + {{- range .Values.kafkaExporter.kafka.servers }} + - "--kafka.server={{ . }}" + {{- end }} + {{- range .Values.kafkaExporter.zookeeper.servers }} + - "--zookeeper.server={{ . }}" + {{- end }} + {{- range .Values.kafkaExporter.additionalFlags }} + - "{{ . 
}}" + {{- end }} + {{- if .Values.kafkaExporter.kafka.version }} + - --kafka.version={{ .Values.kafkaExporter.kafka.version }} + {{- end }} + {{- end}} + {{- if .Values.kafkaExporter.sasl.enabled }} + - --sasl.enabled # NOTE(review): the credentials below end up in the pod spec args; they are rendered through quote so special characters survive YAML parsing + {{- if not .Values.kafkaExporter.sasl.handshake }} + - --sasl.handshake=false + {{- end }} + - {{ printf "--sasl.username=%v" .Values.kafkaExporter.sasl.username | quote }} + - {{ printf "--sasl.password=%v" .Values.kafkaExporter.sasl.password | quote }} + {{- end }} + {{- if .Values.kafkaExporter.tls.enabled}} + - --tls.enabled + - --tls.ca-file=/etc/tls-certs/ca-file + - --tls.cert-file=/etc/tls-certs/cert-file + - --tls.key-file=/etc/tls-certs/key-file + {{- end }} + {{- if .Values.kafkaExporter.log }} + - --log.level={{ .Values.kafkaExporter.log.level }} + {{- if .Values.kafkaExporter.log.enableSarama }} + - --log.enable-sarama + {{- end }} + {{- end }} + ports: + - name: metrics # port name referenced by the Service targetPort and the ServiceMonitor endpoint + containerPort: 9308 + protocol: TCP + {{- if .Values.kafkaExporter.tls.enabled }} + volumeMounts: + - name: tls-certs + mountPath: "/etc/tls-certs/" + readOnly: true + {{- end }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + {{- with .Values.nodeSelector }} + + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- if .Values.kafkaExporter.tls.enabled }} + volumes: + - name: tls-certs + secret: + secretName: {{ include "kafka-exporter.fullname" . 
}} + {{- end }} diff --git a/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/templates/secret.yaml b/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/templates/secret.yaml new file mode 100644 index 0000000000..82f567f38d --- /dev/null +++ b/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/templates/secret.yaml @@ -0,0 +1,15 @@ +{{- if .Values.kafkaExporter.tls.enabled }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "kafka-exporter.fullname" . }} + labels: + app.kubernetes.io/name: {{ include "kafka-exporter.name" . }} + helm.sh/chart: {{ include "kafka-exporter.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} +data: # key names match the file names the Deployment passes as /etc/tls-certs/ca-file, cert-file and key-file + ca-file: {{ .Values.kafkaExporter.tls.caFile | b64enc }} + cert-file: {{ .Values.kafkaExporter.tls.certFile | b64enc }} + key-file: {{ .Values.kafkaExporter.tls.keyFile | b64enc }} +{{- end }} diff --git a/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/templates/service.yaml b/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/templates/service.yaml new file mode 100644 index 0000000000..049041fb03 --- /dev/null +++ b/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/templates/service.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "kafka-exporter.fullname" . }} + labels: + app.kubernetes.io/name: {{ include "kafka-exporter.name" . }} + helm.sh/chart: {{ include "kafka-exporter.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: metrics # named container port declared in the Deployment template + protocol: TCP + name: metrics + selector: # same name/instance label pair the Deployment sets on its pods + app.kubernetes.io/name: {{ include "kafka-exporter.name" . 
}} + app.kubernetes.io/instance: {{ .Release.Name }} diff --git a/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/templates/servicemonitor.yaml b/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/templates/servicemonitor.yaml new file mode 100644 index 0000000000..395d617dd3 --- /dev/null +++ b/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/templates/servicemonitor.yaml @@ -0,0 +1,33 @@ +{{- if .Values.prometheus.serviceMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "kafka-exporter.fullname" . }} + {{- if .Values.prometheus.serviceMonitor.namespace }} + namespace: {{ .Values.prometheus.serviceMonitor.namespace }} + {{- end }} + labels: + app.kubernetes.io/name: {{ include "kafka-exporter.name" . }} + helm.sh/chart: {{ include "kafka-exporter.chart" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + {{- if .Values.prometheus.serviceMonitor.additionalLabels }} +{{ toYaml .Values.prometheus.serviceMonitor.additionalLabels | indent 4 -}} + {{- end }} +spec: + jobLabel: jobLabel # NOTE(review): the Service template in this chart sets no label named jobLabel - confirm the resulting Prometheus job name matches the job matchers used by the secor alert rules + selector: # matches the name, instance and chart labels that the Service template sets + matchLabels: + app.kubernetes.io/name: {{ include "kafka-exporter.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + helm.sh/chart: {{ include "kafka-exporter.chart" . 
}} + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + endpoints: + - port: metrics + interval: {{ .Values.prometheus.serviceMonitor.interval }} + {{- if .Values.prometheus.serviceMonitor.scrapeTimeout }} + scrapeTimeout: {{ .Values.prometheus.serviceMonitor.scrapeTimeout }} + {{- end }} +{{- end }} diff --git a/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/values.yaml b/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/values.yaml new file mode 100644 index 0000000000..4292f73508 --- /dev/null +++ b/kubernetes/helm_charts/monitoring/ingestion-kafka-exporter/values.yaml @@ -0,0 +1,69 @@ +# Default values for kafka-exporter. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +replicaCount: 1 + +image: + repository: danielqsj/kafka-exporter + tag: latest # NOTE(review): floating tag combined with pullPolicy IfNotPresent - consider pinning a released version so every node runs the same image + pullPolicy: IfNotPresent + +nameOverride: "" +fullnameOverride: "" + +service: + type: ClusterIP + port: 9308 + +kafkaExporter: + kafka: + servers: [] # each entry becomes one --kafka.server flag; the list length is also the expected broker count in the kafka_broker_unavailable alert + zookeeper: + servers: [] # each entry becomes one --zookeeper.server flag + additionalFlags: [] # each entry is appended to the container args as-is + # - --use.consumelag.zookeeper + + sasl: + enabled: false + handshake: true # when false, --sasl.handshake=false is passed + username: "" + password: "" + + tls: + enabled: false # when true, the Secret is rendered and mounted at /etc/tls-certs/ + insecure-skip-tls-verify: false # NOTE(review): no template shown in this chart reads this key - confirm whether it is meant to have an effect + caFile: "" + certFile: "" + keyFile: "" + + log: + level: info + enableSarama: false + +prometheus: + serviceMonitor: + enabled: true + namespace: monitoring + interval: "30s" + additionalLabels: + app: kafka-exporter + + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. + # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +nodeSelector: {} + +tolerations: [] + +affinity: {}