From 484946deeab2cd50f11cadeacb9566791f5ed503 Mon Sep 17 00:00:00 2001
From: mphanias <121247041+mphanias@users.noreply.github.com>
Date: Tue, 7 Nov 2023 12:01:55 +0530
Subject: [PATCH] OM130 - 7.0 alert rule changes (#82)

Enhanced alerts to include Aerospike 7.0 metrics:
- added 5 alerts in the namespace group
- added 2 alerts in the sets group

Updated the alert rules template and the template JSON values for the new variables.
---
 config/prometheus/aerospike_rules.yml          | 115 ++++++++++++++----
 .../templates/aerospike_alert_rules.template   | 101 ++++++++++++---
 .../templates/alert_config_data.json           |   5 +-
 3 files changed, 175 insertions(+), 46 deletions(-)

diff --git a/config/prometheus/aerospike_rules.yml b/config/prometheus/aerospike_rules.yml
index 69f0258..4da8a53 100644
--- a/config/prometheus/aerospike_rules.yml
+++ b/config/prometheus/aerospike_rules.yml
@@ -127,7 +127,7 @@ groups:
           description: "Active proxies detected for {{ $labels.ns }} on node {{ $labels.instance }}"
 
       - alert: NamespaceSupervisorFallingBehind
-        expr: aerospike_namespace_nsup_cycle_deleted_pct{job="aerospike" } > 1 # (Aerospike 6.3 and later)
+        expr: aerospike_namespace_objects{job="aerospike"}>0 and aerospike_namespace_nsup_cycle_deleted_pct{job="aerospike" } > 1 # (Aerospike 6.3 and later)
         for: 30s
         labels:
           severity: critical
@@ -163,7 +163,7 @@ groups:
           description: "Device available has dropped below 25% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate that defrag is unable to keep up with the current load, and may result in stop writes if it continues to drop."
 
       - alert: ClientTimeouts
-        expr: rate(aerospike_namespace_client_read_timeout{job="aerospike"}[1m]) > 1 or rate(aerospike_namespace_client_write_timeout{job="aerospike" }[1m]) > 1 or rate(aerospike_namespace_client_tsvc_timeout{job="aerospike" }[1m]) > 1
+        expr: rate(aerospike_namespace_client_read_timeout{job="aerospike" }[1m]) > 1 or rate(aerospike_namespace_client_write_timeout{job="aerospike" }[1m]) > 1 or rate(aerospike_namespace_client_tsvc_timeout{job="aerospike" }[1m]) > 1
         for: 1m
         labels:
           severity: critical
@@ -243,6 +243,51 @@ groups:
           summary: "There are unavailable partitions, but all roster nodes are present in the cluster."
           description: "Some partitions are dead for namespace {{ $labels.ns }} on node {{ $labels.instance }}. Greater than replication-factor number nodes had an unclean shutdown, and there may be data loss. Will require the use of the revive command to make the partitions available again."
 
+      - alert: NamespaceDataCloseToStopWrites
+        expr: (aerospike_namespace_data_avail_pct{job="aerospike" } - aerospike_namespace_storage_engine_stop_writes_avail_pct{job="aerospike" }) <= 10
+        for: 30s
+        labels:
+          severity: warn
+        annotations:
+          summary: "Close to stop writes for {{ $labels.instance }}/{{ $labels.ns }} due to data_avail_pct"
+          description: "data_avail_pct for namespace {{ $labels.ns }} in node {{ $labels.instance }} is close to stop-writes-avail-pct limit."
+
+      - alert: LowDataAvailWarning
+        expr: aerospike_namespace_data_avail_pct{job="aerospike" } < 55
+        for: 30s
+        labels:
+          severity: warn
+        annotations:
+          summary: "Device available warning for {{ $labels.instance }}/{{ $labels.ns }}"
+          description: "Device available has dropped below 55% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate that defrag is unable to keep up with the current load, and may result in stop writes if it continues to drop."
+
+      - alert: LowDataAvailCritical
+        expr: aerospike_namespace_data_avail_pct{job="aerospike" } < 25
+        for: 30s
+        labels:
+          severity: critical
+        annotations:
+          summary: "Device available critically low for {{ $labels.instance }}/{{ $labels.ns }}"
+          description: "Device available has dropped below 25% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate that defrag is unable to keep up with the current load, and may result in stop writes if it continues to drop."
+
+      - alert: HighDataUseNamespaceWarning
+        expr: aerospike_namespace_data_used_pct{job="aerospike" , storage_engine="memory" } > 80
+        for: 30s
+        labels:
+          severity: warn
+        annotations:
+          summary: "Data utilization warning for {{ $labels.instance }}/{{ $labels.ns }}"
+          description: "Data used has crossed above 80% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate a need to reduce the object count or increase capacity."
+
+      - alert: HighDataUseNamespaceCritical
+        expr: aerospike_namespace_data_used_pct{job="aerospike" , storage_engine="memory" } > 85
+        for: 30s
+        labels:
+          severity: critical
+        annotations:
+          summary: "Data utilization critically high for {{ $labels.instance }}/{{ $labels.ns }}"
+          description: "Data used has crossed above 85% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate a need to reduce the object count or increase capacity."
+
   - name: aerospike_aerospike.rules > NODE
     rules:
       - alert: PrometheusNodeExporterNotPresent
@@ -286,7 +331,7 @@ groups:
         labels:
           severity: critical
         annotations:
-          summary: "Client connections warning"
+          summary: "Client connections critical"
           description: "Client connections are greater than expected peak of 10000."
 
       - alert: ClientConnectionChurn
@@ -355,24 +400,42 @@ groups:
 
   - name: aerospike_aerospike.rules > SET
     rules:
-      - alert: NamespaceSetQuotaWarning
-        expr: (((aerospike_sets_device_data_bytes{job="aerospike" } + aerospike_sets_memory_data_bytes{job="aerospike" }) / (aerospike_sets_stop_writes_size{job="aerospike" } != 0)) * 100) > 80
-        for: 30s
-        labels:
-          severity: critical
-        annotations:
-          description: "Nearing memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}."
-          summary: "One of your nodes is at % of the quota you have configured on the set."
+      - alert: pre7x_NamespaceSetQuotaWarning
+        expr: (((aerospike_sets_device_data_bytes{job="aerospike" } + aerospike_sets_memory_data_bytes{job="aerospike" }) / (aerospike_sets_stop_writes_size{job="aerospike" } != 0)) * 100) > 80
+        for: 30s
+        labels:
+          severity: warn
+        annotations:
+          description: "Nearing memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}."
+          summary: "One of your nodes is at % of the quota you have configured on the set."
 
-      - alert: NamespaceSetQuotaAlert
-        expr: (((aerospike_sets_device_data_bytes{job="aerospike" } + aerospike_sets_memory_data_bytes{job="aerospike" }) / (aerospike_sets_stop_writes_size{job="aerospike" } != 0)) * 100) > 99
-        for: 30s
-        labels:
-          severity: critical
-        annotations:
-          description: "At or Above memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}."
-          summary: "One of your nodes is at % of the quota you have configured on the set."
+      - alert: pre7x_NamespaceSetQuotaAlertCritical
+        expr: (((aerospike_sets_device_data_bytes{job="aerospike" } + aerospike_sets_memory_data_bytes{job="aerospike" }) / (aerospike_sets_stop_writes_size{job="aerospike" } != 0)) * 100) > 99
+        for: 30s
+        labels:
+          severity: critical
+        annotations:
+          description: "At or Above memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}."
+          summary: "One of your nodes is at % of the quota you have configured on the set."
+      - alert: NamespaceSetQuotaWarning
+        expr: (((aerospike_sets_data_used_bytes{job="aerospike" } ) / (aerospike_sets_stop_writes_size{job="aerospike" } != 0)) * 100) > 80
+        for: 30s
+        labels:
+          severity: warn
+        annotations:
+          description: "Nearing memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}."
+          summary: "One of your nodes is at % of the quota you have configured on the set."
+
+      - alert: NamespaceSetQuotaAlertCritical
+        expr: (((aerospike_sets_data_used_bytes{job="aerospike" } ) / (aerospike_sets_stop_writes_size{job="aerospike" } != 0)) * 100) > 99
+        for: 30s
+        labels:
+          severity: critical
+        annotations:
+          description: "At or Above memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}."
+          summary: "One of your nodes is at % of the quota you have configured on the set."
+
 
   - name: aerospike_aerospike.rules > LATENCIES
     rules:
       - alert: ReadLatencyP95Warning
@@ -434,7 +497,7 @@ groups:
     rules:
 
       - alert: XDRTimelag
-        expr: aerospike_xdr_lag{job="aerospike" } > 5 
+        expr: aerospike_xdr_lag{job="aerospike" } > 5
         for: 2m
         labels:
           severity: warn
@@ -450,7 +513,7 @@ groups:
           summary: "Abandoned records detected for XDR on node {{ $labels.instance }} to DC {{ $labels.dc }}"
           description: "Records abandoned at a destination cluster may indicate a configuration mismatch for the namespace between source and destination."
       - alert: XDRRetryNoNode
-        expr: rate(aerospike_xdr_retry_no_node{job="aerospike" , }[1m]) > 0
+        expr: rate(aerospike_xdr_retry_no_node{job="aerospike" }[1m]) > 0
         for: 30s
         labels:
           severity: warn
@@ -459,7 +522,7 @@ groups:
           description: "XDR cannot determine which destination node is the master."
 
       - alert: XDRRetryConnReset
-        expr: rate(aerospike_xdr_retry_conn_reset{job="aerospike" , }[1m]) > 2
+        expr: rate(aerospike_xdr_retry_conn_reset{job="aerospike" }[1m]) > 2
         for: 2m
         labels:
           severity: warn
@@ -468,7 +531,7 @@ groups:
           description: "XDR retries occurring due to timeouts, network problems, or destination node restarts."
 
       - alert: XDRRetryDest
-        expr: rate(aerospike_xdr_retry_dest{job="aerospike" ,}[1m]) > 5
+        expr: rate(aerospike_xdr_retry_dest{job="aerospike" }[1m]) > 5
         for: 2m
         labels:
          severity: warn
@@ -477,7 +540,7 @@ groups:
          description: "XDR retries due to errors returned by the destination node, i.e. key busy or device overload."
 
       - alert: XDRLatencyWarning
-        expr: aerospike_xdr_latency_ms{job="aerospike" , } > 100
+        expr: aerospike_xdr_latency_ms{job="aerospike" } > 100
         for: 30s
         labels:
           severity: warn
@@ -486,7 +549,7 @@ groups:
           description: "Network latency between XDR source and destination over the last 30s is higher than expected."
 
       - alert: XDRLap
-        expr: aerospike_xdr_lap_us{job="aerospike" , } > 75000
+        expr: aerospike_xdr_lap_us{job="aerospike" } > 75000
         for: 30s
         labels:
           severity: warn
@@ -495,7 +558,7 @@ groups:
           description: "The XDR processing cycle time (lap_us) is approaching the configured period-ms value."
 
       - alert: XDRRecoveries
-        expr: increase(aerospike_xdr_recoveries{job="aerospike" , }[1m]) > 0
+        expr: increase(aerospike_xdr_recoveries{job="aerospike" }[1m]) > 0
         for: 2m
         labels:
           severity: critical
diff --git a/config/prometheus/templates/aerospike_alert_rules.template b/config/prometheus/templates/aerospike_alert_rules.template
index 63f6efd..90b0d18 100644
--- a/config/prometheus/templates/aerospike_alert_rules.template
+++ b/config/prometheus/templates/aerospike_alert_rules.template
@@ -19,7 +19,7 @@ groups:
           summary: "{% raw %}Node {{ $labels.instance }} down{% endraw %}"
           description: "{% raw %}{{ $labels.instance }} node is down.{% endraw %}"
 
-{% for thresholds in alertmanager_aerospike_metric_thresholds %}
+{% for thresholds in alertmanager_aerospike_metric_cluster_thresholds %}
   - name: aerospike_{{ thresholds.cluster }}.rules > NAMESPACE
     rules:
       - alert: NamespaceStopWrites
@@ -127,7 +127,7 @@ groups:
           description: "{% raw %}Active proxies detected for {{ $labels.ns }} on node {{ $labels.instance }}{% endraw %}"
 
       - alert: NamespaceSupervisorFallingBehind
-        expr: aerospike_namespace_nsup_cycle_deleted_pct{job="aerospike", cluster_name="{{ thresholds.cluster }}"} > 1 # (Aerospike 6.3 and later)
+        expr: aerospike_namespace_objects{job="aerospike", cluster_name="{{ thresholds.cluster }}"}>0 and aerospike_namespace_nsup_cycle_deleted_pct{job="aerospike", cluster_name="{{ thresholds.cluster }}"} > 1 # (Aerospike 6.3 and later)
         for: {{ thresholds.aerospike_nsup_fall_behind_duration | default(aerospike_nsup_fall_behind_duration) }}
         labels:
           severity: critical
@@ -243,6 +243,51 @@ groups:
           summary: "There are unavailable partitions, but all roster nodes are present in the cluster."
           description: "{% raw %}Some partitions are dead for namespace {{ $labels.ns }} on node {{ $labels.instance }}. Greater than replication-factor number nodes had an unclean shutdown, and there may be data loss. Will require the use of the revive command to make the partitions available again.{% endraw %}"
 
+      - alert: NamespaceDataCloseToStopWrites
+        expr: (aerospike_namespace_data_avail_pct{job="aerospike", cluster_name="{{ thresholds.cluster }}" } - aerospike_namespace_storage_engine_stop_writes_avail_pct{job="aerospike", cluster_name="{{ thresholds.cluster }}" }) <= 10
+        for: {{ thresholds.aerospike_namespace_stop_writes_duration | default(aerospike_namespace_stop_writes_duration) }}
+        labels:
+          severity: warn
+        annotations:
+          summary: "{% raw %}Close to stop writes for {{ $labels.instance }}/{{ $labels.ns }} due to data_avail_pct{% endraw %}"
+          description: "{% raw %}data_avail_pct for namespace {{ $labels.ns }} in node {{ $labels.instance }} is close to stop-writes-avail-pct limit.{% endraw %}"
+
+      - alert: LowDataAvailWarning
+        expr: aerospike_namespace_data_avail_pct{job="aerospike", cluster_name="{{ thresholds.cluster }}" } < 55
+        for: {{ thresholds.aerospike_low_data_avail_warning_duration | default(aerospike_low_data_avail_warning_duration) }}
+        labels:
+          severity: warn
+        annotations:
+          summary: "{% raw %}Device available warning for {{ $labels.instance }}/{{ $labels.ns }}{% endraw %}"
+          description: "{% raw %}Device available has dropped below 55% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate that defrag is unable to keep up with the current load, and may result in stop writes if it continues to drop.{% endraw %}"
+
+      - alert: LowDataAvailCritical
+        expr: aerospike_namespace_data_avail_pct{job="aerospike", cluster_name="{{ thresholds.cluster }}" } < 25
+        for: {{ thresholds.aerospike_low_data_avail_warning_duration | default(aerospike_low_data_avail_warning_duration) }}
+        labels:
+          severity: critical
+        annotations:
+          summary: "{% raw %}Device available critically low for {{ $labels.instance }}/{{ $labels.ns }}{% endraw %}"
+          description: "{% raw %}Device available has dropped below 25% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate that defrag is unable to keep up with the current load, and may result in stop writes if it continues to drop.{% endraw %}"
+
+      - alert: HighDataUseNamespaceWarning
+        expr: aerospike_namespace_data_used_pct{job="aerospike", cluster_name="{{ thresholds.cluster }}", storage_engine="memory" } > 80
+        for: {{ thresholds.aerospike_high_data_namespace_warning_duration | default(aerospike_high_data_namespace_warning_duration) }}
+        labels:
+          severity: warn
+        annotations:
+          summary: "{% raw %}Data utilization warning for {{ $labels.instance }}/{{ $labels.ns }}{% endraw %}"
+          description: "{% raw %}Data used has crossed above 80% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate a need to reduce the object count or increase capacity.{% endraw %}"
+
+      - alert: HighDataUseNamespaceCritical
+        expr: aerospike_namespace_data_used_pct{job="aerospike", cluster_name="{{ thresholds.cluster }}", storage_engine="memory" } > 85
+        for: {{ thresholds.aerospike_high_data_namespace_warning_duration | default(aerospike_high_data_namespace_warning_duration) }}
+        labels:
+          severity: critical
+        annotations:
+          summary: "{% raw %}Data utilization critically high for {{ $labels.instance }}/{{ $labels.ns }}{% endraw %}"
+          description: "{% raw %}Data used has crossed above 85% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate a need to reduce the object count or increase capacity.{% endraw %}"
+
   - name: aerospike_{{ thresholds.cluster }}.rules > NODE
     rules:
       - alert: PrometheusNodeExporterNotPresent
@@ -286,7 +331,7 @@ groups:
         labels:
           severity: critical
         annotations:
-          summary: "Client connections warning"
+          summary: "Client connections critical"
           description: "Client connections are greater than expected peak of {{ thresholds.client_connections_critical | default(aerospike_client_connections_critical_level) }}."
 
       - alert: ClientConnectionChurn
@@ -355,24 +400,42 @@ groups:
 
   - name: aerospike_{{ thresholds.cluster }}.rules > SET
     rules:
-      - alert: NamespaceSetQuotaWarning
-        expr: (((aerospike_sets_device_data_bytes{job="aerospike", cluster_name="{{ thresholds.cluster }}"} + aerospike_sets_memory_data_bytes{job="aerospike", cluster_name="{{ thresholds.cluster }}"}) / (aerospike_sets_stop_writes_size{job="aerospike", cluster_name="{{ thresholds.cluster }}"} != 0)) * 100) > {{ thresholds.aerospike_set_quota_warning_pct | default(aerospike_set_quota_warning_pct) }}
-        for: {{ thresholds.aerospike_set_quota_duration | default(aerospike_set_quota_duration) }}
-        labels:
-          severity: critical
-        annotations:
-          description: "{% raw %}Nearing memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}.{% endraw %}"
-          summary: "{% raw %}One of your nodes is at{% endraw %} {{ thresholds.aerospike_set_quota_warning_pct }} {% raw %}% of the quota you have configured on the set.{% endraw %}"
+      - alert: pre7x_NamespaceSetQuotaWarning
+        expr: (((aerospike_sets_device_data_bytes{job="aerospike", cluster_name="{{ thresholds.cluster }}"} + aerospike_sets_memory_data_bytes{job="aerospike", cluster_name="{{ thresholds.cluster }}"}) / (aerospike_sets_stop_writes_size{job="aerospike", cluster_name="{{ thresholds.cluster }}"} != 0)) * 100) > {{ thresholds.aerospike_set_quota_warning_pct | default(aerospike_set_quota_warning_pct) }}
+        for: {{ thresholds.aerospike_set_quota_duration | default(aerospike_set_quota_duration) }}
+        labels:
+          severity: warn
+        annotations:
+          description: "{% raw %}Nearing memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}.{% endraw %}"
+          summary: "{% raw %}One of your nodes is at{% endraw %} {{ thresholds.aerospike_set_quota_warning_pct }} {% raw %}% of the quota you have configured on the set.{% endraw %}"
 
-      - alert: NamespaceSetQuotaAlert
-        expr: (((aerospike_sets_device_data_bytes{job="aerospike", cluster_name="{{ thresholds.cluster }}"} + aerospike_sets_memory_data_bytes{job="aerospike", cluster_name="{{ thresholds.cluster }}"}) / (aerospike_sets_stop_writes_size{job="aerospike", cluster_name="{{ thresholds.cluster }}"} != 0)) * 100) > {{ thresholds.aerospike_set_quota_alert_pct | default(aerospike_set_quota_alert_pct) }}
-        for: {{ thresholds.aerospike_set_quota_duration | default(aerospike_set_quota_duration) }}
-        labels:
-          severity: critical
-        annotations:
-          description: "{% raw %}At or Above memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}.{% endraw %}"
-          summary: "{% raw %}One of your nodes is at{% endraw %} {{ thresholds.aerospike_set_quota_alert_pct }}{% raw %}% of the quota you have configured on the set.{% endraw %}"
+      - alert: pre7x_NamespaceSetQuotaAlertCritical
+        expr: (((aerospike_sets_device_data_bytes{job="aerospike", cluster_name="{{ thresholds.cluster }}"} + aerospike_sets_memory_data_bytes{job="aerospike", cluster_name="{{ thresholds.cluster }}"}) / (aerospike_sets_stop_writes_size{job="aerospike", cluster_name="{{ thresholds.cluster }}"} != 0)) * 100) > {{ thresholds.aerospike_set_quota_alert_pct | default(aerospike_set_quota_alert_pct) }}
+        for: {{ thresholds.aerospike_set_quota_duration | default(aerospike_set_quota_duration) }}
+        labels:
+          severity: critical
+        annotations:
+          description: "{% raw %}At or Above memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}.{% endraw %}"
+          summary: "{% raw %}One of your nodes is at{% endraw %} {{ thresholds.aerospike_set_quota_alert_pct }}{% raw %}% of the quota you have configured on the set.{% endraw %}"
+      - alert: NamespaceSetQuotaWarning
+        expr: (((aerospike_sets_data_used_bytes{job="aerospike", cluster_name="{{ thresholds.cluster }}" } ) / (aerospike_sets_stop_writes_size{job="aerospike", cluster_name="{{ thresholds.cluster }}" } != 0)) * 100) > 80
+        for: {{ thresholds.aerospike_set_quota_duration | default(aerospike_set_quota_duration) }}
+        labels:
+          severity: warn
+        annotations:
+          description: "{% raw %}Nearing memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}.{% endraw %}"
+          summary: "{% raw %}One of your nodes is at{% endraw %} {{ thresholds.aerospike_set_quota_alert_pct }}{% raw %}% of the quota you have configured on the set.{% endraw %}"
+
+      - alert: NamespaceSetQuotaAlertCritical
+        expr: (((aerospike_sets_data_used_bytes{job="aerospike", cluster_name="{{ thresholds.cluster }}" } ) / (aerospike_sets_stop_writes_size{job="aerospike", cluster_name="{{ thresholds.cluster }}" } != 0)) * 100) > 99
+        for: {{ thresholds.aerospike_set_quota_duration | default(aerospike_set_quota_duration) }}
+        labels:
+          severity: critical
+        annotations:
+          description: "{% raw %}At or Above memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}.{% endraw %}"
+          summary: "{% raw %}One of your nodes is at{% endraw %} {{ thresholds.aerospike_set_quota_alert_pct }}{% raw %}% of the quota you have configured on the set.{% endraw %}"
+
 
   - name: aerospike_{{ thresholds.cluster }}.rules > LATENCIES
     rules:
       - alert: ReadLatencyP95Warning
diff --git a/config/prometheus/templates/alert_config_data.json b/config/prometheus/templates/alert_config_data.json
index 5bb7238..2608b5b 100644
--- a/config/prometheus/templates/alert_config_data.json
+++ b/config/prometheus/templates/alert_config_data.json
@@ -104,7 +104,10 @@
     "aerospike_sindex_stage_size_range_duration": "1m",
     "aerospike_sindex_stage_size_suggested_max":"4000000000",
 
-    "alertmanager_aerospike_metric_thresholds": [
+    "aerospike_low_data_avail_warning_duration": "30s",
+    "aerospike_high_data_namespace_warning_duration": "30s",
+
+    "alertmanager_aerospike_metric_cluster_thresholds": [
         {
             "cluster": "aerospike",
             "grafana_url":"http://localhost:7100",
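
Note on how the new variables are consumed: the template loops over the renamed alertmanager_aerospike_metric_cluster_thresholds list and reads each new alert's "for:" duration from the per-cluster entry, falling back to the global defaults added above (aerospike_low_data_avail_warning_duration, aerospike_high_data_namespace_warning_duration) via the Jinja default() filter. As a minimal sketch only — the per-cluster override values below are illustrative, not part of this patch — a cluster entry could override those durations like this:

    "aerospike_low_data_avail_warning_duration": "30s",
    "aerospike_high_data_namespace_warning_duration": "30s",

    "alertmanager_aerospike_metric_cluster_thresholds": [
        {
            "cluster": "aerospike",
            "grafana_url": "http://localhost:7100",
            "aerospike_low_data_avail_warning_duration": "1m",
            "aerospike_high_data_namespace_warning_duration": "2m"
        }
    ]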