From 57c8e19eab9d00524c52dc43bfdeee9e54b834c5 Mon Sep 17 00:00:00 2001 From: Tanmay Jain <103629776+tanmayja@users.noreply.github.com> Date: Wed, 28 Aug 2024 20:34:50 +0530 Subject: [PATCH 1/2] [KO-344] Monitoring stack (#300) * Basic monitoring stack deployment --- .../alertmanager/config/alertmanager.yml | 12 + .../alertmanager/kustomization.yaml | 21 + config/monitoring/alertmanager/pvc.yaml | 10 + config/monitoring/alertmanager/service.yaml | 8 + .../monitoring/alertmanager/statefulset.yaml | 43 + .../aerospike_grafana_dashboards_config.yaml | 11 + .../config/aerospike_grafana_datasource.yaml | 8 + .../grafana/config/download_files.sh | 40 + config/monitoring/grafana/config/grafana.ini | 13 + config/monitoring/grafana/kustomization.yaml | 30 + config/monitoring/grafana/pvc.yaml | 10 + config/monitoring/grafana/service.yaml | 10 + config/monitoring/grafana/statefulset.yaml | 88 ++ config/monitoring/kustomization.yaml | 16 + config/monitoring/prometheus/clusterrole.yaml | 23 + .../prometheus/clusterrolebinding.yaml | 12 + .../config/alert-rules/aerospike_rules.yml | 566 +++++++++++++ .../alert-rules/node_exporter_alerts.yml | 777 ++++++++++++++++++ .../prometheus/config/prometheus.yml | 88 ++ .../monitoring/prometheus/kustomization.yaml | 28 + config/monitoring/prometheus/pvc.yaml | 10 + config/monitoring/prometheus/service.yaml | 11 + .../monitoring/prometheus/serviceaccount.yaml | 4 + config/monitoring/prometheus/statefulset.yaml | 61 ++ .../aerospike-kubernetes-operator/values.yaml | 2 +- 25 files changed, 1901 insertions(+), 1 deletion(-) create mode 100644 config/monitoring/alertmanager/config/alertmanager.yml create mode 100644 config/monitoring/alertmanager/kustomization.yaml create mode 100644 config/monitoring/alertmanager/pvc.yaml create mode 100644 config/monitoring/alertmanager/service.yaml create mode 100644 config/monitoring/alertmanager/statefulset.yaml create mode 100644 config/monitoring/grafana/config/aerospike_grafana_dashboards_config.yaml create mode 100644 config/monitoring/grafana/config/aerospike_grafana_datasource.yaml create mode 100644 config/monitoring/grafana/config/download_files.sh create mode 100644 config/monitoring/grafana/config/grafana.ini create mode 100644 config/monitoring/grafana/kustomization.yaml create mode 100644 config/monitoring/grafana/pvc.yaml create mode 100644 config/monitoring/grafana/service.yaml create mode 100644 config/monitoring/grafana/statefulset.yaml create mode 100644 config/monitoring/kustomization.yaml create mode 100644 config/monitoring/prometheus/clusterrole.yaml create mode 100644 config/monitoring/prometheus/clusterrolebinding.yaml create mode 100644 config/monitoring/prometheus/config/alert-rules/aerospike_rules.yml create mode 100644 config/monitoring/prometheus/config/alert-rules/node_exporter_alerts.yml create mode 100644 config/monitoring/prometheus/config/prometheus.yml create mode 100644 config/monitoring/prometheus/kustomization.yaml create mode 100644 config/monitoring/prometheus/pvc.yaml create mode 100644 config/monitoring/prometheus/service.yaml create mode 100644 config/monitoring/prometheus/serviceaccount.yaml create mode 100644 config/monitoring/prometheus/statefulset.yaml diff --git a/config/monitoring/alertmanager/config/alertmanager.yml b/config/monitoring/alertmanager/config/alertmanager.yml new file mode 100644 index 000000000..4908f2377 --- /dev/null +++ b/config/monitoring/alertmanager/config/alertmanager.yml @@ -0,0 +1,12 @@ +# This is an example alertmanager.yml which sends alert notifications to a
slack channel. + +global: + slack_api_url: "https://hooks.slack.com/services/TXXXXXXXX/XXXXXXXXXXX/XXXXXXXXXXXXXXXXXXXXXXXX" +route: + group_by: ['cluster', 'service'] + receiver: slack_user + +receivers: + - name: slack_user + slack_configs: + - text: "summary: {{ .CommonAnnotations.summary }}\ndescription: {{ .CommonAnnotations.description }}" \ No newline at end of file diff --git a/config/monitoring/alertmanager/kustomization.yaml b/config/monitoring/alertmanager/kustomization.yaml new file mode 100644 index 000000000..bba6090ed --- /dev/null +++ b/config/monitoring/alertmanager/kustomization.yaml @@ -0,0 +1,21 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +labels: + - includeSelectors: true + pairs: + app.kubernetes.io/name: aerospike-alertmanager + app.kubernetes.io/component: alertmanager + +resources: + - statefulset.yaml + - pvc.yaml + - service.yaml + +configMapGenerator: + - name: alertmanager-config + files: + - config/alertmanager.yml + +generatorOptions: + disableNameSuffixHash: true \ No newline at end of file diff --git a/config/monitoring/alertmanager/pvc.yaml b/config/monitoring/alertmanager/pvc.yaml new file mode 100644 index 000000000..c577bf954 --- /dev/null +++ b/config/monitoring/alertmanager/pvc.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: alertmanager-data +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi \ No newline at end of file diff --git a/config/monitoring/alertmanager/service.yaml b/config/monitoring/alertmanager/service.yaml new file mode 100644 index 000000000..a2958c64a --- /dev/null +++ b/config/monitoring/alertmanager/service.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Service +metadata: + name: alertmanager +spec: + ports: + - name: http + port: 9093 \ No newline at end of file diff --git a/config/monitoring/alertmanager/statefulset.yaml b/config/monitoring/alertmanager/statefulset.yaml new file mode 100644 index 000000000..3dc30efac --- /dev/null +++ b/config/monitoring/alertmanager/statefulset.yaml @@ -0,0 +1,43 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: alertmanager +spec: + template: + spec: + containers: + - name: alertmanager + image: prom/alertmanager:latest + args: + - --config.file=/etc/alertmanager/alertmanager.yml + - --storage.path=/alertmanager + - --log.level=info + - --cluster.advertise-address=0.0.0.0:9093 + livenessProbe: + httpGet: + path: /-/healthy + port: 9093 + initialDelaySeconds: 25 + periodSeconds: 20 + ports: + - containerPort: 9093 + readinessProbe: + httpGet: + path: /-/ready + port: 9093 + volumeMounts: + - mountPath: /etc/alertmanager + name: alertmanager-conf + - mountPath: /alertmanager + name: alertmanager-data + securityContext: + fsGroup: 26 + serviceAccountName: aerospike-monitoring-stack-prometheus + volumes: + - name: alertmanager-data + persistentVolumeClaim: + claimName: aerospike-monitoring-stack-alertmanager-data + - name: alertmanager-conf + configMap: + defaultMode: 420 + name: aerospike-monitoring-stack-alertmanager-config \ No newline at end of file diff --git a/config/monitoring/grafana/config/aerospike_grafana_dashboards_config.yaml b/config/monitoring/grafana/config/aerospike_grafana_dashboards_config.yaml new file mode 100644 index 000000000..c7a75fe6d --- /dev/null +++ b/config/monitoring/grafana/config/aerospike_grafana_dashboards_config.yaml @@ -0,0 +1,11 @@ +apiVersion: 1 +providers: + - name: 'default' + folder: 'Aerospike' + folderUid: 'aerospike1' + type: file + disableDeletion: 
false + editable: true + updateIntervalSeconds: 10 + options: + path: /var/lib/grafana/dashboards \ No newline at end of file diff --git a/config/monitoring/grafana/config/aerospike_grafana_datasource.yaml b/config/monitoring/grafana/config/aerospike_grafana_datasource.yaml new file mode 100644 index 000000000..968b2a660 --- /dev/null +++ b/config/monitoring/grafana/config/aerospike_grafana_datasource.yaml @@ -0,0 +1,8 @@ +apiVersion: 1 +datasources: + - name: "Aerospike Prometheus" + type: prometheus + access: proxy + url: http://aerospike-monitoring-stack-prometheus:9090 + editable: true + isDefault: false \ No newline at end of file diff --git a/config/monitoring/grafana/config/download_files.sh b/config/monitoring/grafana/config/download_files.sh new file mode 100644 index 000000000..eaced64e7 --- /dev/null +++ b/config/monitoring/grafana/config/download_files.sh @@ -0,0 +1,40 @@ +#!/bin/sh + +# Check if curl and jq are installed; if not, install curl and jq +if ! command -v curl >/dev/null 2>&1 || ! command -v jq >/dev/null 2>&1; then + echo "curl or jq not found. Installing..." + apk add --no-cache curl jq +else + echo "curl and jq are already installed." +fi + +# Define the dashboards to download in the format ID:REVISION or ID +DASHBOARDS="16119:10 16115:7 20279" + +# Directory where the dashboards will be saved +TARGET_DIR="/mnt/data" +mkdir -p "$TARGET_DIR" + +DELIMITER=':' + +# Loop through each dashboard identifier in DASHBOARDS +for DASHBOARD in $DASHBOARDS; do + if echo "$DASHBOARD" | grep -q "$DELIMITER"; then + # If the delimiter ':' exists, split into ID and REVISION + ID=$(echo "$DASHBOARD" | cut -d"$DELIMITER" -f1) + REVISION=$(echo "$DASHBOARD" | cut -d"$DELIMITER" -f2) + FILENAME="$ID-rev$REVISION.json" + URL="https://grafana.com/api/dashboards/$ID/revisions/$REVISION/download" + curl -o "$TARGET_DIR/$FILENAME" "$URL" + else + # No delimiter, only the ID is provided + ID="$DASHBOARD" + FILENAME="$ID.json" + URL="https://grafana.com/api/dashboards/$ID" + curl -s "$URL" | jq '.json' > "$TARGET_DIR/$FILENAME" + fi +done + +# List the downloaded files + echo "Downloaded dashboard files:" +ls -l "$TARGET_DIR" \ No newline at end of file diff --git a/config/monitoring/grafana/config/grafana.ini b/config/monitoring/grafana/config/grafana.ini new file mode 100644 index 000000000..bc0daea62 --- /dev/null +++ b/config/monitoring/grafana/config/grafana.ini @@ -0,0 +1,13 @@ +[analytics] + check_for_updates = true + [grafana_net] + url = https://grafana.net + [log] + mode = console + level = debug + [paths] + data = /var/lib/grafana/data + logs = /var/log/grafana + plugins = /var/lib/grafana/plugins + [server] + http_port = 3000 \ No newline at end of file diff --git a/config/monitoring/grafana/kustomization.yaml b/config/monitoring/grafana/kustomization.yaml new file mode 100644 index 000000000..cfd14ad2a --- /dev/null +++ b/config/monitoring/grafana/kustomization.yaml @@ -0,0 +1,30 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +labels: + - includeSelectors: true + pairs: + app.kubernetes.io/name: aerospike-grafana + app.kubernetes.io/component: grafana + +resources: + - statefulset.yaml + - pvc.yaml + - service.yaml + +configMapGenerator: + - name: grafana-provisioning-datasources + files: + - config/aerospike_grafana_datasource.yaml + - name: grafana-config + files: + - config/grafana.ini + - name: grafana-dashboard-config + files: + - config/aerospike_grafana_dashboards_config.yaml + - name: download-script + files: + - config/download_files.sh + +generatorOptions: 
disableNameSuffixHash: true diff --git a/config/monitoring/grafana/pvc.yaml b/config/monitoring/grafana/pvc.yaml new file mode 100644 index 000000000..3bc9acc12 --- /dev/null +++ b/config/monitoring/grafana/pvc.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: grafana-data +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi \ No newline at end of file diff --git a/config/monitoring/grafana/service.yaml b/config/monitoring/grafana/service.yaml new file mode 100644 index 000000000..8068ffb02 --- /dev/null +++ b/config/monitoring/grafana/service.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: Service +metadata: + name: grafana +spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 3000 \ No newline at end of file diff --git a/config/monitoring/grafana/statefulset.yaml b/config/monitoring/grafana/statefulset.yaml new file mode 100644 index 000000000..26914c6eb --- /dev/null +++ b/config/monitoring/grafana/statefulset.yaml @@ -0,0 +1,88 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: grafana +spec: + replicas: 1 + template: + spec: + serviceAccountName: aerospike-monitoring-stack-prometheus + terminationGracePeriodSeconds: 120 + initContainers: + - name: download-dashboards + image: alpine:latest + command: ["/bin/sh"] + args: [ "-c", "/bin/sh -x /mnt/scripts/download_files.sh" ] + volumeMounts: + - name: dashboards + mountPath: /mnt/data + - name: script-volume + mountPath: /mnt/scripts + containers: + - name: grafana + image: "grafana/grafana:latest" + imagePullPolicy: "IfNotPresent" + volumeMounts: + - name: grafana-config + mountPath: "/etc/grafana/" + - name: grafana-provisioning-datasources + mountPath: "/etc/grafana/provisioning/datasources" + - name: grafana-dashboard-config + mountPath: "/etc/grafana/provisioning/dashboards" + - name: grafana-data + mountPath: "/data" + - name: dashboards + mountPath: "/var/lib/grafana/dashboards" + ports: + - name: service + containerPort: 80 + protocol: TCP + - name: grafana + containerPort: 3000 + protocol: TCP + livenessProbe: + httpGet: + path: /api/health + port: 3000 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 10 + successThreshold: 1 + failureThreshold: 10 + readinessProbe: + httpGet: + path: /api/health + port: 3000 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 10 + successThreshold: 1 + failureThreshold: 10 + env: + - name: GF_SECURITY_ADMIN_USER + value: "admin" + - name: GF_SECURITY_ADMIN_PASSWORD + value: "admin" + - name: GF_PATHS_DATA + value: /data/grafana/data + securityContext: + fsGroup: 472 + volumes: + - name: grafana-config + configMap: + name: aerospike-monitoring-stack-grafana-config + - name: grafana-provisioning-datasources + configMap: + name: aerospike-monitoring-stack-grafana-provisioning-datasources + - name: grafana-dashboard-config + configMap: + defaultMode: 420 + name: aerospike-monitoring-stack-grafana-dashboard-config + - name: script-volume + configMap: + name: aerospike-monitoring-stack-download-script + - name: grafana-data + persistentVolumeClaim: + claimName: aerospike-monitoring-stack-grafana-data + - name: dashboards + emptyDir: {} \ No newline at end of file diff --git a/config/monitoring/kustomization.yaml b/config/monitoring/kustomization.yaml new file mode 100644 index 000000000..3499301bb --- /dev/null +++ b/config/monitoring/kustomization.yaml @@ -0,0 +1,16 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: monitoring + +labels: + - 
includeSelectors: false + pairs: + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/part-of: aerospike-monitoring-stack + +namePrefix: aerospike-monitoring-stack- + +resources: + - grafana + - prometheus + - alertmanager \ No newline at end of file diff --git a/config/monitoring/prometheus/clusterrole.yaml b/config/monitoring/prometheus/clusterrole.yaml new file mode 100644 index 000000000..6674295f1 --- /dev/null +++ b/config/monitoring/prometheus/clusterrole.yaml @@ -0,0 +1,23 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: + - apiGroups: + - "" + resources: + - namespaces + - pods + - statefulsets + - configmaps + - secrets + - services + - nodes + - nodes/metrics + - endpoints + verbs: + - list + - watch + - get + - nonResourceURLs: ["/metrics"] + verbs: ["get"] \ No newline at end of file diff --git a/config/monitoring/prometheus/clusterrolebinding.yaml b/config/monitoring/prometheus/clusterrolebinding.yaml new file mode 100644 index 000000000..2ff72f96b --- /dev/null +++ b/config/monitoring/prometheus/clusterrolebinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: aerospike-monitoring-stack-prometheus +subjects: + - kind: ServiceAccount + name: aerospike-monitoring-stack-prometheus + namespace: monitoring \ No newline at end of file diff --git a/config/monitoring/prometheus/config/alert-rules/aerospike_rules.yml b/config/monitoring/prometheus/config/alert-rules/aerospike_rules.yml new file mode 100644 index 000000000..b0e2d7107 --- /dev/null +++ b/config/monitoring/prometheus/config/alert-rules/aerospike_rules.yml @@ -0,0 +1,566 @@ +groups: + - name: aerospike.rules + rules: + - alert: AerospikeExporterAgentDown + expr: up{job="aerospike"} == 0 + for: 30s + labels: + severity: warn + annotations: + summary: "Aerospike Prometheus exporter job {{ $labels.instance }} down" + description: "{{ $labels.instance }} has been down for more than 30s." + + - alert: AerospikeNodeDown + expr: aerospike_node_up{job="aerospike"} == 0 + for: 30s + labels: + severity: warn + annotations: + summary: "Node {{ $labels.instance }} down" + description: "{{ $labels.instance }} node is down." + + - name: aerospike_aerospike.rules > NAMESPACE + rules: + - alert: NamespaceStopWrites + expr: aerospike_namespace_stop_writes{job="aerospike" } == 1 + for: 30s + labels: + severity: critical + annotations: + summary: "Stop writes for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Used disk space for namespace {{ $labels.ns }} in node {{ $labels.instance }} is above the stop writes limit." + + - alert: AerospikeAllFlashAverageObjectsPerSprig + expr: ( ((aerospike_namespace_master_objects { job="aerospike" }/4096)/aerospike_namespace_partition_tree_sprigs{ job="aerospike" } ) and ignoring (index, sindex) ((aerospike_namespace_index_type_mounts_size_limit { job="aerospike" }) or (aerospike_namespace_sindex_type_mounts_size_limit { job="aerospike" }) ))> 50 + for: 30s + labels: + severity: warn + annotations: + summary: "Average Objects per sprig in {{ $labels.instance }}/{{ $labels.ns }}" + description: "Average objects per sprig has been breached for namespace {{ $labels.ns }} in node {{ $labels.instance }}. 
" + + - alert: AerospikeAverageObjectsPerSprig + expr: ( ((aerospike_namespace_master_objects { job="aerospike" }/4096)/aerospike_namespace_partition_tree_sprigs{ job="aerospike" } ) unless ignoring (index, sindex) ((aerospike_namespace_index_type_mounts_size_limit { job="aerospike" }) or (aerospike_namespace_sindex_type_mounts_size_limit { job="aerospike" }) ))> 5000 + for: 30s + labels: + severity: warn + annotations: + summary: "Average Objects per sprig in {{ $labels.instance }}/{{ $labels.ns }}" + description: "Average objects per sprig has been breached for namespace {{ $labels.ns }} in node {{ $labels.instance }}. " + + - alert: AerospikeIndexStageSizeWarn + # Check here: https://docs.aerospike.com/reference/configuration#index-stage-size + # <128mb or >4gb -- send warn alert + expr: (aerospike_namespace_index_stage_size{job="aerospike" }>4000000000) + for: 1m + labels: + severity: warn + annotations: + summary: "Index stage size configuration is not configured according to documentation in {{ $labels.instance }}/{{ $labels.ns }}" + description: "Index stage size configuration is not configured according to documentation in {{ $labels.ns }} in node {{ $labels.instance }}. " + + - alert: AerospikeSIndexStageSizeWarn + # Check here: https://docs.aerospike.com/reference/configuration#sindex-stage-size + # <128mb or >4gb -- send warn alert + expr: (aerospike_namespace_sindex_stage_size{job="aerospike" }>4000000000) + for: 1m + labels: + severity: warn + annotations: + summary: "SIndex stage size configuration is not configured according to documentation in {{ $labels.instance }}/{{ $labels.ns }}" + description: "SIndex stage size configuration is not configured according to documentation in {{ $labels.ns }} in node {{ $labels.instance }}. " + + - alert: AerospikeIndexPressureDirtyMemoryWarn + # Check here: https://docs.aerospike.com/reference/info#index-pressure + expr: (((aerospike_namespace_index_pressure_dirty_memory{ job="aerospike" })/(aerospike_namespace_index_pressure_total_memory{ job="aerospike" })*100)>10000000) + for: 1m + labels: + severity: warn + annotations: + summary: "Dirty memory ratio against the total memory is above configured limit in node {{ $labels.instance }}" + description: "Dirty memory ration against the total memory is above configured limit in node {{ $labels.instance }}" + + - alert: NamespaceDiskCloseToStopWrites + expr: (aerospike_namespace_device_available_pct{job="aerospike" } - aerospike_namespace_storage_engine_min_avail_pct{job="aerospike" }) <= 10 + for: 30s + labels: + severity: warn + annotations: + summary: "Close to stop writes for {{ $labels.instance }}/{{ $labels.ns }} due to device_available_pct" + description: "device_available_pct for namespace {{ $labels.ns }} in node {{ $labels.instance }} is close to min-avail-pct (stop writes) limit." + + - alert: NamespaceMemoryCloseToStopWrites + expr: (aerospike_namespace_stop_writes_pct{job="aerospike" } - (100 - aerospike_namespace_memory_free_pct{job="aerospike" })) <= 10 + for: 30s + labels: + severity: warn + annotations: + summary: "Close to stop writes for {{ $labels.instance }}/{{ $labels.ns }} due to memory " + description: "Free memory for namespace {{ $labels.ns }} in node {{ $labels.instance }} is close to stop writes limit." 
+ + - alert: NamespacePmemCloseToStopWrites + expr: (aerospike_namespace_pmem_available_pct{job="aerospike" } - aerospike_namespace_storage_engine_min_avail_pct{job="aerospike" }) <= 10 + for: 30s + labels: + severity: warn + annotations: + summary: "Close to stop writes for {{ $labels.instance }}/{{ $labels.ns }} due to pmem_available_pct" + description: "pmem_available_pct for namespace {{ $labels.ns }} in node {{ $labels.instance }} is close to min-avail-pct (stop writes) limit." + + - alert: NamespaceFreeMemoryCloseToStopWrites + expr: (aerospike_namespace_stop_writes_sys_memory_pct{job="aerospike" } - scalar(100 - (aerospike_node_stats_system_free_mem_pct{job="aerospike" }))) <= 10 + for: 30s + labels: + severity: critical + annotations: + summary: "Close to stop writes for {{ $labels.instance }}/{{ $labels.ns }} due to memory" + description: "Free memory for namespace {{ $labels.ns }} in node {{ $labels.instance }} is close to stop writes limit." + + - alert: ActiveProxies + expr: (increase(aerospike_namespace_client_proxy_complete{job="aerospike" }[2m]) + increase(aerospike_namespace_client_proxy_timeout{job="aerospike" }[2m]) + increase(aerospike_namespace_client_proxy_error{job="aerospike" }[2m]) + increase(aerospike_namespace_batch_sub_proxy_complete{job="aerospike" }[2m]) + increase(aerospike_namespace_batch_sub_proxy_timeout{job="aerospike" }[2m]) + increase(aerospike_namespace_batch_sub_proxy_error{job="aerospike" }[2m])) > 0 + for: 30s + labels: + severity: warn + annotations: + summary: "Node is proxying transactions. Proxies can happen during cluster change / migrations or if there are any network issues." + description: "Active proxies detected for {{ $labels.ns }} on node {{ $labels.instance }}" + + - alert: NamespaceSupervisorFallingBehind + expr: aerospike_namespace_objects{job="aerospike"}>0 and aerospike_namespace_nsup_cycle_deleted_pct{job="aerospike" } > 1 # (Aerospike 6.3 and later) + for: 30s + labels: + severity: critical + annotations: + summary: "NSUP is falling behind; check how long the most recent NSUP cycle lasted" + description: "The namespace supervisor (NSUP) appears to be falling behind for namespace {{ $labels.ns }} in node {{ $labels.instance }}; check the length of time the most recent NSUP cycle lasted" + + - alert: HwmBreached + expr: aerospike_namespace_hwm_breached{job="aerospike" } == 1 + for: 30s + labels: + severity: warn + annotations: + summary: "High water mark breached for {{ $labels.instance }}/{{ $labels.ns }}" + description: "high-water-disk-pct or high-water-memory-pct has been breached for namespace {{ $labels.ns }} in node {{ $labels.instance }}. Eviction may start to recover disk space." + + - alert: LowDeviceAvailWarning + expr: aerospike_namespace_device_available_pct{job="aerospike" } < 55 + for: 30s + labels: + severity: warn + annotations: + summary: "Device available warning for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Device available has dropped below 55% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate that defrag is unable to keep up with the current load, and may result in stop writes if it continues to drop." + + - alert: LowDeviceAvailCritical + expr: aerospike_namespace_device_available_pct{job="aerospike" } < 25 + for: 30s + labels: + severity: critical + annotations: + summary: "Device available critically low for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Device available has dropped below 25% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. 
May indicate that defrag is unable to keep up with the current load, and may result in stop writes if it continues to drop." + + - alert: ClientTimeouts + expr: rate(aerospike_namespace_client_read_timeout{job="aerospike" }[1m]) > 1 or rate(aerospike_namespace_client_write_timeout{job="aerospike" }[1m]) > 1 or rate(aerospike_namespace_client_tsvc_timeout{job="aerospike" }[1m]) > 1 + for: 1m + labels: + severity: critical + annotations: + summary: "Client transactions are timing out" + description: "Client connections timing out at a rate greater than 1/s. Timeouts can occur during network issues or resource contention on the client and/or server nodes." + + - alert: LowMemoryNamespaceWarning + expr: aerospike_namespace_memory_free_pct{job="aerospike" } < 20 + for: 30s + labels: + severity: warn + annotations: + summary: "Memory available warning for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Memory free has dropped below 20% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate a need to reduce the object count or increase capacity." + + - alert: LowMemoryNamespaceCritical + expr: aerospike_namespace_memory_free_pct{job="aerospike" } < 15 + for: 30s + labels: + severity: critical + annotations: + summary: "Memory available critically low for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Memory free has dropped below 15% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate a need to reduce the object count or increase capacity." + + - alert: DeviceWriteQWarning + expr: aerospike_namespace_storage_engine_device_write_q{job="aerospike" } > 1 + for: 30s + labels: + severity: warn + annotations: + summary: "Device write queue high for {{ $labels.instance }}/{{ $labels.ns }}/{{ $labels.device_index }}" + description: "Device write queue is greater than 1 for namespace {{ $labels.ns }} on device {{ $labels.device_index }} in node {{ $labels.instance }}. May indicate underperforming storage subsystem or hotkeys." + + - alert: ShadowDeviceWriteQWarning + expr: aerospike_namespace_storage_engine_device_shadow_write_q{job="aerospike" } > 1 + for: 30s + labels: + severity: warn + annotations: + summary: "Shadow device write queue high for {{ $labels.instance }}/{{ $labels.ns }}/{{ $labels.device_index }}" + description: "Shadow device write queue is greater than 1 for namespace {{ $labels.ns }} on device {{ $labels.device_index }} in node {{ $labels.instance }}. May indicate underperforming storage subsystem or hotkeys." + + - alert: DeviceDefragQWarning + expr: aerospike_namespace_storage_engine_device_defrag_q{job="aerospike" }> 1000 + for: 5m + labels: + severity: warn + annotations: + summary: "Device defrag queue high for {{ $labels.instance }}/{{ $labels.ns }}/{{ $labels.device_index }}" + description: "Device defrag queue has been above 1000 for more than 5m for namespace {{ $labels.ns }} on device {{ $labels.device_index }} in node {{ $labels.instance }}. May indicate underperforming storage subsystem or hotkeys." 
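+ + # The rule files in this directory can be validated offline before the ConfigMap is + # applied. A minimal sketch, assuming promtool as shipped in the prom/prometheus image: + # docker run --rm -v "$PWD:/rules" --entrypoint promtool prom/prometheus:latest check rules /rules/aerospike_rules.yml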
+ + - alert: ClockSkewStopWrites + expr: aerospike_namespace_clock_skew_stop_writes{job="aerospike" } == 1 + for: 30s + labels: + severity: critical + annotations: + summary: "Clock skew stop writes" + description: "Clock has skewed for namespace {{ $labels.ns }} in node {{ $labels.instance }}" + + - alert: UnavailablePartitions + expr: aerospike_namespace_unavailable_partitions{job="aerospike" } > 0 + for: 30s + labels: + severity: critical + annotations: + summary: "Some partitions are inaccessible, and roster nodes are missing from the cluster." + description: "Some partitions are not available for namespace {{ $labels.ns }} on node {{ $labels.instance }}. Check for network issues and make sure the cluster forms properly." + + - alert: DeadPartitions + expr: aerospike_namespace_dead_partitions{job="aerospike" } > 2 + for: 30s + labels: + severity: critical + annotations: + summary: "There are unavailable partitions, even though all roster nodes are present in the cluster." + description: "Some partitions are dead for namespace {{ $labels.ns }} on node {{ $labels.instance }}. More than replication-factor nodes had an unclean shutdown, and there may be data loss. The revive command will be required to make the partitions available again." + + - alert: NamespaceDataCloseToStopWrites + expr: (aerospike_namespace_data_avail_pct{job="aerospike" } - aerospike_namespace_storage_engine_stop_writes_avail_pct{job="aerospike" }) <= 10 + for: 30s + labels: + severity: warn + annotations: + summary: "Close to stop writes for {{ $labels.instance }}/{{ $labels.ns }} due to data_avail_pct" + description: "data_avail_pct for namespace {{ $labels.ns }} in node {{ $labels.instance }} is close to stop-writes-avail-pct limit." + + - alert: LowDataAvailWarning + expr: aerospike_namespace_data_avail_pct{job="aerospike" } < 55 + for: 30s + labels: + severity: warn + annotations: + summary: "Data available warning for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Data available has dropped below 55% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate that defrag is unable to keep up with the current load, and may result in stop writes if it continues to drop." + + - alert: LowDataAvailCritical + expr: aerospike_namespace_data_avail_pct{job="aerospike" } < 25 + for: 30s + labels: + severity: critical + annotations: + summary: "Data available critically low for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Data available has dropped below 25% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate that defrag is unable to keep up with the current load, and may result in stop writes if it continues to drop." + + - alert: HighDataUseNamespaceWarning + expr: aerospike_namespace_data_used_pct{job="aerospike" , storage_engine="memory" } > 80 + for: 30s + labels: + severity: warn + annotations: + summary: "Data utilization warning for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Data used has crossed above 80% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate a need to reduce the object count or increase capacity." + + - alert: HighDataUseNamespaceCritical + expr: aerospike_namespace_data_used_pct{job="aerospike" , storage_engine="memory" } > 85 + for: 30s + labels: + severity: critical + annotations: + summary: "Data utilization critically high for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Data used has crossed above 85% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. 
May indicate a need to reduce the object count or increase capacity." + + - name: aerospike_aerospike.rules > NODE + rules: + - alert: PrometheusNodeExporterNotPresent + expr: absent(node_cpu_seconds_total) == 1 + for: 30s + labels: + severity: warn + annotations: + summary: "Prometheus Node Exporter is not configured" + description: "Prometheus Node Exporter is not configured in {{ $labels.instance }}" + + - alert: BestPracticesFailure + expr: aerospike_node_stats_failed_best_practices{job="aerospike" } > 0 + for: 30s + labels: + severity: warn + annotations: + summary: "Best Practices check failed on {{ $labels.instance }} in cluster {{ $labels.cluster_name }}" + description: "Best Practices check failed on {{ $labels.instance }} in cluster {{ $labels.cluster_name }}" + + - alert: ClusterSize + expr: aerospike_node_stats_cluster_size{job="aerospike" } < 3 + for: 30s + labels: + severity: critical + annotations: + summary: "Cluster size lower than expected" + description: "Cluster size mismatch for node {{ $labels.instance }}" + + - alert: ClientConnectionsWarning + expr: aerospike_node_stats_client_connections{job="aerospike" } > 11 + for: 30s + labels: + severity: warn + annotations: + summary: "Client connections warning" + description: "Client connections are greater than 11. Connections will fail if they exceed proto-fd-max." + - alert: ClientConnectionsCritical + expr: aerospike_node_stats_client_connections{job="aerospike" } > 10000 + for: 30s + labels: + severity: critical + annotations: + summary: "Client connections critical" + description: "Client connections are greater than expected peak of 10000." + + - alert: ClientConnectionChurn + expr: rate(aerospike_node_stats_client_connections_opened{job="aerospike" }[1m]) > 100 or rate(aerospike_node_stats_client_connections_closed{job="aerospike" }[1m]) > 100 + for: 1m + labels: + severity: critical + annotations: + summary: "Clients are churning connections at a high rate" + description: "Client connections are being opened or closed at a rate greater than 100/s. Connection churn can increase latency and client timeouts which in turn cause the client to open more connections." + + - alert: ClockSkewWarning + expr: aerospike_node_stats_cluster_clock_skew_ms{job="aerospike" } > 2000 + for: 30s + labels: + severity: warn + annotations: + summary: "Cluster clock skew warning" + description: "Current maximum clock skew between nodes is above 2000 ms - stop writes will trigger when skew exceeds 20 seconds if nsup-period is non-zero." + + - alert: ClockSkewCritical + expr: aerospike_node_stats_cluster_clock_skew_ms{job="aerospike" } > 20000 + for: 30s + labels: + severity: critical + annotations: + summary: "Cluster clock skew critical alert" + description: "Current maximum clock skew between nodes is above 20000 ms - stop writes will trigger when skew exceeds 20 seconds if nsup-period is non-zero." + + - alert: LowMemorySystemWarning + expr: aerospike_node_stats_system_free_mem_pct{job="aerospike" } < 20 + for: 30s + labels: + severity: warn + annotations: + summary: "Memory available warning for {{ $labels.instance }}" + description: "Total memory free has dropped below 20% for node {{ $labels.instance }}." + + - alert: LowMemorySystemCritical + expr: aerospike_node_stats_system_free_mem_pct{job="aerospike" } < 10 + for: 30s + labels: + severity: critical + annotations: + summary: "Memory available critically low for {{ $labels.instance }}" + description: "Total memory free has dropped below 10% for node {{ $labels.instance }}."
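+ + # During planned maintenance the node-level alerts above can be silenced through the + # bundled Alertmanager. A hedged sketch using amtool (shipped in the prom/alertmanager + # image) and the Service name this kustomization generates: + # amtool silence add alertname=LowMemorySystemWarning --alertmanager.url=http://aerospike-monitoring-stack-alertmanager:9093 --duration=2h --comment="planned node maintenance"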
+ + - alert: HeapEfficiencyWarning + #expr: aerospike_node_stats_heap_efficiency_pct{job="aerospike" } < 60 + expr: (100 - aerospike_node_stats_system_free_mem_pct{job="aerospike" }) > 70 and aerospike_node_stats_heap_efficiency_pct{job="aerospike" } < 60 + for: 30s + labels: + severity: warn + annotations: + summary: "Heap efficiency warning for {{ $labels.instance }}" + description: "Heap efficiency for node for {{ $labels.instance }} has dropped below 60%." + + - alert: RwInProgressWarning + expr: aerospike_node_stats_rw_in_progress{job="aerospike" }> 100 + for: 30s + labels: + severity: warn + annotations: + summary: "Read/write queue too high for {{ $labels.instance }}/{{ $labels.ns }}/{{ $labels.device_index }}" + description: "Read/write queue is greater than 100 for namespace {{ $labels.ns }} on device {{ $labels.device_index }} in node {{ $labels.instance }}. May indicate underperforming storage subsystem or hotkeys." + + - name: aerospike_aerospike.rules > SET + rules: + - alert: pre7x_NamespaceSetQuotaWarning + expr: (((aerospike_sets_device_data_bytes{job="aerospike" } + aerospike_sets_memory_data_bytes{job="aerospike" }) / (aerospike_sets_stop_writes_size{job="aerospike" } != 0)) * 100) > 80 + for: 30s + labels: + severity: warn + annotations: + description: "Nearing memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}." + summary: "One of your nodes is at % of the quota you have configured on the set." + + - alert: pre7x_NamespaceSetQuotaAlertCritical + expr: (((aerospike_sets_device_data_bytes{job="aerospike" } + aerospike_sets_memory_data_bytes{job="aerospike" }) / (aerospike_sets_stop_writes_size{job="aerospike" } != 0)) * 100) > 99 + for: 30s + labels: + severity: critical + annotations: + description: "At or Above memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}." + summary: "One of your nodes is at % of the quota you have configured on the set." + + - alert: NamespaceSetQuotaWarning + expr: (((aerospike_sets_data_used_bytes{job="aerospike" } ) / (aerospike_sets_stop_writes_size{job="aerospike" } != 0)) * 100) > 80 + for: 30s + labels: + severity: warn + annotations: + description: "Nearing memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}." + summary: "One of your nodes is at % of the quota you have configured on the set." + + - alert: NamespaceSetQuotaAlertCritical + expr: (((aerospike_sets_data_used_bytes{job="aerospike" } ) / (aerospike_sets_stop_writes_size{job="aerospike" } != 0)) * 100) > 99 + for: 30s + labels: + severity: critical + annotations: + description: "At or Above memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}." + summary: "One of your nodes is at % of the quota you have configured on the set." + + - name: aerospike_aerospike.rules > LATENCIES + rules: + - alert: ReadLatencyP95Warning + expr: histogram_quantile(0.95, (aerospike_latencies_read_ms_bucket{job="aerospike" })) > 2 + for: 2m + labels: + severity: warn + annotations: + summary: "Read latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "95th percentile read latency breached 2ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}." 
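+ + # The latency rules in this group compute histogram_quantile per node. If a single + # cluster-wide percentile is preferred, the buckets can be aggregated first -- a + # sketch, not part of the shipped rules: + # histogram_quantile(0.95, sum by (le, ns) (aerospike_latencies_read_ms_bucket{job="aerospike"})) > 2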
+ + - alert: ReadLatencyP99Warning + expr: histogram_quantile(0.99, (aerospike_latencies_read_ms_bucket{job="aerospike" })) > 4 + for: 2m + labels: + severity: warn + annotations: + summary: "Read latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "99th percentile read latency breached 4ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}." + + - alert: ReadLatencyP999Warning + expr: histogram_quantile(0.999, (aerospike_latencies_read_ms_bucket{job="aerospike" })) > 16 + for: 2m + labels: + severity: warn + annotations: + summary: "Read latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "99.9th percentile read latency breached 16ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}." + + - alert: WriteLatencyP95Warning + expr: histogram_quantile(0.95, (aerospike_latencies_write_ms_bucket{job="aerospike" })) > 4 + for: 2m + labels: + severity: warn + annotations: + summary: "Write latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "95th percentile write latency breached 4ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}." + + - alert: WriteLatencyP99Warning + expr: histogram_quantile(0.99, (aerospike_latencies_write_ms_bucket{job="aerospike" })) > 16 + for: 2m + labels: + severity: warn + annotations: + summary: "Write latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "99th percentile write latency breached 16ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}." + + - alert: WriteLatencyP999Warning + expr: histogram_quantile(0.999, (aerospike_latencies_write_ms_bucket{job="aerospike" })) > 64 + for: 2m + labels: + severity: warn + annotations: + summary: "Write latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "99.9th percentile write latency breached 64ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}." + + + - name: aerospike_aerospike.rules > XDR + rules: + + - alert: XDRTimelag + expr: aerospike_xdr_lag{job="aerospike" } > 5 + for: 2m + labels: + severity: warn + annotations: + summary: "XDR lag for namespace {{ $labels.ns }} exceeding 5 second(s) from node {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "XDR lag may be due to network connectivity issues, inability for the source to keep up with incoming writes, or write failures at the destination." + - alert: XDRAbandonedRecords + expr: rate(aerospike_xdr_abandoned{job="aerospike" }[1m]) > 0 + for: 30s + labels: + severity: warn + annotations: + summary: "Abandoned records detected for XDR on node {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "Records abandoned at a destination cluster may indicate a configuration mismatch for the namespace between source and destination." + - alert: XDRRetryNoNode + expr: rate(aerospike_xdr_retry_no_node{job="aerospike" }[1m]) > 0 + for: 30s + labels: + severity: warn + annotations: + summary: "XDR retries occurring on node {{ $labels.instance }} to DC {{ $labels.dc }} due to unknown master node destination" + description: "XDR cannot determine which destination node is the master." + + - alert: XDRRetryConnReset + expr: rate(aerospike_xdr_retry_conn_reset{job="aerospike" }[1m]) > 2 + for: 2m + labels: + severity: warn + annotations: + summary: "Rate of XDR connection resets greater than 2/s from {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "XDR retries occurring due to timeouts, network problems, or destination node restarts."
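+ + # When any of the XDR alerts in this group fire, the per-DC lag can be inspected + # directly in the Prometheus console with a query such as (sketch): + # max by (dc, ns) (aerospike_xdr_lag{job="aerospike"})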
+ + - alert: XDRRetryDest + expr: rate(aerospike_xdr_retry_dest{job="aerospike" }[1m]) > 5 + for: 2m + labels: + severity: warn + annotations: + summary: "Increase in XDR write retries is greater than 5/s from {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "XDR retries due to errors returned by the destination node, i.e. key busy or device overload." + + - alert: XDRLatencyWarning + expr: aerospike_xdr_latency_ms{job="aerospike" } > 100 + for: 30s + labels: + severity: warn + annotations: + summary: "XDR latency above 100ms from {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "Network latency between XDR source and destination over the last 30s is higher than expected." + + - alert: XDRLap + expr: aerospike_xdr_lap_us{job="aerospike" } > 75000 + for: 30s + labels: + severity: warn + annotations: + summary: "XDR lap time greater than 75000 microseconds from {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "The XDR processing cycle time (lap_us) is approaching the configured period-ms value." + + - alert: XDRRecoveries + expr: increase(aerospike_xdr_recoveries{job="aerospike" }[1m]) > 0 + for: 2m + labels: + severity: critical + annotations: + summary: "XDR recoveries increasing on {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "XDR recoveries happen during rewind or may indicate that the in-memory transaction queue is full (the transaction-queue-limit may be too small)." \ No newline at end of file diff --git a/config/monitoring/prometheus/config/alert-rules/node_exporter_alerts.yml b/config/monitoring/prometheus/config/alert-rules/node_exporter_alerts.yml new file mode 100644 index 000000000..ce2298672 --- /dev/null +++ b/config/monitoring/prometheus/config/alert-rules/node_exporter_alerts.yml @@ -0,0 +1,777 @@ +groups: + - name: node_exporter_alerts + rules: + - alert: HostNodeExporterDownCritical + expr: up{job="node-exporter"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host ({{ $labels.instance }}) is down in cluster {{ $labels.cluster_name }}" + description: "Failed to scrape {{ $labels.job }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} for more than 1 minute. node-exporter seems down." + + - alert: HostMemoryFillingUpWarn + expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100 ) > 70 + for: 1m + labels: + severity: warn + annotations: + summary: "Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Memory is filling up (> 70%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostMemoryFillingUpCritical + expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100 ) > 90 + for: 1m + labels: + severity: critical + annotations: + summary: "Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Node memory is filling up (> 90%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }} " + + - alert: HostDiskSpaceFillingUpWarn + expr: 100 - (node_filesystem_avail_bytes{job="node-exporter"} * 100) / node_filesystem_size_bytes{job="node-exporter"} > 70 and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host disk space is filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is crossing (> 70%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostDiskSpaceFillingUpCritical + expr: 100 - (node_filesystem_avail_bytes{job="node-exporter"} * 100) / node_filesystem_size_bytes{job="node-exporter"} > 90 and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host disk space is filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is crossing (> 90%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }} " + + - alert: HostInodesFillingUpWarn + expr: 100 - node_filesystem_files_free{job="node-exporter"} / node_filesystem_files{job="node-exporter"} * 100 > 70 and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host inodes filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is running out of available inodes (> 70%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostInodesFillingUpCritical + expr: 100 - node_filesystem_files_free{job="node-exporter"} / node_filesystem_files{job="node-exporter"} * 100 > 90 and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host inodes filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is running out of available inodes (> 90%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }} " + + - alert: HostUnusualDiskReadLatencyWarn + expr: (node_disk_read_time_seconds_total{job="node-exporter"}) / (node_disk_reads_completed_total{job="node-exporter"}) > 0.1 and (node_disk_reads_completed_total{job="node-exporter"}) > 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read latency is increasing (read operations > 0.1s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskReadLatencyCritical + expr: (node_disk_read_time_seconds_total{job="node-exporter"}) / (node_disk_reads_completed_total{job="node-exporter"}) > 0.5 and (node_disk_reads_completed_total{job="node-exporter"}) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read latency is increasing (read operations > 0.5s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }} " + + - alert: HostUnusualDiskWriteLatencyWarn + expr: (node_disk_write_time_seconds_total{job="node-exporter"}) / (node_disk_writes_completed_total{job="node-exporter"}) > 0.1 and (node_disk_writes_completed_total{job="node-exporter"}) > 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual disk write latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write latency is increasing (write operations > 0.1s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteLatencyCritical + expr: (node_disk_write_time_seconds_total{job="node-exporter"}) / (node_disk_writes_completed_total{job="node-exporter"}) > 0.5 and (node_disk_writes_completed_total{job="node-exporter"}) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual disk write latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write latency is increasing (write operations > 0.5s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostHighCpuUtilizationWarnHost + expr: sum by (instance) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 70 + for: 30s + labels: + severity: warn + annotations: + summary: "Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is crossing (> 70%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostHighCpuUtilizationCriticalHost + expr: sum by (instance) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 90 + for: 30s + labels: + severity: critical + annotations: + summary: "Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is crossing (> 90%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostHighCpuUtilizationWarnCore + expr: sum by (instance, cpu) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 70 + for: 30s + labels: + severity: warn + annotations: + summary: "Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is crossing (> 70%) for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostHighCpuUtilizationCriticalCore + expr: sum by (instance, cpu) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 90 + for: 30s + labels: + severity: critical + annotations: + summary: "Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is crossing (> 90%) for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }}" + + - alert: HostCpuStealWarnHost + expr: sum by (instance)(node_cpu_seconds_total{mode="steal", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 3 + for: 30s + labels: + severity: warn + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is (> 3%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}. A noisy neighbor is killing VM performance or a spot instance may be out of credit. " + + - alert: HostCpuStealCriticalHost + expr: sum by (instance)(node_cpu_seconds_total{mode="steal", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 5 + for: 30s + labels: + severity: critical + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is (> 5%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}. A noisy neighbor is killing VM performance or a spot instance may be out of credit. " + + - alert: HostCpuStealWarnCore + expr: sum by (instance, cpu)(node_cpu_seconds_total{mode="steal", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 3 + for: 30s + labels: + severity: warn + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is (> 3%) for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}. A noisy neighbor is killing VM performance or a spot instance may be out of credit. " + + - alert: HostCpuStealCriticalCore + expr: sum by (instance, cpu)(node_cpu_seconds_total{mode="steal", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 5 + for: 30s + labels: + severity: critical + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is (> 5%) for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}. A noisy neighbor is killing VM performance or a spot instance may be out of credit. " + + - alert: HostNetworkReceiveErrorsWarn + expr: ((node_network_receive_errs_total{job="node-exporter"}) / (node_network_receive_packets_total{job="node-exporter"})) * 100 > 3 + for: 1m + labels: + severity: warn + annotations: + summary: "Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance interface has encountered {{ $value }} receive errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }} " + + - alert: HostNetworkReceiveErrorsCritical + expr: ((node_network_receive_errs_total{job="node-exporter"}) / (node_network_receive_packets_total{job="node-exporter"})) * 100 > 5 + for: 1m + labels: + severity: critical + annotations: + summary: "Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance interface has encountered {{ $value }} receive errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }} " + + - alert: HostNetworkTransmitErrorsWarn + expr: ((node_network_transmit_errs_total{job="node-exporter"}) / (node_network_transmit_packets_total{job="node-exporter"})) * 100 > 3 + for: 1m + labels: + severity: warn + annotations: + summary: "Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance has encountered {{ $value }} transmit errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostNetworkTransmitErrorsCritical + expr: ((node_network_transmit_errs_total{job="node-exporter"}) / (node_network_transmit_packets_total{job="node-exporter"})) * 100 > 5 + for: 1m + labels: + severity: critical + annotations: + summary: "Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance has encountered {{ $value }} transmit errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostNetworkInterfaceSaturatedWarn + expr: ((node_network_receive_bytes_total{job="node-exporter"}) + (node_network_transmit_bytes_total{job="node-exporter"})) / (node_network_speed_bytes{job="node-exporter"}) > 0.8 + for: 1m + labels: + severity: warn + annotations: + summary: "Host Network Interface Saturated ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "The network interface is getting overloaded (> 0.8) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostNetworkInterfaceSaturatedCritical + expr: ((node_network_receive_bytes_total{job="node-exporter"}) + (node_network_transmit_bytes_total{job="node-exporter"})) / (node_network_speed_bytes{job="node-exporter"}) > 0.9 + for: 1m + labels: + severity: critical + annotations: + summary: "Host Network Interface Saturated on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "The network interface is getting overloaded (> 0.9) on host {{ $labels.instance }}:{{ $labels.device }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostClockNotSynchronisingWarn + expr: min_over_time(node_timex_sync_status{job="node-exporter"}[2m]) == 0 and node_timex_maxerror_seconds{job="node-exporter"} >= 16 + for: 2m + labels: + severity: warn + annotations: + summary: "Host clock not synchronising on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Clock not synchronising on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostSwapInWarn + expr: (node_vmstat_pswpin{job="node-exporter"}) > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host PageSwapIn value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapIn (moving data from swap space on disk back into physical memory (RAM)) value exceeds 5 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}."
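+ + # node_vmstat_pswpin mirrors the pswpin counter from /proc/vmstat. Swap activity can + # be confirmed on the node itself -- a sketch, where <node-name> is a placeholder and + # kubectl debug mounts the host filesystem under /host: + # kubectl debug node/<node-name> -it --image=busybox -- sh -c 'grep pswp /host/proc/vmstat'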
+ + - alert: HostSwapInCritical + expr: (node_vmstat_pswpin{job="node-exporter"}) > 10 + for: 1m + labels: + severity: critical + annotations: + summary: "Host PageSwapIn value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapIn (data from swap space on disk back into the physical memory (RAM)) value exceeds 10 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostSwapOutWarn + expr: (node_vmstat_pswpout{job="node-exporter"}) > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host PageSwapOut value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapOut (move data from RAM to swap space on disk to free up space in memory) value exceeds 5 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostSwapOutCritical + expr: (node_vmstat_pswpout{job="node-exporter"}) > 10 + for: 1m + labels: + severity: critical + annotations: + summary: "Host PageSwapOut value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapOut (move data from RAM to swap space on disk to free up space in physical memory) value exceeds 10 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostMemoryFillingUpWarn(Rate) + expr: (rate(node_memory_MemAvailable_bytes{job="node-exporter"}[1m]) / node_memory_MemTotal_bytes{job="node-exporter"}) * 100 > 15 + for: 1m + labels: + severity: warn + annotations: + summary: "Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Node memory is filling up (> 15%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostMemoryFillingUpCritical(Rate) + expr: (rate(node_memory_MemAvailable_bytes{job="node-exporter"}[1m]) / node_memory_MemTotal_bytes{job="node-exporter"}) * 100 > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Node memory is filling up (> 30%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostMemoryUnderMemoryPressureWarn(Rate) + expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) > 3 + for: 1m + labels: + severity: warn + annotations: + summary: "Host memory under memory pressure on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "High rate of major page faults on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostMemoryUnderMemoryPressureCritical(Rate) + expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) > 5 + for: 1m + labels: + severity: critical + annotations: + summary: "Host memory under memory pressure on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "High rate of major page faults on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostDiskSpaceFillingUpWarn(Rate) + expr: avg by (instance)(rate(node_filesystem_avail_bytes{job="node-exporter"}[1m]) * 100 / node_filesystem_size_bytes{job="node-exporter"}) > 15 + for: 1m + labels: + severity: warn + annotations: + summary: "Host disk space is filling up on ({{ $labels.instance }}) of cluster {{
$labels.cluster_name }}" + description: "Disk usage is crossing the threshold (> 15%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostDiskSpaceFillingUpCritical(Rate) + expr: avg by (instance)(rate(node_filesystem_avail_bytes{job="node-exporter"}[1m]) * 100 / node_filesystem_size_bytes{job="node-exporter"}) > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host disk space is filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk usage is crossing the threshold (> 30%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostInodesFillingUpWarn(Rate) + expr: (rate(node_filesystem_files_free{job="node-exporter"}[1m])) / node_filesystem_files{job="node-exporter"} * 100 > 20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host inodes filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is running out of available inodes (> 20%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostInodesFillingUpCritical(Rate) + expr: (rate(node_filesystem_files_free{job="node-exporter"}[1m])) / node_filesystem_files{job="node-exporter"} * 100 > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host inodes filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is running out of available inodes (> 30%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostUnusualDiskReadLatencyWarn(Rate) + expr: rate(node_disk_read_time_seconds_total{job="node-exporter"}[1m]) / rate(node_disk_reads_completed_total{job="node-exporter"}[1m]) > 0.05 and rate(node_disk_reads_completed_total{job="node-exporter"}[1m]) > 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read latency is increasing (read operations > 0.05s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostUnusualDiskReadLatencyCritical(Rate) + expr: rate(node_disk_read_time_seconds_total{job="node-exporter"}[1m]) / rate(node_disk_reads_completed_total{job="node-exporter"}[1m]) > 0.1 and rate(node_disk_reads_completed_total{job="node-exporter"}[1m]) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read latency is increasing (read operations > 0.1s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteLatencyWarn(Rate) + expr: rate(node_disk_write_time_seconds_total{job="node-exporter"}[1m]) / rate(node_disk_writes_completed_total{job="node-exporter"}[1m]) > 0.05 and rate(node_disk_writes_completed_total{job="node-exporter"}[1m]) > 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual disk write latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write latency is increasing (write operations > 0.05s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteLatencyCritical(Rate) + expr: rate(node_disk_write_time_seconds_total{job="node-exporter"}[1m]) /
rate(node_disk_writes_completed_total{job="node-exporter"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{job="node-exporter"}[1m]) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual disk write latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write latency is increasing (write operations > 0.1s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateCPUutilizationWarn(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host high CPU utilization on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is > 20% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateCPUutilizationCritical(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host high CPU utilization on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is > 30% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateCPUutilizationWarn(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host high CPU utilization on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is > 20% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateCPUutilizationCritical(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host high CPU utilization on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is > 30% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostCpuStealRateWarn(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 5% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor may be degrading VM performance, or a spot instance may be out of credit.
VALUE = {{ $value }}" + + - alert: HostCpuStealRateCritical(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 8 + for: 1m + labels: + severity: critical + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 8% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor may be degrading VM performance, or a spot instance may be out of credit. VALUE = {{ $value }}" + + - alert: HostCpuStealRateWarn(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 5% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor may be degrading VM performance, or a spot instance may be out of credit. VALUE = {{ $value }}" + + - alert: HostCpuStealRateCritical(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 8 + for: 1m + labels: + severity: critical + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 8% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor may be degrading VM performance, or a spot instance may be out of credit.
VALUE = {{ $value }}" + + - alert: HostContextSwitchingWarn(Rate) + expr: (rate(node_context_switches_total{job="node-exporter"}[1m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle", job="node-exporter"})) > 1000 + for: 1m + labels: + severity: warn + annotations: + summary: "Host context switching on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Context switching is increasing (> 1000 /s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostContextSwitchingCritical(Rate) + expr: (rate(node_context_switches_total{job="node-exporter"}[1m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle", job="node-exporter"})) > 2000 + for: 1m + labels: + severity: critical + annotations: + summary: "Host context switching on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Context switching is increasing (> 2000 /s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostNetworkReceiveErrorsWarn(Rate) + expr: (rate(node_network_receive_errs_total{job="node-exporter"}[30s]) / rate(node_network_receive_packets_total{job="node-exporter"}[30s])) * 100 > 3 + for: 30s + labels: + severity: warn + annotations: + summary: "Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance interface has a receive error ratio of {{ $value }}% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostNetworkReceiveErrorsCritical(Rate) + expr: (rate(node_network_receive_errs_total{job="node-exporter"}[30s]) / rate(node_network_receive_packets_total{job="node-exporter"}[30s])) * 100 > 5 + for: 30s + labels: + severity: critical + annotations: + summary: "Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance interface has a receive error ratio of {{ $value }}% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostNetworkTransmitErrorsWarn(Rate) + expr: (rate(node_network_transmit_errs_total{job="node-exporter"}[30s]) / rate(node_network_transmit_packets_total{job="node-exporter"}[30s])) * 100 > 3 + for: 30s + labels: + severity: warn + annotations: + summary: "Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance has a transmit error ratio of {{ $value }}% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostNetworkTransmitErrorsCritical(Rate) + expr: (rate(node_network_transmit_errs_total{job="node-exporter"}[30s]) / rate(node_network_transmit_packets_total{job="node-exporter"}[30s])) * 100 > 5 + for: 30s + labels: + severity: critical + annotations: + summary: "Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance has a transmit error ratio of {{ $value }}% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostNetworkInterfaceSaturatedWarn(Rate) + expr: ((rate(node_network_receive_bytes_total{job="node-exporter"}[1m]) + rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])) / (node_network_speed_bytes{job="node-exporter"})) * 100 > 80 + for: 1m + labels: + severity: warn + annotations: +
summary: "Host Network Interface Saturated on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "The network interface is getting overloaded (> 80%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostNetworkInterfaceSaturatedCritical(Rate) + expr: ((rate(node_network_receive_bytes_total{job="node-exporter"}[1m]) + rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])) / (node_network_speed_bytes{job="node-exporter"})) * 100 > 90 + for: 1m + labels: + severity: critical + annotations: + summary: "Host Network Interface Saturated on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "The network interface is getting overloaded (> 90%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostSwapInRateWarn + expr: rate(node_vmstat_pswpin{job="node-exporter"}[1m]) > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host PageSwapIn rate is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapIn rate exceeds 5 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current rate is {{ $value }}." + + - alert: HostSwapInRateCritical + expr: rate(node_vmstat_pswpin{job="node-exporter"}[1m]) > 10 + for: 1m + labels: + severity: critical + annotations: + summary: "Host PageSwapIn rate is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapIn rate exceeds 10 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current rate is {{ $value }}." + + - alert: HostSwapOutRateWarn + expr: rate(node_vmstat_pswpout{job="node-exporter"}[1m]) > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host PageSwapOut rate is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapOut rate exceeds 5 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current rate is {{ $value }}." + + - alert: HostSwapOutRateCritical + expr: rate(node_vmstat_pswpout{job="node-exporter"}[1m]) > 10 + for: 1m + labels: + severity: critical + annotations: + summary: "Host PageSwapOut rate is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapOut rate exceeds 10 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current rate is {{ $value }}." + + - alert: HostDiskReadIOPSWarn(Host) + expr: sum by(instance) (rate(node_disk_reads_completed_total{job="node-exporter"}[1m])) > 300 + for: 1m + labels: + severity: warn + annotations: + summary: "High disk read IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk read IOPS rate (> 300) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskReadIOPSCritical(Host) + expr: sum by(instance) (rate(node_disk_reads_completed_total{job="node-exporter"}[1m])) > 500 + for: 1m + labels: + severity: critical + annotations: + summary: "High disk read IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk read IOPS rate (> 500) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}."
+ + - alert: HostDiskReadIOPSWarn(Device) + expr: sum by(instance, device) (rate(node_disk_reads_completed_total{job="node-exporter"}[1m])) > 100 + for: 1m + labels: + severity: warn + annotations: + summary: "High disk read IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk read IOPS rate (> 100) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskReadIOPSCritical(Device) + expr: sum by(instance, device) (rate(node_disk_reads_completed_total{job="node-exporter"}[1m])) > 250 + for: 1m + labels: + severity: critical + annotations: + summary: "High disk read IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk read IOPS rate (> 250) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskWriteIOPSWarn(Host) + expr: sum by(instance) (rate(node_disk_writes_completed_total{job="node-exporter"}[1m])) > 300 + for: 1m + labels: + severity: warn + annotations: + summary: "High disk write IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk write IOPS rate (> 300) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskWriteIOPSCritical(Host) + expr: sum by(instance) (rate(node_disk_writes_completed_total{job="node-exporter"}[1m])) > 500 + for: 1m + labels: + severity: critical + annotations: + summary: "High disk write IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk write IOPS rate (> 500) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskWriteIOPSWarn(Device) + expr: sum by(instance, device) (rate(node_disk_writes_completed_total{job="node-exporter"}[1m])) > 100 + for: 1m + labels: + severity: warn + annotations: + summary: "High disk write IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk write IOPS rate (> 100) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskWriteIOPSCritical(Device) + expr: sum by(instance, device) (rate(node_disk_writes_completed_total{job="node-exporter"}[1m])) > 250 + for: 1m + labels: + severity: critical + annotations: + summary: "High disk write IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk write IOPS rate (> 250) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." 
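During planned bulk loads or migrations the IOPS alerts above are expected to fire, so silencing them for the maintenance window may be preferable to ignoring them. A hedged sketch using `amtool` (the Alertmanager CLI); the Alertmanager URL and instance matcher below are illustrative values, not part of this patch:

```sh
# Silence the device-level write IOPS warning for two hours on a single node.
amtool silence add \
  --alertmanager.url=http://localhost:9093 \
  --duration=2h \
  --comment='planned bulk load' \
  'alertname=HostDiskWriteIOPSWarn(Device)' 'instance=10.0.0.5:9100'
```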
+ + - alert: HostRateUnusualNetworkThroughputInWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual network throughput in rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Network receive throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 20% (or less than -20%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputInCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual network throughput in rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Network receive throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 30% (or less than -30%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputInWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual network throughput in rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Network receive throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 20% (or less than -20%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputInCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual network throughput in rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Network receive throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 30% (or less than -30%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputOutWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by
(instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual network throughput out rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Network transmit throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 20% (or less than -20%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputOutCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual network throughput out rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Network transmit throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 30% (or less than -30%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputOutWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual network throughput out rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Network transmit throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 20% (or less than -20%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputOutCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual network throughput out rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Network transmit throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 30% (or less than -30%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostUnusualDiskReadRateWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 30s + labels: + severity:
warn + annotations: + summary: "Host unusual disk read rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 20% (or less than -20%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostUnusualDiskReadRateCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 30s + labels: + severity: critical + annotations: + summary: "Host unusual disk read rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 30% (or less than -30%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostUnusualDiskReadRateWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 30s + labels: + severity: warn + annotations: + summary: "Host unusual disk read rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 20% (or less than -20%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostUnusualDiskReadRateCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 30s + labels: + severity: critical + annotations: + summary: "Host unusual disk read rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 30% (or less than -30%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteRateWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 30s + labels: + severity: warn + annotations: + summary: "Host unusual disk write rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 20% (or less than -20%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteRateCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) *
100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 30s + labels: + severity: critical + annotations: + summary: "Host unusual disk write rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 30% (or less than -30%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteRateWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 30s + labels: + severity: warn + annotations: + summary: "Host unusual disk write rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 20% (or less than -20%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteRateCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 30s + labels: + severity: critical + annotations: + summary: "Host unusual disk write rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 30% (or less than -30%) from its 1m average. VALUE = {{ $value }}" diff --git a/config/monitoring/prometheus/config/prometheus.yml b/config/monitoring/prometheus/config/prometheus.yml new file mode 100644 index 000000000..c5b45d07a --- /dev/null +++ b/config/monitoring/prometheus/config/prometheus.yml @@ -0,0 +1,88 @@ +global: + scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. + evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
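The scrape and alerting configuration that follows can also be checked offline before it is packaged into the ConfigMap. A minimal sketch, assuming the file is saved locally as `prometheus.yml`; note that `promtool` also tries to load the files listed under `rule_files`, so the check is easiest to run inside the Prometheus container, where `/etc/prometheus/alert-rules.d/` is mounted:

```sh
# Parse the Prometheus configuration and load the referenced rule files.
promtool check config prometheus.yml
```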
+alerting: + alertmanagers: + - kubernetes_sd_configs: + - role: pod + selectors: + - role: pod + label: app.kubernetes.io/component=alertmanager +rule_files: + - "/etc/prometheus/alert-rules.d/aerospike_rules.yml" + - "/etc/prometheus/alert-rules.d/node_exporter_alerts.yml" +scrape_configs: + - job_name: "aerospike-kubernetes-operator" + honor_timestamps: true + scrape_interval: 15s + scrape_timeout: 10s + metrics_path: /metrics + scheme: https + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + insecure_skip_verify: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_control_plane, __meta_kubernetes_service_labelpresent_control_plane] + separator: ; + regex: (controller-manager);true + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: https + replacement: $1 + action: keep + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - default + - aerospike +# - job_name: "kubernetes-cadvisor" +# scheme: https +# metrics_path: /metrics/cadvisor +# kubernetes_sd_configs: +# - role: node +# tls_config: +# ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt +# authorization: +# credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token +# relabel_configs: +# - action: labelmap +# regex: __meta_kubernetes_node_label_(.+) + - job_name: 'event_exporter' # https://github.com/caicloud/event_exporter/blob/master/deploy/README.md + static_configs: + - targets: ['event-exporter:9102'] + - job_name: 'node-exporter' # https://devopscube.com/node-exporter-kubernetes/ + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [ __meta_kubernetes_endpoints_name ] + regex: 'node-exporter' + action: keep + - job_name: 'aerospike' + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - default + - aerospike + relabel_configs: + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: keep + regex: aerospike-cluster + replacement: $1 + separator: ; + source_labels: + - __meta_kubernetes_pod_label_app + - action: keep + regex: exporter + replacement: $1 + separator: ; + source_labels: + - __meta_kubernetes_pod_container_port_name \ No newline at end of file diff --git a/config/monitoring/prometheus/kustomization.yaml b/config/monitoring/prometheus/kustomization.yaml new file mode 100644 index 000000000..b75eeaee4 --- /dev/null +++ b/config/monitoring/prometheus/kustomization.yaml @@ -0,0 +1,28 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +labels: + - includeSelectors: true + pairs: + app.kubernetes.io/name: aerospike-prometheus + app.kubernetes.io/component: prometheus + +resources: + - statefulset.yaml + - pvc.yaml + - service.yaml + - serviceaccount.yaml + - clusterrole.yaml + - clusterrolebinding.yaml + +configMapGenerator: + - name: prometheus-config + files: + - config/prometheus.yml + - name: alert-rules-config + files: + - config/alert-rules/aerospike_rules.yml + - config/alert-rules/node_exporter_alerts.yml + +generatorOptions: + disableNameSuffixHash: true \ No newline at end of file diff --git a/config/monitoring/prometheus/pvc.yaml b/config/monitoring/prometheus/pvc.yaml new file mode 100644 index 000000000..d722303c3 --- /dev/null +++ b/config/monitoring/prometheus/pvc.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: prometheus-data +spec: + 
accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi diff --git a/config/monitoring/prometheus/service.yaml b/config/monitoring/prometheus/service.yaml new file mode 100644 index 000000000..e25d1ac2b --- /dev/null +++ b/config/monitoring/prometheus/service.yaml @@ -0,0 +1,11 @@ +kind: Service +apiVersion: v1 +metadata: + name: prometheus +spec: + ports: + - name: http + port: 9090 + protocol: TCP + targetPort: 9090 + sessionAffinity: ClientIP \ No newline at end of file diff --git a/config/monitoring/prometheus/serviceaccount.yaml b/config/monitoring/prometheus/serviceaccount.yaml new file mode 100644 index 000000000..f671fc5ab --- /dev/null +++ b/config/monitoring/prometheus/serviceaccount.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus \ No newline at end of file diff --git a/config/monitoring/prometheus/statefulset.yaml b/config/monitoring/prometheus/statefulset.yaml new file mode 100644 index 000000000..094f9508d --- /dev/null +++ b/config/monitoring/prometheus/statefulset.yaml @@ -0,0 +1,61 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: prometheus +spec: + replicas: 1 + podManagementPolicy: Parallel + updateStrategy: + type: RollingUpdate + template: + spec: + serviceAccountName: aerospike-monitoring-stack-prometheus + containers: + - name: prometheus-server + image: "prom/prometheus:latest" + imagePullPolicy: "IfNotPresent" + args: + - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.path=/data + - --web.listen-address=:9090 + ports: + - containerPort: 9090 + readinessProbe: + httpGet: + path: /-/ready + port: 9090 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 10 + successThreshold: 1 + failureThreshold: 3 + livenessProbe: + httpGet: + path: /-/healthy + port: 9090 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 10 + successThreshold: 1 + failureThreshold: 3 + volumeMounts: + - name: config-volume + mountPath: /etc/prometheus + - name: prometheus-data + mountPath: /data + - mountPath: /etc/prometheus/alert-rules.d + name: alert-manager-rules + terminationGracePeriodSeconds: 120 + securityContext: + fsGroup: 65534 + volumes: + - name: config-volume + configMap: + name: aerospike-monitoring-stack-prometheus-config + - name: prometheus-data + persistentVolumeClaim: + claimName: aerospike-monitoring-stack-prometheus-data + - name: alert-manager-rules + configMap: + defaultMode: 420 + name: aerospike-monitoring-stack-alert-rules-config \ No newline at end of file diff --git a/helm-charts/aerospike-kubernetes-operator/values.yaml b/helm-charts/aerospike-kubernetes-operator/values.yaml index faea89650..6cd9c0b73 100644 --- a/helm-charts/aerospike-kubernetes-operator/values.yaml +++ b/helm-charts/aerospike-kubernetes-operator/values.yaml @@ -28,7 +28,7 @@ certs: webhookServerCertSecretName: "webhook-server-cert" ## Operator configurations -watchNamespaces: "default" +watchNamespaces: "default,aerospike" # Registry used to pull aerospike-init image aerospikeKubernetesInitRegistry: "docker.io" From fbd9d181c6a06865fc6a97d90c0a33d4b3dfd4df Mon Sep 17 00:00:00 2001 From: Abhisek Dwivedi Date: Tue, 3 Sep 2024 12:29:48 +0530 Subject: [PATCH 2/2] KO-328: Helm charts for AerospikeBackupService, AerospikeBackup and AerospikeRestore CRs (#309) * Added helm charts for backup/restore --- .../aerospike-backup-service/.helmignore | 23 +++++++ .../aerospike-backup-service/Chart.yaml | 17 +++++ .../aerospike-backup-service/README.md | 59 ++++++++++++++++ .../templates/NOTES.txt | 21 
++++++ .../templates/_helpers.tpl | 44 ++++++++++++ .../aerospike-backup-service-cr.yaml | 34 +++++++++ .../templates/serviceaccount.yaml | 13 ++++ .../aerospike-backup-service/values.yaml | 69 +++++++++++++++++++ helm-charts/aerospike-backup/.helmignore | 23 +++++++ helm-charts/aerospike-backup/Chart.yaml | 17 +++++ helm-charts/aerospike-backup/README.md | 56 +++++++++++++++ .../aerospike-backup/templates/NOTES.txt | 19 +++++ .../aerospike-backup/templates/_helpers.tpl | 44 ++++++++++++ .../templates/aerospike-backup-cr.yaml | 23 +++++++ helm-charts/aerospike-backup/values.yaml | 43 ++++++++++++ helm-charts/aerospike-restore/.helmignore | 23 +++++++ helm-charts/aerospike-restore/Chart.yaml | 17 +++++ helm-charts/aerospike-restore/README.md | 55 +++++++++++++++ .../aerospike-restore/templates/NOTES.txt | 19 +++++ .../aerospike-restore/templates/_helpers.tpl | 44 ++++++++++++ .../templates/aerospike-restore-cr.yaml | 24 +++++++ helm-charts/aerospike-restore/values.yaml | 44 ++++++++++++ 22 files changed, 731 insertions(+) create mode 100644 helm-charts/aerospike-backup-service/.helmignore create mode 100644 helm-charts/aerospike-backup-service/Chart.yaml create mode 100644 helm-charts/aerospike-backup-service/README.md create mode 100644 helm-charts/aerospike-backup-service/templates/NOTES.txt create mode 100644 helm-charts/aerospike-backup-service/templates/_helpers.tpl create mode 100644 helm-charts/aerospike-backup-service/templates/aerospike-backup-service-cr.yaml create mode 100644 helm-charts/aerospike-backup-service/templates/serviceaccount.yaml create mode 100644 helm-charts/aerospike-backup-service/values.yaml create mode 100644 helm-charts/aerospike-backup/.helmignore create mode 100644 helm-charts/aerospike-backup/Chart.yaml create mode 100644 helm-charts/aerospike-backup/README.md create mode 100644 helm-charts/aerospike-backup/templates/NOTES.txt create mode 100644 helm-charts/aerospike-backup/templates/_helpers.tpl create mode 100644 helm-charts/aerospike-backup/templates/aerospike-backup-cr.yaml create mode 100644 helm-charts/aerospike-backup/values.yaml create mode 100644 helm-charts/aerospike-restore/.helmignore create mode 100644 helm-charts/aerospike-restore/Chart.yaml create mode 100644 helm-charts/aerospike-restore/README.md create mode 100644 helm-charts/aerospike-restore/templates/NOTES.txt create mode 100644 helm-charts/aerospike-restore/templates/_helpers.tpl create mode 100644 helm-charts/aerospike-restore/templates/aerospike-restore-cr.yaml create mode 100644 helm-charts/aerospike-restore/values.yaml diff --git a/helm-charts/aerospike-backup-service/.helmignore b/helm-charts/aerospike-backup-service/.helmignore new file mode 100644 index 000000000..0e8a0eb36 --- /dev/null +++ b/helm-charts/aerospike-backup-service/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/helm-charts/aerospike-backup-service/Chart.yaml b/helm-charts/aerospike-backup-service/Chart.yaml new file mode 100644 index 000000000..8a774e72f --- /dev/null +++ b/helm-charts/aerospike-backup-service/Chart.yaml @@ -0,0 +1,17 @@ +apiVersion: v2 +type: application +name: aerospike-backup-service + +# version tracks chart changes +version: 3.3.1 +# appVersion tracks operator version +appVersion: 3.3.1 + +description: A Helm chart for Aerospike Backup Service Custom Resource +icon: https://avatars0.githubusercontent.com/u/2214313?s=200&v=4 + +sources: + - https://github.com/aerospike/aerospike-kubernetes-operator +maintainers: + - name: Aerospike + email: developers@aerospike.com diff --git a/helm-charts/aerospike-backup-service/README.md b/helm-charts/aerospike-backup-service/README.md new file mode 100644 index 000000000..81a16e16e --- /dev/null +++ b/helm-charts/aerospike-backup-service/README.md @@ -0,0 +1,59 @@ +# Aerospike Backup Service (Custom Resource) Helm Chart + +A Helm chart for `AerospikeBackupService` custom resource to be used with the Aerospike Kubernetes Operator. + +## Prerequisites + +- Kubernetes 1.19+ +- Aerospike Kubernetes Operator + +## Usage + +### Add Helm Repository + +```sh +helm repo add aerospike https://aerospike.github.io/aerospike-kubernetes-enterprise +helm repo update +``` + +### Deploy Aerospike Backup Service + +#### Install the chart + +The `<namespace>` used to install the Aerospike backup service chart must be included in the `watchNamespaces` value of +aerospike-kubernetes-operator's `values.yaml` + +```sh +# helm install aerospike-backup-service aerospike/aerospike-backup-service --namespace <namespace> +helm install aerospike-backup-service aerospike/aerospike-backup-service +``` + +It is recommended to create a separate YAML file with configurations as per your requirements and use it +with `helm install`. + +```sh +helm install aerospike-backup-service aerospike/aerospike-backup-service \ + -f <customized-values-yaml-file> +``` + +## Configurations + +| Name | Description | Default | +|------------------------------|-------------------------------------------------------------------------------|------------------------------------------------------------------------------| +| `image.repository` | Aerospike backup service container image repository | `aerospike.jfrog.io/ecosystem-container-prod-local/aerospike-backup-service` | +| `image.tag` | Aerospike backup service container image tag | `1.0.0` | +| `customLabels` | Custom labels to add on the AerospikeBackupService resource | `{}` (nil) | +| `serviceAccount.create` | Enable ServiceAccount creation for Aerospike backup service. | true | +| `serviceAccount.annotations` | ServiceAccount annotations | `{}` (nil) | +| `backupServiceConfig` | Aerospike backup service configuration | `{}` (nil) | +| `secrets` | Secrets to be mounted in the Aerospike Backup Service pod, e.g. AWS creds | `[]` (nil) | +| `resources` | Aerospike backup service pod resource requirements | `{}` (nil) | +| `service` | Kubernetes service configuration for Aerospike backup service | `{}` (nil) | + + +### Configurations Explained + +[//]: # (TODO: Update below link when the documentation is available.)
+Refer +to [AerospikeBackupService Custom Resource Spec](https://docs.aerospike.com/cloud/kubernetes/operator/cluster-configuration-settings#spec) +for details on the above [configuration fields](#Configurations) diff --git a/helm-charts/aerospike-backup-service/templates/NOTES.txt b/helm-charts/aerospike-backup-service/templates/NOTES.txt new file mode 100644 index 000000000..8090fc502 --- /dev/null +++ b/helm-charts/aerospike-backup-service/templates/NOTES.txt @@ -0,0 +1,21 @@ +Thank you for installing {{ .Chart.Name }}-{{ .Chart.AppVersion }}. +Release Name - {{ .Release.Name }}. + + + /\ + + .' '. * + * /======\ + + ;:. _ ; + |:. (_) | + |:. _ | + + |:. (_) | * + ;:. ; + .' \:. / `. + / .-'':._.'`-. \ + |/ /||\ \| + +Run the following commands to get more information about deployment: + +$ helm status {{ .Release.Name }} --namespace {{ .Release.Namespace }} +$ helm get all {{ .Release.Name }} --namespace {{ .Release.Namespace }} + +$ kubectl get all --namespace {{ .Release.Namespace }} -l "release={{ .Release.Name }}, chart={{ $.Chart.Name }}" diff --git a/helm-charts/aerospike-backup-service/templates/_helpers.tpl b/helm-charts/aerospike-backup-service/templates/_helpers.tpl new file mode 100644 index 000000000..dc9ee07a1 --- /dev/null +++ b/helm-charts/aerospike-backup-service/templates/_helpers.tpl @@ -0,0 +1,44 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "aerospike-backup-service.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "aerospike-backup-service.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Aerospike Backup Service common name. +*/}} +{{- define "aerospike-backup-service.commonName" -}} +{{- if .Values.commonName -}} +{{- .Values.commonName -}} +{{- else -}} +{{- .Release.Name | trunc 63 | replace "-" "" -}} +{{- end -}} +{{- end -}} + +{{/* +Selector labels +*/}} +{{- define "aerospike-backup-service.selectorLabels" -}} +app.kubernetes.io/name: {{ include "aerospike-backup-service.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "aerospike-backup-service.labels" -}} +helm.sh/chart: {{ include "aerospike-backup-service.chart" . }} +{{ include "aerospike-backup-service.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} \ No newline at end of file diff --git a/helm-charts/aerospike-backup-service/templates/aerospike-backup-service-cr.yaml b/helm-charts/aerospike-backup-service/templates/aerospike-backup-service-cr.yaml new file mode 100644 index 000000000..75b3c2841 --- /dev/null +++ b/helm-charts/aerospike-backup-service/templates/aerospike-backup-service-cr.yaml @@ -0,0 +1,34 @@ +apiVersion: asdb.aerospike.com/v1beta1 +kind: AerospikeBackupService +metadata: + name: {{ template "aerospike-backup-service.commonName" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "aerospike-backup-service.labels" . | nindent 4 }} + {{- with .Values.customLabels }} + {{- toYaml .
| nindent 4 }} + {{- end }} +spec: + # Aerospike Backup Service image + image: {{ .Values.image.repository }}:{{ .Values.image.tag }} + + # Aerospike Backup Service configuration + config: + {{- .Values.backupServiceConfig | toYaml | nindent 4 }} + + # Secrets to be mounted in the Aerospike Backup Service pod like aws creds etc + {{- with .Values.secrets }} + secrets: {{- toYaml . | nindent 4 }} + {{- end }} + + # Resources for the Aerospike Backup Service pod + {{- if .Values.resources }} + resources: + {{- .Values.resources | toYaml | nindent 4 }} + {{- end }} + + # Kubernetes service configuration for the Aerospike Backup Service + {{- if .Values.service }} + service: + {{- .Values.service | toYaml | nindent 4 }} + {{- end }} diff --git a/helm-charts/aerospike-backup-service/templates/serviceaccount.yaml b/helm-charts/aerospike-backup-service/templates/serviceaccount.yaml new file mode 100644 index 000000000..b62d09c3d --- /dev/null +++ b/helm-charts/aerospike-backup-service/templates/serviceaccount.yaml @@ -0,0 +1,13 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: aerospike-backup-service + namespace: {{ .Release.Namespace }} + labels: + {{- include "aerospike-backup-service.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} diff --git a/helm-charts/aerospike-backup-service/values.yaml b/helm-charts/aerospike-backup-service/values.yaml new file mode 100644 index 000000000..5b7b7cf0f --- /dev/null +++ b/helm-charts/aerospike-backup-service/values.yaml @@ -0,0 +1,69 @@ +## Default values for aerospike-backup-service. +## This is a YAML-formatted file. +## Declare variables to be passed into your templates. + +## Aerospike Backup Service common name +## Defaults to release name truncated to 63 characters (with hyphens removed) +# commonName: aerobackupservice + +nameOverride: "" + +## Image is the image for the backup service. +image: + repository: aerospike.jfrog.io/ecosystem-container-prod-local/aerospike-backup-service + tag: "1.0.0" + +## Custom labels that will be applied on the AerospikeBackupService resource +customLabels: {} + +## ServiceAccount to be used for the Aerospike Backup Service pod +serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + +## Config is the free form configuration for the backup service in YAML format. +## This config is used to start the backup service. The config is passed as a file to the backup service. +backupServiceConfig: {} +# service: +# http: +# port: 8080 +# backup-policies: +# test-policy: +# parallel: 3 +# remove-files: KeepAll +# type: 1 +# storage: +# local: +# path: /localStorage +# type: local +# s3Storage: +# type: aws-s3 +# path: "s3://test-bucket" +# s3-region: us-east-1 +# s3-profile: default + +## SecretMounts is the list of secrets to be mounted in the backup service. +secrets: [] +# - secretName: aws-secret +# volumeMount: +# name: aws-secret +# mountPath: /root/.aws/credentials +# subPath: credentials + +## Resources define the requests and limits for the backup service container. +## Resources.Limits should be more than Resources.Requests. +resources: {} +# limits: +# cpu: 100m +# memory: 128Mi +# requests: +# cpu: 100m +# memory: 128Mi + +## Service defines the Kubernetes service configuration for the backup service. +## It is used to expose the backup service deployment.
By default, the service type is ClusterIP. +service: {} +# type: ClusterIP + diff --git a/helm-charts/aerospike-backup/.helmignore b/helm-charts/aerospike-backup/.helmignore new file mode 100644 index 000000000..0e8a0eb36 --- /dev/null +++ b/helm-charts/aerospike-backup/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/helm-charts/aerospike-backup/Chart.yaml b/helm-charts/aerospike-backup/Chart.yaml new file mode 100644 index 000000000..b9f088165 --- /dev/null +++ b/helm-charts/aerospike-backup/Chart.yaml @@ -0,0 +1,17 @@ +apiVersion: v2 +type: application +name: aerospike-backup + +# version tracks chart changes +version: 3.3.1 +# appVersion tracks operator version +appVersion: 3.3.1 + +description: A Helm chart for Aerospike Backup Custom Resource +icon: https://avatars0.githubusercontent.com/u/2214313?s=200&v=4 + +sources: + - https://github.com/aerospike/aerospike-kubernetes-operator +maintainers: + - name: Aerospike + email: developers@aerospike.com diff --git a/helm-charts/aerospike-backup/README.md b/helm-charts/aerospike-backup/README.md new file mode 100644 index 000000000..fbc268aaf --- /dev/null +++ b/helm-charts/aerospike-backup/README.md @@ -0,0 +1,56 @@ +# Aerospike Backup (Custom Resource) Helm Chart + +A Helm chart for `AerospikeBackup` custom resource to be used with the Aerospike Kubernetes Operator. + +## Prerequisites + +- Kubernetes 1.19+ +- Aerospike Kubernetes Operator + +## Usage + +### Add Helm Repository + +```sh +helm repo add aerospike https://aerospike.github.io/aerospike-kubernetes-enterprise +helm repo update +``` + +### Create Aerospike Backup + +#### Install the chart + +The `<namespace>` used to install the Aerospike backup helm chart must be included in the `watchNamespaces` value of +aerospike-kubernetes-operator's `values.yaml` + +```sh +# helm install aerospike-backup aerospike/aerospike-backup --namespace <namespace> +helm install aerospike-backup aerospike/aerospike-backup +``` + +It is recommended to create a separate YAML file with configurations as per your requirements and use it +with `helm install`. + +```sh +helm install aerospike-backup aerospike/aerospike-backup \ + -f <customized-values-yaml-file> +``` + +## Configurations + +| Name | Description | Default | +|----------------------------------|------------------------------------------------------|------------| +| `customLabels` | Custom labels to add on the AerospikeBackup resource | `{}` (nil) | +| `backupService.name` | Aerospike backup service name | | +| `backupService.namespace` | Aerospike backup service namespace | | +| `backupConfig` | Aerospike backup configuration | `{}` (nil) | +| `onDemandBackups[*].id` | Unique identifier for the on-demand backup | | +| `onDemandBackups[*].routineName` | Routine name used to trigger on-demand backup | | +| `onDemandBackups[*].delay` | Delay interval before starting the on-demand backup | | + +### Configurations Explained + +[//]: # (TODO: Update below link when the documentation is available.)
+
+### Configurations Explained
+
+[//]: # (TODO: Update below link when the documentation is available.)
+Refer
+to the [AerospikeBackup Custom Resource Spec](https://docs.aerospike.com/cloud/kubernetes/operator/cluster-configuration-settings#spec)
+for details on the above [configuration fields](#Configurations).
diff --git a/helm-charts/aerospike-backup/templates/NOTES.txt b/helm-charts/aerospike-backup/templates/NOTES.txt
new file mode 100644
index 000000000..373c78e81
--- /dev/null
+++ b/helm-charts/aerospike-backup/templates/NOTES.txt
@@ -0,0 +1,19 @@
+Thank you for installing {{ .Chart.Name }}-{{ .Chart.AppVersion }}.
+Release Name - {{ .Release.Name }}.
+
+     +       /\
+            .'  '.   *
+     *     /======\      +
+           ;:.  _   ;
+           |:. (_)  |
+           |:.  _   |
+     +     |:. (_)  |    *
+           ;:.      ;
+         .' \:.    / `.
+        / .-'':._.'`-.  \
+        |/    /||\    \|
+
+Run the following commands to get more information about deployment:
+
+$ helm status {{ .Release.Name }} --namespace {{ .Release.Namespace }}
+$ helm get all {{ .Release.Name }} --namespace {{ .Release.Namespace }}
\ No newline at end of file
diff --git a/helm-charts/aerospike-backup/templates/_helpers.tpl b/helm-charts/aerospike-backup/templates/_helpers.tpl
new file mode 100644
index 000000000..d6cdda835
--- /dev/null
+++ b/helm-charts/aerospike-backup/templates/_helpers.tpl
@@ -0,0 +1,44 @@
+{{/*
+Expand the name of the chart.
+*/}}
+{{- define "aerospike-backup.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "aerospike-backup.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Aerospike Backup common name.
+*/}}
+{{- define "aerospike-backup.commonName" -}}
+{{- if .Values.commonName -}}
+{{- .Values.commonName -}}
+{{- else -}}
+{{- .Release.Name | trunc 63 | replace "-" "" -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+Selector labels
+*/}}
+{{- define "aerospike-backup.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "aerospike-backup.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end }}
+
+{{/*
+Common labels
+*/}}
+{{- define "aerospike-backup.labels" -}}
+helm.sh/chart: {{ include "aerospike-backup.chart" . }}
+{{ include "aerospike-backup.selectorLabels" . }}
+{{- if .Chart.AppVersion }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
\ No newline at end of file
diff --git a/helm-charts/aerospike-backup/templates/aerospike-backup-cr.yaml b/helm-charts/aerospike-backup/templates/aerospike-backup-cr.yaml
new file mode 100644
index 000000000..2290397c2
--- /dev/null
+++ b/helm-charts/aerospike-backup/templates/aerospike-backup-cr.yaml
@@ -0,0 +1,23 @@
+apiVersion: asdb.aerospike.com/v1beta1
+kind: AerospikeBackup
+metadata:
+  name: {{ template "aerospike-backup.commonName" . }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "aerospike-backup.labels" . | nindent 4 }}
+  {{- with .Values.customLabels }}
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  # Aerospike Backup Service reference
+  backupService:
+    {{- .Values.backupService | toYaml | nindent 4 }}
+
+  # Aerospike Backup configuration
+  config:
+    {{- .Values.backupConfig | toYaml | nindent 4 }}
+
+  # On-demand backups configuration
+  {{- with .Values.onDemandBackups }}
+  onDemandBackups: {{- toYaml . | nindent 4 }}
+  {{- end }}
\ No newline at end of file
diff --git a/helm-charts/aerospike-backup/values.yaml b/helm-charts/aerospike-backup/values.yaml
new file mode 100644
index 000000000..064aa8196
--- /dev/null
+++ b/helm-charts/aerospike-backup/values.yaml
@@ -0,0 +1,43 @@
+# Default values for aerospike-backup.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+nameOverride: ""
+
+## Custom labels that will be applied on the AerospikeBackup resource
+customLabels: {}
+
+## BackupService is the backup service reference, i.e., name and namespace.
+## It is used to communicate with the backup service to trigger backups. This field is immutable.
+backupService: {}
+# name: aerospikebackupservice
+# namespace: aerospike
+
+## Config is the free-form configuration for the backup in YAML format.
+## This config is used to trigger backups. It includes: aerospike-cluster, backup-routines
+backupConfig: {}
+# aerospike-cluster:
+#   aerospike-aerospikebackup-test-cluster: # Name format: <backup-namespace>-<backup-name>-<cluster-name>
+#     credentials:
+#       password: admin123
+#       user: admin
+#     seed-nodes:
+#       - host-name: aerocluster.aerospike.svc.cluster.local
+#         port: 3000
+# backup-routines:
+#   aerospike-aerospikebackup-test-routine: # Name format: <backup-namespace>-<backup-name>-<routine-name>
+#     backup-policy: test-policy
+#     interval-cron: "@daily"
+#     incr-interval-cron: "@hourly"
+#     namespaces: ["test"]
+#     source-cluster: aerospike-aerospikebackup-test-cluster
+#     storage: local
+
+
+## OnDemandBackups is the configuration for on-demand backups.
+onDemandBackups: []
+# - id: on-demand-backup-1
+#   routineName: aerospike-aerospikebackup-test-routine
+#   delay: 10ms
+
+
diff --git a/helm-charts/aerospike-restore/.helmignore b/helm-charts/aerospike-restore/.helmignore
new file mode 100644
index 000000000..0e8a0eb36
--- /dev/null
+++ b/helm-charts/aerospike-restore/.helmignore
@@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/helm-charts/aerospike-restore/Chart.yaml b/helm-charts/aerospike-restore/Chart.yaml
new file mode 100644
index 000000000..ef766f049
--- /dev/null
+++ b/helm-charts/aerospike-restore/Chart.yaml
@@ -0,0 +1,17 @@
+apiVersion: v2
+type: application
+name: aerospike-restore
+
+# version tracks chart changes
+version: 3.3.1
+# appVersion tracks operator version
+appVersion: 3.3.1
+
+description: A Helm chart for Aerospike Restore Custom Resource
+icon: https://avatars0.githubusercontent.com/u/2214313?s=200&v=4
+
+sources:
+  - https://github.com/aerospike/aerospike-kubernetes-operator
+maintainers:
+  - name: Aerospike
+    email: developers@aerospike.com
diff --git a/helm-charts/aerospike-restore/README.md b/helm-charts/aerospike-restore/README.md
new file mode 100644
index 000000000..06966f8cb
--- /dev/null
+++ b/helm-charts/aerospike-restore/README.md
@@ -0,0 +1,55 @@
+# Aerospike Restore (Custom Resource) Helm Chart
+
+A Helm chart for the `AerospikeRestore` custom resource, to be used with the Aerospike Kubernetes Operator.
+
+## Prerequisites
+
+- Kubernetes 1.19+
+- Aerospike Kubernetes Operator
+
+## Usage
+
+### Add Helm Repository
+
+```sh
+helm repo add aerospike https://aerospike.github.io/aerospike-kubernetes-enterprise
+helm repo update
+```
+
+### Create Aerospike Restore
+
+#### Install the chart
+
+The namespace used to install the Aerospike restore helm chart must be included in the `watchNamespaces` value of
+the aerospike-kubernetes-operator's `values.yaml`.
+
+```sh
+# helm install <release-name> <chart> --namespace <namespace>
+helm install aerospike-restore aerospike/aerospike-restore
+```
+
+It is recommended to create a separate YAML file with your configuration overrides and pass it to
+`helm install`.
+
+```sh
+helm install aerospike-restore aerospike/aerospike-restore \
+  -f <custom-values-file>
+```
+
+## Configurations
+
+| Name                       | Description                                               | Default    |
+|----------------------------|-----------------------------------------------------------|------------|
+| `customLabels`             | Custom labels to add on the AerospikeRestore resource     | `{}` (nil) |
+| `backupService.name`       | Aerospike backup service name                             |            |
+| `backupService.namespace`  | Aerospike backup service namespace                        |            |
+| `type`                     | Type of restore. One of Full, Incremental, or Timestamp.  | `Full`     |
+| `restoreConfig`            | Aerospike restore configuration                           | `{}` (nil) |
+| `pollingPeriod`            | Polling period for restore operation status               | `60s`      |
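+
+For illustration, a custom values file for a full restore from local storage might look like the
+following sketch. The cluster name, credentials, and backup path mirror the commented samples in
+this chart's `values.yaml` and are placeholders, not defaults; the file name `restore-values.yaml`
+is likewise hypothetical.
+
+```yaml
+# restore-values.yaml (illustrative example)
+backupService:
+  name: aerospikebackupservice
+  namespace: aerospike
+
+type: Full
+
+restoreConfig:
+  destination:
+    label: destinationCluster
+    credentials:
+      user: admin
+      password: admin123
+    seed-nodes:
+      - host-name: aerocluster.test.svc.cluster.local
+        port: 3000
+  policy:
+    parallel: 3
+    no-generation: true
+    no-indexes: true
+  source:
+    path: "/localStorage/aerospike-aerospikebackup-test-routine/backup/1722326391329/data/test"
+    type: local
+
+pollingPeriod: 60s
+```
+
+```sh
+helm install aerospike-restore aerospike/aerospike-restore -f restore-values.yaml
+```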
+
+### Configurations Explained
+
+[//]: # (TODO: Update below link when the documentation is available.)
+Refer
+to the [AerospikeRestore Custom Resource Spec](https://docs.aerospike.com/cloud/kubernetes/operator/cluster-configuration-settings#spec)
+for details on the above [configuration fields](#Configurations).
diff --git a/helm-charts/aerospike-restore/templates/NOTES.txt b/helm-charts/aerospike-restore/templates/NOTES.txt
new file mode 100644
index 000000000..373c78e81
--- /dev/null
+++ b/helm-charts/aerospike-restore/templates/NOTES.txt
@@ -0,0 +1,19 @@
+Thank you for installing {{ .Chart.Name }}-{{ .Chart.AppVersion }}.
+Release Name - {{ .Release.Name }}.
+
+     +       /\
+            .'  '.   *
+     *     /======\      +
+           ;:.  _   ;
+           |:. (_)  |
+           |:.  _   |
+     +     |:. (_)  |    *
+           ;:.      ;
+         .' \:.    / `.
+        / .-'':._.'`-.  \
+        |/    /||\    \|
+
+Run the following commands to get more information about deployment:
+
+$ helm status {{ .Release.Name }} --namespace {{ .Release.Namespace }}
+$ helm get all {{ .Release.Name }} --namespace {{ .Release.Namespace }}
\ No newline at end of file
diff --git a/helm-charts/aerospike-restore/templates/_helpers.tpl b/helm-charts/aerospike-restore/templates/_helpers.tpl
new file mode 100644
index 000000000..473f4f7fc
--- /dev/null
+++ b/helm-charts/aerospike-restore/templates/_helpers.tpl
@@ -0,0 +1,44 @@
+{{/*
+Expand the name of the chart.
+*/}}
+{{- define "aerospike-restore.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "aerospike-restore.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Aerospike Restore common name.
+*/}}
+{{- define "aerospike-restore.commonName" -}}
+{{- if .Values.commonName -}}
+{{- .Values.commonName -}}
+{{- else -}}
+{{- .Release.Name | trunc 63 | replace "-" "" -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+Selector labels
+*/}}
+{{- define "aerospike-restore.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "aerospike-restore.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end }}
+
+{{/*
+Common labels
+*/}}
+{{- define "aerospike-restore.labels" -}}
+helm.sh/chart: {{ include "aerospike-restore.chart" . }}
+{{ include "aerospike-restore.selectorLabels" . }}
+{{- if .Chart.AppVersion }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
\ No newline at end of file
diff --git a/helm-charts/aerospike-restore/templates/aerospike-restore-cr.yaml b/helm-charts/aerospike-restore/templates/aerospike-restore-cr.yaml
new file mode 100644
index 000000000..3b7d3c8f1
--- /dev/null
+++ b/helm-charts/aerospike-restore/templates/aerospike-restore-cr.yaml
@@ -0,0 +1,24 @@
+apiVersion: asdb.aerospike.com/v1beta1
+kind: AerospikeRestore
+metadata:
+  name: {{ template "aerospike-restore.commonName" . }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "aerospike-restore.labels" . | nindent 4 }}
+  {{- with .Values.customLabels }}
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  # Aerospike Backup Service reference
+  backupService:
+    {{- .Values.backupService | toYaml | nindent 4 }}
+
+  # Aerospike Restore type
+  type: {{ .Values.type }}
+
+  # Aerospike Restore configuration
+  config:
+    {{- .Values.restoreConfig | toYaml | nindent 4 }}
+
+  # Polling period for restore operation status
+  pollingPeriod: {{ .Values.pollingPeriod }}
diff --git a/helm-charts/aerospike-restore/values.yaml b/helm-charts/aerospike-restore/values.yaml
new file mode 100644
index 000000000..831737553
--- /dev/null
+++ b/helm-charts/aerospike-restore/values.yaml
@@ -0,0 +1,44 @@
+# Default values for aerospike-restore.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+nameOverride: ""
+
+## Custom labels that will be applied on the AerospikeRestore resource
+customLabels: {}
+
+## BackupService is the backup service reference, i.e., name and namespace.
+## It is used to communicate with the backup service to trigger restores. This field is immutable.
+backupService: {}
+# name: aerospikebackupservice
+# namespace: aerospike
+
+## Type is the type of restore. It can be Full, Incremental, or Timestamp.
+## Based on the restore type, the relevant restore config is given.
+type: Full
+
+## Config is the free-form configuration for the restore in YAML format.
+## This config is used to trigger restores. It includes: destination, policy, source, secret-agent, time, and routine.
+restoreConfig: {}
+# destination:
+#   label: destinationCluster
+#   credentials:
+#     password: admin123
+#     user: admin
+#   seed-nodes:
+#     - host-name: aerocluster.test.svc.cluster.local
+#       port: 3000
+# policy:
+#   parallel: 3
+#   no-generation: true
+#   no-indexes: true
+# source:
+#   path: "/localStorage/aerospike-aerospikebackup-test-routine/backup/1722326391329/data/test"
+#   type: local
+
+
+## Polling period for restore operation status.
+## It is used to poll the restore service to fetch restore operation status. Default is 60 seconds.
+pollingPeriod: 60s
+