diff --git a/config/monitoring/alertmanager/config/alertmanager.yml b/config/monitoring/alertmanager/config/alertmanager.yml new file mode 100644 index 000000000..4908f2377 --- /dev/null +++ b/config/monitoring/alertmanager/config/alertmanager.yml @@ -0,0 +1,12 @@ +# This is an example alertmanager.yml which sends alert notifications to a slack channel. + +global: + slack_api_url: "https://hooks.slack.com/services/TXXXXXXXX/XXXXXXXXXXX/XXXXXXXXXXXXXXXXXXXXXXXX" +route: + group_by: ['cluster', 'service'] + receiver: slack_user + +receivers: + - name: slack_user + slack_configs: + - text: "summary: {{ .CommonAnnotations.summary }}\ndescription: {{ .CommonAnnotations.description }}" \ No newline at end of file diff --git a/config/monitoring/alertmanager/kustomization.yaml b/config/monitoring/alertmanager/kustomization.yaml new file mode 100644 index 000000000..bba6090ed --- /dev/null +++ b/config/monitoring/alertmanager/kustomization.yaml @@ -0,0 +1,21 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +labels: + - includeSelectors: true + pairs: + app.kubernetes.io/name: aerospike-alertmanager + app.kubernetes.io/component: alertmanager + +resources: + - statefulset.yaml + - pvc.yaml + - service.yaml + +configMapGenerator: + - name: alertmanager-config + files: + - config/alertmanager.yml + +generatorOptions: + disableNameSuffixHash: true \ No newline at end of file diff --git a/config/monitoring/alertmanager/pvc.yaml b/config/monitoring/alertmanager/pvc.yaml new file mode 100644 index 000000000..c577bf954 --- /dev/null +++ b/config/monitoring/alertmanager/pvc.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: alertmanager-data +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi \ No newline at end of file diff --git a/config/monitoring/alertmanager/service.yaml b/config/monitoring/alertmanager/service.yaml new file mode 100644 index 000000000..a2958c64a --- /dev/null +++ b/config/monitoring/alertmanager/service.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Service +metadata: + name: alertmanager +spec: + ports: + - name: http + port: 9093 \ No newline at end of file diff --git a/config/monitoring/alertmanager/statefulset.yaml b/config/monitoring/alertmanager/statefulset.yaml new file mode 100644 index 000000000..3dc30efac --- /dev/null +++ b/config/monitoring/alertmanager/statefulset.yaml @@ -0,0 +1,43 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: alertmanager +spec: + template: + spec: + containers: + - name: alertmanager + image: prom/alertmanager:latest + args: + - --config.file=/etc/alertmanager/alertmanager.yml + - --storage.path=/alertmanager + - --log.level=info + - --cluster.advertise-address=0.0.0.0:9093 + livenessProbe: + httpGet: + path: /-/healthy + port: 9093 + initialDelaySeconds: 25 + periodSeconds: 20 + ports: + - containerPort: 9093 + readinessProbe: + httpGet: + path: /-/ready + port: 9093 + volumeMounts: + - mountPath: /etc/alertmanager + name: alertmanager-conf + - mountPath: /alertmanager + name: alertmanager-data + securityContext: + fsGroup: 26 + serviceAccountName: aerospike-monitoring-stack-prometheus + volumes: + - name: alertmanager-data + persistentVolumeClaim: + claimName: aerospike-monitoring-stack-alertmanager-data + - name: alertmanager-conf + configMap: + defaultMode: 420 + name: aerospike-monitoring-stack-alertmanager-config \ No newline at end of file diff --git a/config/monitoring/grafana/config/aerospike_grafana_dashboards_config.yaml 
b/config/monitoring/grafana/config/aerospike_grafana_dashboards_config.yaml new file mode 100644 index 000000000..c7a75fe6d --- /dev/null +++ b/config/monitoring/grafana/config/aerospike_grafana_dashboards_config.yaml @@ -0,0 +1,11 @@ +apiVersion: 1 +providers: + - name: 'default' + folder: 'Aerospike' + folderUid: 'aerospike1' + type: file + disableDeletion: false + editable: true + updateIntervalSeconds: 10 + options: + path: /var/lib/grafana/dashboards \ No newline at end of file
diff --git a/config/monitoring/grafana/config/aerospike_grafana_datasource.yaml b/config/monitoring/grafana/config/aerospike_grafana_datasource.yaml new file mode 100644 index 000000000..968b2a660 --- /dev/null +++ b/config/monitoring/grafana/config/aerospike_grafana_datasource.yaml @@ -0,0 +1,8 @@ +apiVersion: 1 +datasources: + - name: "Aerospike Prometheus" + type: prometheus + access: proxy + url: http://aerospike-monitoring-stack-prometheus:9090 + editable: true + isDefault: false \ No newline at end of file
diff --git a/config/monitoring/grafana/config/download_files.sh b/config/monitoring/grafana/config/download_files.sh new file mode 100644 index 000000000..eaced64e7 --- /dev/null +++ b/config/monitoring/grafana/config/download_files.sh @@ -0,0 +1,40 @@ +#!/bin/sh + +# Check if curl and jq are installed; if not, install curl and jq +if ! command -v curl >/dev/null 2>&1 || ! command -v jq >/dev/null 2>&1; then + echo "curl or jq not found. Installing..." + apk add --no-cache curl jq +else + echo "curl and jq are already installed." +fi + +# Define the dashboards to download in the format ID:REVISION or ID +DASHBOARDS="16119:10 16115:7 20279" + +# Directory where the dashboards will be saved +TARGET_DIR="/mnt/data" +mkdir -p "$TARGET_DIR" + +DELIMITER=':' + +# Loop through each dashboard identifier in DASHBOARDS +for DASHBOARD in $DASHBOARDS; do + if echo "$DASHBOARD" | grep -q "$DELIMITER"; then + # If the delimiter ':' exists, split into ID and REVISION + ID=$(echo "$DASHBOARD" | cut -d"$DELIMITER" -f1) + REVISION=$(echo "$DASHBOARD" | cut -d"$DELIMITER" -f2) + FILENAME="$ID-rev$REVISION.json" + URL="https://grafana.com/api/dashboards/$ID/revisions/$REVISION/download" + curl -o "$TARGET_DIR/$FILENAME" "$URL" + else + # No delimiter, only the ID is provided + ID="$DASHBOARD" + FILENAME="$ID.json" + URL="https://grafana.com/api/dashboards/$ID" + curl -s "$URL" | jq '.json' > "$TARGET_DIR/$FILENAME" + fi +done + +# List the downloaded files +echo "Downloaded dashboard files:" +ls -l "$TARGET_DIR" \ No newline at end of file
diff --git a/config/monitoring/grafana/config/grafana.ini b/config/monitoring/grafana/config/grafana.ini new file mode 100644 index 000000000..bc0daea62 --- /dev/null +++ b/config/monitoring/grafana/config/grafana.ini @@ -0,0 +1,13 @@ +[analytics] + check_for_updates = true + [grafana_net] + url = https://grafana.net + [log] + mode = console + level = debug + [paths] + data = /var/lib/grafana/data + logs = /var/log/grafana + plugins = /var/lib/grafana/plugins + [server] + http_port = 3000 \ No newline at end of file
diff --git a/config/monitoring/grafana/kustomization.yaml b/config/monitoring/grafana/kustomization.yaml new file mode 100644 index 000000000..cfd14ad2a --- /dev/null +++ b/config/monitoring/grafana/kustomization.yaml @@ -0,0 +1,30 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +labels: + - includeSelectors: true + pairs: + app.kubernetes.io/name: aerospike-grafana + app.kubernetes.io/component: grafana + +resources: + - statefulset.yaml + - pvc.yaml + - 
service.yaml + +configMapGenerator: + - name: grafana-provisioning-datasources + files: + - config/aerospike_grafana_datasource.yaml + - name: grafana-config + files: + - config/grafana.ini + - name: grafana-dashboard-config + files: + - config/aerospike_grafana_dashboards_config.yaml + - name: download-script + files: + - config/download_files.sh + +generatorOptions: + disableNameSuffixHash: true diff --git a/config/monitoring/grafana/pvc.yaml b/config/monitoring/grafana/pvc.yaml new file mode 100644 index 000000000..3bc9acc12 --- /dev/null +++ b/config/monitoring/grafana/pvc.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: grafana-data +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi \ No newline at end of file diff --git a/config/monitoring/grafana/service.yaml b/config/monitoring/grafana/service.yaml new file mode 100644 index 000000000..8068ffb02 --- /dev/null +++ b/config/monitoring/grafana/service.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: Service +metadata: + name: grafana +spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 3000 \ No newline at end of file diff --git a/config/monitoring/grafana/statefulset.yaml b/config/monitoring/grafana/statefulset.yaml new file mode 100644 index 000000000..26914c6eb --- /dev/null +++ b/config/monitoring/grafana/statefulset.yaml @@ -0,0 +1,88 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: grafana +spec: + replicas: 1 + template: + spec: + serviceAccountName: aerospike-monitoring-stack-prometheus + terminationGracePeriodSeconds: 120 + initContainers: + - name: download-dashboards + image: alpine:latest + command: ["/bin/sh"] + args: [ "-c", "/bin/sh -x /mnt/scripts/download_files.sh" ] + volumeMounts: + - name: dashboards + mountPath: /mnt/data + - name: script-volume + mountPath: /mnt/scripts + containers: + - name: grafana + image: "grafana/grafana:latest" + imagePullPolicy: "IfNotPresent" + volumeMounts: + - name: grafana-config + mountPath: "/etc/grafana/" + - name: grafana-provisioning-datasources + mountPath: "/etc/grafana/provisioning/datasources" + - name: grafana-dashboard-config + mountPath: "/etc/grafana/provisioning/dashboards" + - name: grafana-data + mountPath: "/data" + - name: dashboards + mountPath: "/var/lib/grafana/dashboards" + ports: + - name: service + containerPort: 80 + protocol: TCP + - name: grafana + containerPort: 3000 + protocol: TCP + livenessProbe: + httpGet: + path: /api/health + port: 3000 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 10 + successThreshold: 1 + failureThreshold: 10 + readinessProbe: + httpGet: + path: /api/health + port: 3000 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 10 + successThreshold: 1 + failureThreshold: 10 + env: + - name: GF_SECURITY_ADMIN_USER + value: "admin" + - name: GF_SECURITY_ADMIN_PASSWORD + value: "admin" + - name: GF_PATHS_DATA + value: /data/grafana/data + securityContext: + fsGroup: 472 + volumes: + - name: grafana-config + configMap: + name: aerospike-monitoring-stack-grafana-config + - name: grafana-provisioning-datasources + configMap: + name: aerospike-monitoring-stack-grafana-provisioning-datasources + - name: grafana-dashboard-config + configMap: + defaultMode: 420 + name: aerospike-monitoring-stack-grafana-dashboard-config + - name: script-volume + configMap: + name: aerospike-monitoring-stack-download-script + - name: grafana-data + persistentVolumeClaim: + claimName: aerospike-monitoring-stack-grafana-data + - name: dashboards 
+ emptyDir: {} \ No newline at end of file
diff --git a/config/monitoring/kustomization.yaml b/config/monitoring/kustomization.yaml new file mode 100644 index 000000000..3499301bb --- /dev/null +++ b/config/monitoring/kustomization.yaml @@ -0,0 +1,16 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: monitoring + +labels: + - includeSelectors: false + pairs: + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/part-of: aerospike-monitoring-stack + +namePrefix: aerospike-monitoring-stack- + +resources: + - grafana + - prometheus + - alertmanager \ No newline at end of file
diff --git a/config/monitoring/prometheus/clusterrole.yaml b/config/monitoring/prometheus/clusterrole.yaml new file mode 100644 index 000000000..6674295f1 --- /dev/null +++ b/config/monitoring/prometheus/clusterrole.yaml @@ -0,0 +1,23 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: + - apiGroups: + - "" + resources: + - namespaces + - pods + - statefulsets + - configmaps + - secrets + - services + - nodes + - nodes/metrics + - endpoints + verbs: + - list + - watch + - get + - nonResourceURLs: ["/metrics"] + verbs: ["get"] \ No newline at end of file
diff --git a/config/monitoring/prometheus/clusterrolebinding.yaml b/config/monitoring/prometheus/clusterrolebinding.yaml new file mode 100644 index 000000000..2ff72f96b --- /dev/null +++ b/config/monitoring/prometheus/clusterrolebinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: aerospike-monitoring-stack-prometheus +subjects: + - kind: ServiceAccount + name: aerospike-monitoring-stack-prometheus + namespace: monitoring \ No newline at end of file
diff --git a/config/monitoring/prometheus/config/alert-rules/aerospike_rules.yml b/config/monitoring/prometheus/config/alert-rules/aerospike_rules.yml new file mode 100644 index 000000000..b0e2d7107 --- /dev/null +++ b/config/monitoring/prometheus/config/alert-rules/aerospike_rules.yml @@ -0,0 +1,566 @@ +groups: + - name: aerospike.rules + rules: + - alert: AerospikeExporterAgentDown + expr: up{job="aerospike"} == 0 + for: 30s + labels: + severity: warn + annotations: + summary: "Aerospike Prometheus exporter job {{ $labels.instance }} down" + description: "{{ $labels.instance }} has been down for more than 30s." + + - alert: AerospikeNodeDown + expr: aerospike_node_up{job="aerospike"} == 0 + for: 30s + labels: + severity: warn + annotations: + summary: "Node {{ $labels.instance }} down" + description: "{{ $labels.instance }} node is down." + + - name: aerospike_aerospike.rules > NAMESPACE + rules: + - alert: NamespaceStopWrites + expr: aerospike_namespace_stop_writes{job="aerospike" } == 1 + for: 30s + labels: + severity: critical + annotations: + summary: "Stop writes for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Used disk space for namespace {{ $labels.ns }} in node {{ $labels.instance }} is above stop writes limit."
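+ # An Aerospike namespace always has 4096 partitions, so master_objects/4096 is the per-partition object count and dividing again by partition-tree-sprigs estimates objects per sprig; the mounts_size_limit matchers below separate the All Flash case (threshold 50) from the default case (threshold 5000).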
+ + - alert: AerospikeAllFlashAverageObjectsPerSprig + expr: ( ((aerospike_namespace_master_objects { job="aerospike" }/4096)/aerospike_namespace_partition_tree_sprigs{ job="aerospike" } ) and ignoring (index, sindex) ((aerospike_namespace_index_type_mounts_size_limit { job="aerospike" }) or (aerospike_namespace_sindex_type_mounts_size_limit { job="aerospike" }) ))> 50 + for: 30s + labels: + severity: warn + annotations: + summary: "Average Objects per sprig in {{ $labels.instance }}/{{ $labels.ns }}" + description: "The average objects per sprig threshold has been breached for namespace {{ $labels.ns }} in node {{ $labels.instance }}."
+ + - alert: AerospikeAverageObjectsPerSprig + expr: ( ((aerospike_namespace_master_objects { job="aerospike" }/4096)/aerospike_namespace_partition_tree_sprigs{ job="aerospike" } ) unless ignoring (index, sindex) ((aerospike_namespace_index_type_mounts_size_limit { job="aerospike" }) or (aerospike_namespace_sindex_type_mounts_size_limit { job="aerospike" }) ))> 5000 + for: 30s + labels: + severity: warn + annotations: + summary: "Average Objects per sprig in {{ $labels.instance }}/{{ $labels.ns }}" + description: "The average objects per sprig threshold has been breached for namespace {{ $labels.ns }} in node {{ $labels.instance }}."
+ + - alert: AerospikeIndexStageSizeWarn + # Check here: https://docs.aerospike.com/reference/configuration#index-stage-size + # <128mb or >4gb -- send warn alert + expr: (aerospike_namespace_index_stage_size{job="aerospike" }>4000000000) + for: 1m + labels: + severity: warn + annotations: + summary: "Index stage size is not configured according to the documentation in {{ $labels.instance }}/{{ $labels.ns }}" + description: "Index stage size is not configured according to the documentation for namespace {{ $labels.ns }} in node {{ $labels.instance }}."
+ + - alert: AerospikeSIndexStageSizeWarn + # Check here: https://docs.aerospike.com/reference/configuration#sindex-stage-size + # <128mb or >4gb -- send warn alert + expr: (aerospike_namespace_sindex_stage_size{job="aerospike" }>4000000000) + for: 1m + labels: + severity: warn + annotations: + summary: "SIndex stage size is not configured according to the documentation in {{ $labels.instance }}/{{ $labels.ns }}" + description: "SIndex stage size is not configured according to the documentation for namespace {{ $labels.ns }} in node {{ $labels.instance }}."
+ + - alert: AerospikeIndexPressureDirtyMemoryWarn + # Check here: https://docs.aerospike.com/reference/info#index-pressure
+ expr: (((aerospike_namespace_index_pressure_dirty_memory{ job="aerospike" })/(aerospike_namespace_index_pressure_total_memory{ job="aerospike" })*100)>10000000) + for: 1m + labels: + severity: warn + annotations: + summary: "Dirty memory ratio against the total memory is above configured limit in node {{ $labels.instance }}" + description: "Dirty memory ratio against the total memory is above the configured limit in node {{ $labels.instance }}"
+ + - alert: NamespaceDiskCloseToStopWrites + expr: (aerospike_namespace_device_available_pct{job="aerospike" } - aerospike_namespace_storage_engine_min_avail_pct{job="aerospike" }) <= 10 + for: 30s + labels: + severity: warn + annotations: + summary: "Close to stop writes for {{ $labels.instance }}/{{ $labels.ns }} due to device_available_pct" + description: "device_available_pct for namespace {{ $labels.ns }} in node {{ $labels.instance }} is close to min-avail-pct (stop writes) limit."
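+ # Like the device rule above, the CloseToStopWrites rules that follow fire when the headroom between the current value and its stop-writes threshold shrinks to 10 percentage points or less.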
+ + - alert: NamespaceMemoryCloseToStopWrites + expr: (aerospike_namespace_stop_writes_pct{job="aerospike" } - (100 - aerospike_namespace_memory_free_pct{job="aerospike" })) <= 10 + for: 30s + labels: + severity: warn + annotations: + summary: "Close to stop writes for {{ $labels.instance }}/{{ $labels.ns }} due to memory " + description: "Free memory for namespace {{ $labels.ns }} in node {{ $labels.instance }} is close to stop writes limit." + + - alert: NamespacePmemCloseToStopWrites + expr: (aerospike_namespace_pmem_available_pct{job="aerospike" } - aerospike_namespace_storage_engine_min_avail_pct{job="aerospike" }) <= 10 + for: 30s + labels: + severity: warn + annotations: + summary: "Close to stop writes for {{ $labels.instance }}/{{ $labels.ns }} due to pmem_available_pct" + description: "pmem_available_pct for namespace {{ $labels.ns }} in node {{ $labels.instance }} is close to min-avail-pct (stop writes) limit." + + - alert: NamespaceFreeMemoryCloseToStopWrites + expr: (aerospike_namespace_stop_writes_sys_memory_pct{job="aerospike" } - scalar(100 - (aerospike_node_stats_system_free_mem_pct{job="aerospike" }))) <= 10 + for: 30s + labels: + severity: critical + annotations: + summary: "Close to stop writes for {{ $labels.instance }}/{{ $labels.ns }} due to memory" + description: "Free memory for namespace {{ $labels.ns }} in node {{ $labels.instance }} is close to stop writes limit." + + - alert: ActiveProxies + expr: (increase(aerospike_namespace_client_proxy_complete{job="aerospike" }[2m]) + increase(aerospike_namespace_client_proxy_timeout{job="aerospike" }[2m]) + increase(aerospike_namespace_client_proxy_error{job="aerospike" }[2m]) + increase(aerospike_namespace_batch_sub_proxy_complete{job="aerospike" }[2m]) + increase(aerospike_namespace_batch_sub_proxy_timeout{job="aerospike" }[2m]) + increase(aerospike_namespace_batch_sub_proxy_error{job="aerospike" }[2m])) > 0 + for: 30s + labels: + severity: warn + annotations: + summary: "Node is doing proxy. Proxies can happen during cluster change / migrations or if there are any network issues." + description: "Active proxies detected for {{ $labels.ns }} on node {{ $labels.instance }}" + + - alert: NamespaceSupervisorFallingBehind + expr: aerospike_namespace_objects{job="aerospike"}>0 and aerospike_namespace_nsup_cycle_deleted_pct{job="aerospike" } > 1 # (Aerospike 6.3 and later) + for: 30s + labels: + severity: critical + annotations: + summary: "NSUP is falling behind and/or display the length of time the most recent NSUP cycle lasted" + description: "There seems some lag falling behind and/or display the length of time the most recent NSUP cycle lasted {{ $labels.ns }} in node {{ $labels.instance }}" + + - alert: HwmBreached + expr: aerospike_namespace_hwm_breached{job="aerospike" } == 1 + for: 30s + labels: + severity: warn + annotations: + summary: "High water mark breached for {{ $labels.instance }}/{{ $labels.ns }}" + description: "high-water-disk-pct or high-water-memory-pct has been breached for namespace {{ $labels.ns }} in node {{ $labels.instance }}. Eviction may start to recover disk space." + + - alert: LowDeviceAvailWarning + expr: aerospike_namespace_device_available_pct{job="aerospike" } < 55 + for: 30s + labels: + severity: warn + annotations: + summary: "Device available warning for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Device available has dropped below 55% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. 
May indicate that defrag is unable to keep up with the current load, and may result in stop writes if it continues to drop." + + - alert: LowDeviceAvailCritical + expr: aerospike_namespace_device_available_pct{job="aerospike" } < 25 + for: 30s + labels: + severity: critical + annotations: + summary: "Device available critically low for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Device available has dropped below 25% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate that defrag is unable to keep up with the current load, and may result in stop writes if it continues to drop." + + - alert: ClientTimeouts + expr: rate(aerospike_namespace_client_read_timeout{job="aerospike" }[1m]) > 1 or rate(aerospike_namespace_client_write_timeout{job="aerospike" }[1m]) > 1 or rate(aerospike_namespace_client_tsvc_timeout{job="aerospike" }[1m]) > 1 + for: 1m + labels: + severity: critical + annotations: + summary: "Client transactions are timing out" + description: "Client connections timing out at a rate greater than 1/s. Timeouts can occur during network issues or resource contention on the client and/or server nodes." + + - alert: LowMemoryNamespaceWarning + expr: aerospike_namespace_memory_free_pct{job="aerospike" } < 20 + for: 30s + labels: + severity: warn + annotations: + summary: "Memory available warning for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Memory free has dropped below 20% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate a need to reduce the object count or increase capacity." + + - alert: LowMemoryNamespaceCritical + expr: aerospike_namespace_memory_free_pct{job="aerospike" } < 15 + for: 30s + labels: + severity: critical + annotations: + summary: "Memory available critically low for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Memory free has dropped below 15% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate a need to reduce the object count or increase capacity." + + - alert: DeviceWriteQWarning + expr: aerospike_namespace_storage_engine_device_write_q{job="aerospike" } > 1 + for: 30s + labels: + severity: warn + annotations: + summary: "Device write queue high for {{ $labels.instance }}/{{ $labels.ns }}/{{ $labels.device_index }}" + description: "Device write queue is greater than 1 for namespace {{ $labels.ns }} on device {{ $labels.device_index }} in node {{ $labels.instance }}. May indicate underperforming storage subsystem or hotkeys." + + - alert: ShadowDeviceWriteQWarning + expr: aerospike_namespace_storage_engine_device_shadow_write_q{job="aerospike" } > 1 + for: 30s + labels: + severity: warn + annotations: + summary: "Shadow device write queue high for {{ $labels.instance }}/{{ $labels.ns }}/{{ $labels.device_index }}" + description: "Shadow device write queue is greater than 1 for namespace {{ $labels.ns }} on device {{ $labels.device_index }} in node {{ $labels.instance }}. May indicate underperforming storage subsystem or hotkeys." + + - alert: DeviceDefragQWarning + expr: aerospike_namespace_storage_engine_device_defrag_q{job="aerospike" }> 1000 + for: 5m + labels: + severity: warn + annotations: + summary: "Device defrag queue high for {{ $labels.instance }}/{{ $labels.ns }}/{{ $labels.device_index }}" + description: "Device defrag queue has been above 1000 for more than 5m for namespace {{ $labels.ns }} on device {{ $labels.device_index }} in node {{ $labels.instance }}. May indicate underperforming storage subsystem or hotkeys." 
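+ # clock_skew_stop_writes is set to 1 by the server once clock skew has already exceeded what the namespace can tolerate, so the rule below reports an active stop-writes condition rather than an approaching one.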
+ + - alert: ClockSkewStopWrites + expr: aerospike_namespace_clock_skew_stop_writes{job="aerospike" } == 1 + for: 30s + labels: + severity: critical + annotations: + summary: "Clock skew stop writes" + description: "Clock has skewed for namespace {{ $labels.ns }} in node {{ $labels.instance }}" + + - alert: UnavailablePartitions + expr: aerospike_namespace_unavailable_partitions{job="aerospike" } > 0 + for: 30s + labels: + severity: critical + annotations: + summary: "Some partitions are inaccessible, and roster nodes are missing from the cluster." + description: "Some partitions are not available for namespace {{ $labels.ns }} on node {{ $labels.instance }}. Check for network issues and make sure the cluster forms properly." + + - alert: DeadPartitions + expr: aerospike_namespace_dead_partitions{job="aerospike" } > 2 + for: 30s + labels: + severity: critical + annotations: + summary: "There are unavailable partition, but all roster nodes are present in the cluster." + description: "Some partitions are dead for namespace {{ $labels.ns }} on node {{ $labels.instance }}. Greater than replication-factor number nodes had an unclean shutdown, and there may be data loss. Will require the use of the revive command to make the partitions available again." + + - alert: NamespaceDataCloseToStopWrites + expr: (aerospike_namespace_data_avail_pct{job="aerospike" } - aerospike_namespace_storage_engine_stop_writes_avail_pct{job="aerospike" }) <= 10 + for: 30s + labels: + severity: warn + annotations: + summary: "Close to stop writes for {{ $labels.instance }}/{{ $labels.ns }} due to data_avail_pct" + description: "data_avail_pct for namespace {{ $labels.ns }} in node {{ $labels.instance }} is close to stop-writes-avail-pct limit." + + - alert: LowDataAvailWarning + expr: aerospike_namespace_data_avail_pct{job="aerospike" } < 55 + for: 30s + labels: + severity: warn + annotations: + summary: "Device available warning for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Device available has dropped below 55% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate that defrag is unable to keep up with the current load, and may result in stop writes if it continues to drop." + + - alert: LowDataAvailCritical + expr: aerospike_namespace_data_avail_pct{job="aerospike" } < 25 + for: 30s + labels: + severity: critical + annotations: + summary: "Device available critically low for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Device available has dropped below 25% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate that defrag is unable to keep up with the current load, and may result in stop writes if it continues to drop." + + - alert: HighDataUseNamespaceWarning + expr: aerospike_namespace_data_used_pct{job="aerospike" , storage_engine="memory" } > 80 + for: 30s + labels: + severity: warn + annotations: + summary: "Data utilization warning for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Data used has crossed above 80% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate a need to reduce the object count or increase capacity." + + - alert: HighDataUseNamespaceCritical + expr: aerospike_namespace_data_used_pct{job="aerospike" , storage_engine="memory" } > 85 + for: 30s + labels: + severity: critical + annotations: + summary: "Data utilization critically high for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Data used has crossed above 85% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. 
May indicate a need to reduce the object count or increase capacity."
+ + - name: aerospike_aerospike.rules > NODE + rules: + - alert: PrometheusNodeExporterNotPresent + expr: absent(node_cpu_seconds_total) == 1 + for: 30s + labels: + severity: warn + annotations: + summary: "Prometheus Node Exporter is not configured" + description: "Prometheus Node Exporter is not configured in {{ $labels.instance }}"
+ + - alert: BestPracticesFailure + expr: aerospike_node_stats_failed_best_practices{job="aerospike" } > 0 + for: 30s + labels: + severity: warn + annotations: + summary: "Best Practices check failed on {{ $labels.instance }} in cluster {{ $labels.cluster_name }}" + description: "Best Practices check failed on {{ $labels.instance }} in cluster {{ $labels.cluster_name }}"
+ + - alert: ClusterSize + expr: aerospike_node_stats_cluster_size{job="aerospike" } < 3 + for: 30s + labels: + severity: critical + annotations: + summary: "Cluster size lower than expected" + description: "Cluster size reported by node {{ $labels.instance }} is below the expected minimum of 3 nodes."
+ + - alert: ClientConnectionsWarning + expr: aerospike_node_stats_client_connections{job="aerospike" } > 11 + for: 30s + labels: + severity: warn + annotations: + summary: "Client connections warning" + description: "Client connections are greater than 11. Connections will fail if they exceed proto-fd-max." + - alert: ClientConnectionsCritical + expr: aerospike_node_stats_client_connections{job="aerospike" } > 10000 + for: 30s + labels: + severity: critical + annotations: + summary: "Client connections critical" + description: "Client connections are greater than the expected peak of 10000."
+ + - alert: ClientConnectionChurn + expr: rate(aerospike_node_stats_client_connections_opened{job="aerospike" }[1m]) > 100 or rate(aerospike_node_stats_client_connections_closed{job="aerospike" }[1m]) > 100 + for: 1m + labels: + severity: critical + annotations: + summary: "Clients are churning connections at a high rate" + description: "Client connections are being opened or closed at a rate greater than 100/s. Connection churn can increase latency and client timeouts, which in turn cause the client to open more connections."
+ + - alert: ClockSkewWarning + expr: aerospike_node_stats_cluster_clock_skew_ms{job="aerospike" } > 2000 + for: 30s + labels: + severity: warn + annotations: + summary: "Cluster clock skew warning" + description: "Current maximum clock skew between nodes is above 2000 ms - stop writes will trigger if skew reaches 20 seconds while nsup-period is non-zero."
+ + - alert: ClockSkewCritical + expr: aerospike_node_stats_cluster_clock_skew_ms{job="aerospike" } > 20000 + for: 30s + labels: + severity: critical + annotations: + summary: "Cluster clock skew critical alert" + description: "Current maximum clock skew between nodes is above 20000 ms - this triggers stop writes if nsup-period is non-zero."
+ + - alert: LowMemorySystemWarning + expr: aerospike_node_stats_system_free_mem_pct{job="aerospike" } < 20 + for: 30s + labels: + severity: warn + annotations: + summary: "Memory available warning for {{ $labels.instance }}" + description: "Total memory free has dropped below 20% for node {{ $labels.instance }}."
+ + - alert: LowMemorySystemCritical + expr: aerospike_node_stats_system_free_mem_pct{job="aerospike" } < 10 + for: 30s + labels: + severity: critical + annotations: + summary: "Memory available critically low for {{ $labels.instance }}" + description: "Total memory free has dropped below 10% for node {{ $labels.instance }}."
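+ # In the HeapEfficiencyWarning rule below, the commented-out expression alerts on heap efficiency alone; the active expression also requires overall system memory use above 70%, so lightly loaded nodes with naturally low heap efficiency stay quiet.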
+ + - alert: HeapEfficiencyWarning + #expr: aerospike_node_stats_heap_efficiency_pct{job="aerospike" } < 60 + expr: (100 - aerospike_node_stats_system_free_mem_pct{job="aerospike" }) > 70 and aerospike_node_stats_heap_efficiency_pct{job="aerospike" } < 60 + for: 30s + labels: + severity: warn + annotations: + summary: "Heap efficiency warning for {{ $labels.instance }}" + description: "Heap efficiency for node for {{ $labels.instance }} has dropped below 60%." + + - alert: RwInProgressWarning + expr: aerospike_node_stats_rw_in_progress{job="aerospike" }> 100 + for: 30s + labels: + severity: warn + annotations: + summary: "Read/write queue too high for {{ $labels.instance }}/{{ $labels.ns }}/{{ $labels.device_index }}" + description: "Read/write queue is greater than 100 for namespace {{ $labels.ns }} on device {{ $labels.device_index }} in node {{ $labels.instance }}. May indicate underperforming storage subsystem or hotkeys." + + - name: aerospike_aerospike.rules > SET + rules: + - alert: pre7x_NamespaceSetQuotaWarning + expr: (((aerospike_sets_device_data_bytes{job="aerospike" } + aerospike_sets_memory_data_bytes{job="aerospike" }) / (aerospike_sets_stop_writes_size{job="aerospike" } != 0)) * 100) > 80 + for: 30s + labels: + severity: warn + annotations: + description: "Nearing memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}." + summary: "One of your nodes is at % of the quota you have configured on the set." + + - alert: pre7x_NamespaceSetQuotaAlertCritical + expr: (((aerospike_sets_device_data_bytes{job="aerospike" } + aerospike_sets_memory_data_bytes{job="aerospike" }) / (aerospike_sets_stop_writes_size{job="aerospike" } != 0)) * 100) > 99 + for: 30s + labels: + severity: critical + annotations: + description: "At or Above memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}." + summary: "One of your nodes is at % of the quota you have configured on the set." + + - alert: NamespaceSetQuotaWarning + expr: (((aerospike_sets_data_used_bytes{job="aerospike" } ) / (aerospike_sets_stop_writes_size{job="aerospike" } != 0)) * 100) > 80 + for: 30s + labels: + severity: warn + annotations: + description: "Nearing memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}." + summary: "One of your nodes is at % of the quota you have configured on the set." + + - alert: NamespaceSetQuotaAlertCritical + expr: (((aerospike_sets_data_used_bytes{job="aerospike" } ) / (aerospike_sets_stop_writes_size{job="aerospike" } != 0)) * 100) > 99 + for: 30s + labels: + severity: critical + annotations: + description: "At or Above memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}." + summary: "One of your nodes is at % of the quota you have configured on the set." + + - name: aerospike_aerospike.rules > LATENCIES + rules: + - alert: ReadLatencyP95Warning + expr: histogram_quantile(0.95, (aerospike_latencies_read_ms_bucket{job="aerospike" })) > 2 + for: 2m + labels: + severity: warn + annotations: + summary: "Read latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "95th percentile read latency breached 2ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}." 
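+ # As with the P95 rule above, the latency rules below estimate percentiles with histogram_quantile() over the exporter's cumulative _ms_bucket histograms, so reported values are interpolations bounded by the bucket boundaries.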
+ + - alert: ReadLatencyP99Warning + expr: histogram_quantile(0.99, (aerospike_latencies_read_ms_bucket{job="aerospike" })) > 4 + for: 2m + labels: + severity: warn + annotations: + summary: "Read latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "99th percentile read latency breached 4ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}."
+ + - alert: ReadLatencyP999Warning + expr: histogram_quantile(0.999, (aerospike_latencies_read_ms_bucket{job="aerospike" })) > 16 + for: 2m + labels: + severity: warn + annotations: + summary: "Read latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "99.9th percentile read latency breached 16ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}."
+ + - alert: WriteLatencyP95Warning + expr: histogram_quantile(0.95, (aerospike_latencies_write_ms_bucket{job="aerospike" })) > 4 + for: 2m + labels: + severity: warn + annotations: + summary: "Write latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "95th percentile write latency breached 4ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}."
+ + - alert: WriteLatencyP99Warning + expr: histogram_quantile(0.99, (aerospike_latencies_write_ms_bucket{job="aerospike" })) > 16 + for: 2m + labels: + severity: warn + annotations: + summary: "Write latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "99th percentile write latency breached 16ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}."
+ + - alert: WriteLatencyP999Warning + expr: histogram_quantile(0.999, (aerospike_latencies_write_ms_bucket{job="aerospike" })) > 64 + for: 2m + labels: + severity: warn + annotations: + summary: "Write latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "99.9th percentile write latency breached 64ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}."
+ + + - name: aerospike_aerospike.rules > XDR + rules: + + - alert: XDRTimelag + expr: aerospike_xdr_lag{job="aerospike" } > 5 + for: 2m + labels: + severity: warn + annotations: + summary: "XDR lag for namespace {{ $labels.ns }} exceeding 5 second(s) from node {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "XDR lag may be due to network connectivity issues, inability for the source to keep up with incoming writes, or write failures at the destination." + - alert: XDRAbandonedRecords + expr: rate(aerospike_xdr_abandoned{job="aerospike" }[1m]) > 0 + for: 30s + labels: + severity: warn + annotations: + summary: "Abandoned records detected for XDR on node {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "Records abandoned at a destination cluster may indicate a configuration mismatch for the namespace between source and destination." + - alert: XDRRetryNoNode + expr: rate(aerospike_xdr_retry_no_node{job="aerospike" }[1m]) > 0 + for: 30s + labels: + severity: warn + annotations: + summary: "XDR retries occurring on node {{ $labels.instance }} to DC {{ $labels.dc }} due to unknown master node destination" + description: "XDR cannot determine which destination node is the master."
+ + - alert: XDRRetryConnReset + expr: rate(aerospike_xdr_retry_conn_reset{job="aerospike" }[1m]) > 2 + for: 2m + labels: + severity: warn + annotations: + summary: "Rate of XDR connection resets greater than 2/s from {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "XDR retries occurring due to timeouts, network problems, or destination node restarts."
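+ # retry_dest counts retries caused by error responses from the destination node itself (for example key busy or device overload), as opposed to the connection-level resets tracked above.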
+ + - alert: XDRRetryDest + expr: rate(aerospike_xdr_retry_dest{job="aerospike" }[1m]) > 5 + for: 2m + labels: + severity: warn + annotations: + summary: "Increase in XDR write retries is greater than 5/s from {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "XDR retries due to errors returned by the destination node, i.e. key busy or device overload."
+ + - alert: XDRLatencyWarning + expr: aerospike_xdr_latency_ms{job="aerospike" } > 100 + for: 30s + labels: + severity: warn + annotations: + summary: "XDR latency above 100ms from {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "Network latency between XDR source and destination over the last 30s is higher than expected."
+ + - alert: XDRLap + expr: aerospike_xdr_lap_us{job="aerospike" } > 75000 + for: 30s + labels: + severity: warn + annotations: + summary: "XDR lap time greater than 75000 microseconds from {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "The XDR processing cycle time (lap_us) is approaching the configured period-ms value."
+ + - alert: XDRRecoveries + expr: increase(aerospike_xdr_recoveries{job="aerospike" }[1m]) > 0 + for: 2m + labels: + severity: critical + annotations: + summary: "XDR recoveries increasing on {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "XDR recoveries happen during a rewind or may indicate that the in-memory transaction queue is full (the transaction-queue-limit may be too small)." \ No newline at end of file
diff --git a/config/monitoring/prometheus/config/alert-rules/node_exporter_alerts.yml b/config/monitoring/prometheus/config/alert-rules/node_exporter_alerts.yml new file mode 100644 index 000000000..ce2298672 --- /dev/null +++ b/config/monitoring/prometheus/config/alert-rules/node_exporter_alerts.yml @@ -0,0 +1,777 @@ +groups: + - name: node_exporter_alerts + rules: + - alert: HostNodeExporterDownCritical + expr: up{job="node-exporter"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host ({{ $labels.instance }}) is down in cluster {{ $labels.cluster_name }}" + description: "Failed to scrape {{ $labels.job }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} for more than 1 minute. node-exporter seems down."
+ + - alert: HostMemoryFillingUpWarn + expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100 ) > 70 + for: 1m + labels: + severity: warn + annotations: + summary: "Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Memory is filling up (> 70%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} "
+ + - alert: HostMemoryFillingUpCritical + expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100 ) > 90 + for: 1m + labels: + severity: critical + annotations: + summary: "Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Node memory is filling up (> 90%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }} " + + - alert: HostDiskSpaceFillingUpWarn + expr: 100 - (node_filesystem_avail_bytes{job="node-exporter"} * 100) / node_filesystem_size_bytes{job="node-exporter"} > 70 and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host disk space is filling up on ({{ $labels.instance }})of cluster {{ $labels.cluster_name }}" + description: "Disk is crossing (> 70% ) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostDiskSpaceFillingUpCritical + expr: 100 - (node_filesystem_avail_bytes{job="node-exporter"} * 100) / node_filesystem_size_bytes{job="node-exporter"} > 90 and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host disk space is filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is crossing (> 90% ) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }} " + + - alert: HostInodesFillingUpWarn + expr: 100 - node_filesystem_files_free{job="node-exporter"} / node_filesystem_files{job="node-exporter"} * 100 > 70 and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host inodes filling Up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is running out of available inodes (> 70%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostInodesFillingUpCritical + expr: 100 - node_filesystem_files_free{job="node-exporter"} / node_filesystem_files{job="node-exporter"} * 100 > 90 and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host inodes filling Up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is running out of available inodes (> 90%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }} " + + - alert: HostUnusualDiskReadLatencyWarn + expr: (node_disk_read_time_seconds_total{job="node-exporter"}) / (node_disk_reads_completed_total{job="node-exporter"}) > 0.1 and (node_disk_reads_completed_total{job="node-exporter"}) > 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read latency is increasing (read operations > 0.1s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskReadLatencyCritical + expr: (node_disk_read_time_seconds_total{job="node-exporter"}) / (node_disk_reads_completed_total{job="node-exporter"}) > 0.5 and (node_disk_reads_completed_total{job="node-exporter"}) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read latency is increasing (read operations > 0.5s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }} " + + - alert: HostUnusualDiskWriteLatencyWarn + expr: (node_disk_write_time_seconds_total{job="node-exporter"}) / (node_disk_writes_completed_total{job="node-exporter"}) > 0.1 and (node_disk_writes_completed_total{job="node-exporter"}) > 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual disk write latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write latency is increasing (write operations > 0.1s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteLatencyCritical + expr: (node_disk_write_time_seconds_total{job="node-exporter"}) / (node_disk_writes_completed_total{job="node-exporter"}) > 0.5 and (node_disk_writes_completed_total{job="node-exporter"}) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual disk write latency ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write latency is increasing (write operations > 0.5s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostHighCpuUtilizationWarn(Host) + expr: sum by (instance) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 70 + for: 30s + labels: + severity: warn + annotations: + summary: "Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is crossing (> 70%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostHighCpuUtilizationCritical(Host) + expr: sum by (instance) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 90 + for: 30s + labels: + severity: critical + annotations: + summary: "Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is crossing (> 90%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostHighCpuUtilizationWarn(Core) + expr: sum by (instance, cpu) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 70 + for: 30s + labels: + severity: warn + annotations: + summary: "Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is crossing (> 70%) for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostHighCpuUtilizationCritical(Core) + expr: sum by (instance, cpu) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 90 + for: 30s + labels: + severity: critical + annotations: + summary: "Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is crossing (> 90%) for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }}" + + - alert: HostCpuStealWarn(Host) + expr: sum by (instance)(node_cpu_seconds_total{mode="steal", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 3 + for: 30s + labels: + severity: warn + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 3% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. " + + - alert: HostCpuStealCritical(Host) + expr: sum by (instance)(node_cpu_seconds_total{mode="steal", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 5 + for: 30s + labels: + severity: critical + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is (> 5%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. " + + - alert: HostCpuStealWarn(Core) + expr: sum by (instance, cpu)(node_cpu_seconds_total{mode="steal", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 3 + for: 30s + labels: + severity: warn + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is (> 3%) for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. " + + - alert: HostCpuStealCritical(Core) + expr: sum by (instance ,cpu)(node_cpu_seconds_total{mode="steal", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 5 + for: 30s + labels: + severity: critical + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is (> 5%) for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. " + + - alert: HostNetworkReceiveErrorsWarn + expr: ((node_network_receive_errs_total{job="node-exporter"}) / (node_network_receive_packets_total{job="node-exporter"})) * 100 > 3 + for: 1m + labels: + severity: warn + annotations: + summary: "Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance interface has encountered {{ $value }} receive errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }} " + + - alert: HostNetworkReceiveErrorsCritical + expr: ((node_network_receive_errs_total{job="node-exporter"}) / (node_network_receive_packets_total{job="node-exporter"})) * 100 > 5 + for: 1m + labels: + severity: critical + annotations: + summary: "Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance interface has encountered {{ $value }} receive errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }} " + + - alert: HostNetworkTransmitErrorsWarn + expr: ((node_network_transmit_errs_total{job="node-exporter"}) / (node_network_transmit_packets_total{job="node-exporter"})) * 100 > 3 + for: 1m + labels: + severity: warn + annotations: + summary: "Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance has encountered {{ $value }} transmit errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostNetworkTransmitErrorsCritical + expr: ((node_network_transmit_errs_total{job="node-exporter"}) / (node_network_transmit_packets_total{job="node-exporter"})) * 100 > 5 + for: 1m + labels: + severity: critical + annotations: + summary: "Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance has encountered {{ $value }} transmit errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostNetworkInterfaceSaturatedWarn + expr: ((node_network_receive_bytes_total{job="node-exporter"}) + (node_network_transmit_bytes_total{job="node-exporter"})) / (node_network_speed_bytes{job="node-exporter"}) > 0.8 + for: 1m + labels: + severity: warn + annotations: + summary: "Host Network Interface Saturated ({{ $labels.instance }}:{{ $labels.interface }}) of cluster {{ $labels.cluster_name }}" + description: "The network interface is getting overloaded (> 0.8) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} {{ $value }}. VALUE = {{ $value }}" + + - alert: HostNetworkInterfaceSaturatedCritical + expr: ((node_network_receive_bytes_total{job="node-exporter"}) + (node_network_transmit_bytes_total{job="node-exporter"})) / (node_network_speed_bytes{job="node-exporter"}) > 0.9 + for: 1m + labels: + severity: critical + annotations: + summary: "Host Network Interface Saturated on ({{ $labels.instance }}:{{ $labels.interface }}) of cluster {{ $labels.cluster_name }}" + description: "The network interface is getting overloaded (> 0.9) {{ $value }} on host {{ $labels.instance }}:{{ $labels.interface }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostClockNotSynchronisingWarn + expr: min_over_time(node_timex_sync_status{job="node-exporter"}[2m]) == 0 and node_timex_maxerror_seconds{job="node-exporter"} >= 16 + for: 2m + labels: + severity: warn + annotations: + summary: "Host clock not synchronising on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Clock not synchronising on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostSwapInWarn + expr: (node_vmstat_pswpin{job="node-exporter"}) > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host PageSwapIn value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapIn(data from swap space on disk back into the physical memory (RAM)) value exceeds 5 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." 
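+ # Sustained swap activity on a database host usually signals memory pressure; the swap-in and swap-out rules share the same warn/critical thresholds (5 and 10).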
+ + - alert: HostSwapInCritical + expr: (node_vmstat_pswpin{job="node-exporter"}) > 10 + for: 1m + labels: + severity: critical + annotations: + summary: "Host PageSwapIn value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapIn (data from swap space on disk back into the physical memory (RAM)) value exceeds 10 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostSwapOutWarn + expr: (node_vmstat_pswpout{job="node-exporter"}) > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host PageSwapOut value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapOut (move data from RAM to swap space on disk to free up space in memory) value exceeds 5 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostSwapOutCritical + expr: (node_vmstat_pswpout{job="node-exporter"}) > 10 + for: 1m + labels: + severity: critical + annotations: + summary: "Host PageSwapOut value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapOut (move data from RAM to swap space on disk to free up space in physical memory) value exceeds 10 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + + - alert: HostMemoryFillingUpWarn(Rate) + expr: (rate(node_memory_MemAvailable_bytes{job="node-exporter"}[1m]) / node_memory_MemTotal_bytes{job="node-exporter"}) * 100 > 15 + for: 1m + labels: + severity: warn + annotations: + summary: "Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Node memory is filling up (> 15%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostMemoryFillingUpCritical(Rate) + expr: (rate(node_memory_MemAvailable_bytes{job="node-exporter"}[1m]) / node_memory_MemTotal_bytes{job="node-exporter"}) * 100 > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Node memory is filling up (> 30%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostMemoryUnderMemoryPressureWarn(Rate) + expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) > 3 + for: 1m + labels: + severity: warn + annotations: + summary: "Host memory under memory pressure on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "High rate of major page faults on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostMemoryUnderMemoryPressureCritical(Rate) + expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) > 5 + for: 1m + labels: + severity: critical + annotations: + summary: "Host memory under memory pressure on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "High rate of major page faults on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostDiskSpaceFillingUpWarn(Rate) + expr: avg by (instance)(rate(node_filesystem_avail_bytes{job="node-exporter"}[1m]) * 100 / node_filesystem_size_bytes{job="node-exporter"}) > 15 + for: 1m + labels: + severity: warn + annotations: + summary: "Host disk space is filling up on ({{ $labels.instance }}) of cluster {{
$labels.cluster_name }}" + description: "Disk is crossing the threshold (> 15%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostDiskSpaceFillingUpCritical(Rate) + expr: avg by (instance)(rate(node_filesystem_avail_bytes{job="node-exporter"}[1m]) * 100 / node_filesystem_size_bytes{job="node-exporter"}) > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host disk space is filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is crossing the threshold (> 30%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostInodesFillingUpWarn(Rate) + expr: (rate(node_filesystem_files_free{job="node-exporter"}[1m])) / node_filesystem_files{job="node-exporter"} * 100 > 20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host inodes filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is running out of available inodes (> 20%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostInodesFillingUpCritical(Rate) + expr: (rate(node_filesystem_files_free{job="node-exporter"}[1m])) / node_filesystem_files{job="node-exporter"} * 100 > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host inodes filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is running out of available inodes (> 30%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskReadLatencyWarn(Rate) + expr: rate(node_disk_read_time_seconds_total{job="node-exporter"}[1m]) / rate(node_disk_reads_completed_total{job="node-exporter"}[1m]) > 0.05 and rate(node_disk_reads_completed_total{job="node-exporter"}[1m]) > 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read latency is increasing (read operations > 0.05s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostUnusualDiskReadLatencyCritical(Rate) + expr: rate(node_disk_read_time_seconds_total{job="node-exporter"}[1m]) / rate(node_disk_reads_completed_total{job="node-exporter"}[1m]) > 0.1 and rate(node_disk_reads_completed_total{job="node-exporter"}[1m]) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read latency is increasing (read operations > 0.1s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteLatencyWarn(Rate) + expr: rate(node_disk_write_time_seconds_total{job="node-exporter"}[1m]) / rate(node_disk_writes_completed_total{job="node-exporter"}[1m]) > 0.05 and rate(node_disk_writes_completed_total{job="node-exporter"}[1m]) > 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual disk write latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write latency is increasing (write operations > 0.05s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteLatencyCritical(Rate) + expr: rate(node_disk_write_time_seconds_total{job="node-exporter"}[1m]) /
rate(node_disk_writes_completed_total{job="node-exporter"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{job="node-exporter"}[1m]) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual disk write latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write latency is increasing (write operations > 0.1s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateCPUutilizationWarn(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host high CPU utilization on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is > 20% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateCPUutilizationCritical(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host high CPU utilization on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is > 30% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateCPUutilizationWarn(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host high CPU utilization on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is > 20% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateCPUutilizationCritical(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host high CPU utilization on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is > 30% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostCpuStealRateWarn(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 5% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. 
VALUE = {{ $value }}" + + - alert: HostCpuStealRateCritical(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 8 + for: 1m + labels: + severity: critical + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 8% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. VALUE = {{ $value }}" + + - alert: HostCpuStealRateWarn(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 5% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. VALUE = {{ $value }}" + + - alert: HostCpuStealRateCritical(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 8 + for: 1m + labels: + severity: critical + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 8% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. 
VALUE = {{ $value }}" + + - alert: HostContextSwitchingWarn(Rate) + expr: (rate(node_context_switches_total{job="node-exporter"}[1m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle", job="node-exporter"})) > 1000 + for: 1m + labels: + severity: warn + annotations: + summary: "Host context switching on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Context switching is increasing (> 1000 /s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostContextSwitchingCritical(Rate) + expr: (rate(node_context_switches_total{job="node-exporter"}[1m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle", job="node-exporter"})) > 2000 + for: 1m + labels: + severity: critical + annotations: + summary: "Host context switching on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Context switching is increasing (> 2000 /s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostNetworkReceiveErrorsWarn(Rate) + expr: rate(node_network_receive_errs_total{job="node-exporter"}[30s]) / rate(node_network_receive_packets_total{job="node-exporter"}[30s]) > 3 + for: 30s + labels: + severity: warn + annotations: + summary: "Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance interface has encountered {{ $value }} receive errors on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostNetworkReceiveErrorsCritical(Rate) + expr: rate(node_network_receive_errs_total{job="node-exporter"}[30s]) / rate(node_network_receive_packets_total{job="node-exporter"}[30s]) > 5 + for: 30s + labels: + severity: critical + annotations: + summary: "Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance interface has encountered {{ $value }} receive errors on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostNetworkTransmitErrorsWarn(Rate) + expr: rate(node_network_transmit_errs_total{job="node-exporter"}[30s]) / rate(node_network_transmit_packets_total{job="node-exporter"}[30s]) > 3 + for: 30s + labels: + severity: warn + annotations: + summary: "Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance has encountered {{ $value }} transmit errors on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostNetworkTransmitErrorsCritical(Rate) + expr: rate(node_network_transmit_errs_total{job="node-exporter"}[30s]) / rate(node_network_transmit_packets_total{job="node-exporter"}[30s]) > 5 + for: 30s + labels: + severity: critical + annotations: + summary: "Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance has encountered {{ $value }} transmit errors on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostNetworkInterfaceSaturatedWarn(Rate) + expr: ((rate(node_network_receive_bytes_total{job="node-exporter"}[1m]) + rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])) / (node_network_speed_bytes{job="node-exporter"})) * 100 > 80 + for: 1m + labels: + severity: warn + annotations: + 
summary: "Host Network Interface Saturated on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "The network interface rate is getting overloaded {{ $value }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostNetworkInterfaceSaturatedCritical(Rate) + expr: ((rate(node_network_receive_bytes_total{job="node-exporter"}[1m]) + rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])) / (node_network_speed_bytes{job="node-exporter"})) * 100 > 90 + for: 1m + labels: + severity: critical + annotations: + summary: "Host Network Interface Saturated on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "The network interface rate is getting overloaded {{ $value }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostSwapInRateWarn + expr: rate(node_vmstat_pswpin{job="node-exporter"}[1m]) > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host PageSwapIn rate is too high on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapIn rate exceeds 5 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current rate is {{ $value }}." + + - alert: HostSwapInRateCritical + expr: rate(node_vmstat_pswpin{job="node-exporter"}[1m]) > 10 + for: 1m + labels: + severity: critical + annotations: + summary: "PageSwapIn rate is too high on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapIn rate exceeds 10 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current rate is {{ $value }}." + + - alert: HostSwapOutRateWarn + expr: rate(node_vmstat_pswpout{job="node-exporter"}[1m]) > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "PageSwapOut rate is too high on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapOut rate exceeds 5 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current rate is {{ $value }}." + + - alert: HostSwapOutRatecritical + expr: rate(node_vmstat_pswpout{job="node-exporter"}[1m]) > 10 + for: 1m + labels: + severity: critical + annotations: + summary: "PageSwapOut rate is too high on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapOut rate exceeds 10 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current rate is {{ $value }}." + + - alert: HostDiskReadIOPSWarn(Host) + expr: sum by(instance) (rate(node_disk_reads_completed_total{job="node-exporter"}[1m])) > 300 + for: 1m + labels: + severity: warn + annotations: + summary: "High disk read IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk read IOPS rate (> 300) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskReadIOPSCritical(Host) + expr: sum by(instance) (rate(node_disk_reads_completed_total{job="node-exporter"}[1m])) > 500 + for: 1m + labels: + severity: critical + annotations: + summary: "High disk read IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk read IOPS rate (> 500) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." 
+ + - alert: HostDiskReadIOPSWarn(Device) + expr: sum by(instance, device) (rate(node_disk_reads_completed_total{job="node-exporter"}[1m])) > 100 + for: 1m + labels: + severity: warn + annotations: + summary: "High disk read IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk read IOPS rate (> 100) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskReadIOPSCritical(Device) + expr: sum by(instance, device) (rate(node_disk_reads_completed_total{job="node-exporter"}[1m])) > 250 + for: 1m + labels: + severity: critical + annotations: + summary: "High disk read IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk read IOPS rate (> 250) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskWriteIOPSWarn(Host) + expr: sum by(instance) (rate(node_disk_writes_completed_total{job="node-exporter"}[1m])) > 300 + for: 1m + labels: + severity: warn + annotations: + summary: "High disk write IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk write IOPS rate (> 300) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskWriteIOPSCritical(Host) + expr: sum by(instance) (rate(node_disk_writes_completed_total{job="node-exporter"}[1m])) > 500 + for: 1m + labels: + severity: critical + annotations: + summary: "High disk write IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk write IOPS rate (> 500) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskWriteIOPSWarn(Device) + expr: sum by(instance, device) (rate(node_disk_writes_completed_total{job="node-exporter"}[1m])) > 100 + for: 1m + labels: + severity: warn + annotations: + summary: "High disk write IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk write IOPS rate (> 100) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskWriteIOPSCritical(Device) + expr: sum by(instance, device) (rate(node_disk_writes_completed_total{job="node-exporter"}[1m])) > 250 + for: 1m + labels: + severity: critical + annotations: + summary: "High disk write IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk write IOPS rate (> 250) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." 
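+ # The alerts below compare the short-window (30s) rate with the 1m rate; a deviation of more than 20% (warn) or 30% (critical) in either direction flags an unusual swing in network or disk throughput.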
+ + - alert: HostRateUnusualNetworkThroughputInWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual network throughput in rate ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably receiving data (> 20/ < -20%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputInCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual network throughput in rate ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably receiving data (> 30/ < -30 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputInWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual network throughput in rate ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably receiving data (> 20/ < -20 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputInCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual network throughput in rate ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably receiving data (> 30/ < -30 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputOutWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by 
(instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual network throughput out rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably sending data (> 20/ < -20 %) on {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputOutCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual network throughput out rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably sending data (> 30/ < -30 %) on {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputOutWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual network throughput out rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably sending data (> 20/ < -20 %) on {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputOutCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual network throughput out rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably sending data (> 30/ < -30 %) on {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostUnusualDiskReadRateWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 30s + labels: + severity: 
warn + annotations: + summary: "Host unusual disk read rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably reading data (> 20/ < -20 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskReadRateCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 30s + labels: + severity: critical + annotations: + summary: "Host unusual disk read rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably reading less data (> 30/ < -30 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskReadRateWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 30s + labels: + severity: warn + annotations: + summary: "Host unusual disk read rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably reading data (> 20/ < -20 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskReadRateCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 30s + labels: + severity: critical + annotations: + summary: "Host unusual disk read rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably reading data (> 30/ < -30 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskWriteRateWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 30s + labels: + severity: warn + annotations: + summary: "Host unusual disk write rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably writing data (> 20/ < -20 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskWriteRateCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 
100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 30s + labels: + severity: critical + annotations: + summary: "Host unusual disk write rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably writing data (> 30/ < -30 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskWriteRateWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 30s + labels: + severity: warn + annotations: + summary: "Host unusual disk write rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably writing data (> 20/ < -20 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskWriteRateCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 30s + labels: + severity: critical + annotations: + summary: "Host unusual disk write rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably writing data (> 30/ < -30 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " diff --git a/config/monitoring/prometheus/config/prometheus.yml b/config/monitoring/prometheus/config/prometheus.yml new file mode 100644 index 000000000..c5b45d07a --- /dev/null +++ b/config/monitoring/prometheus/config/prometheus.yml @@ -0,0 +1,88 @@ +global: + scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. + evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. 
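+# Alertmanager pods are discovered through Kubernetes pod service discovery, filtered on the app.kubernetes.io/component=alertmanager label.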
+alerting: + alertmanagers: + - kubernetes_sd_configs: + - role: pod + selectors: + - role: pod + label: app.kubernetes.io/component=alertmanager +rule_files: + - "/etc/prometheus/alert-rules.d/aerospike_rules.yml" + - "/etc/prometheus/alert-rules.d/node_exporter_alerts.yml" +scrape_configs: + - job_name: "aerospike-kubernetes-operator" + honor_timestamps: true + scrape_interval: 15s + scrape_timeout: 10s + metrics_path: /metrics + scheme: https + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + insecure_skip_verify: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_control_plane, __meta_kubernetes_service_labelpresent_control_plane] + separator: ; + regex: (controller-manager);true + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: https + replacement: $1 + action: keep + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - default + - aerospike +# - job_name: "kubernetes-cadvisor" +# scheme: https +# metrics_path: /metrics/cadvisor +# kubernetes_sd_configs: +# - role: node +# tls_config: +# ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt +# authorization: +# credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token +# relabel_configs: +# - action: labelmap +# regex: __meta_kubernetes_node_label_(.+) + - job_name: 'event_exporter' # https://github.com/caicloud/event_exporter/blob/master/deploy/README.md + static_configs: + - targets: ['event-exporter:9102'] + - job_name: 'node-exporter' # https://devopscube.com/node-exporter-kubernetes/ + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [ __meta_kubernetes_endpoints_name ] + regex: 'node-exporter' + action: keep + - job_name: 'aerospike' + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - default + - aerospike + relabel_configs: + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: keep + regex: aerospike-cluster + replacement: $1 + separator: ; + source_labels: + - __meta_kubernetes_pod_label_app + - action: keep + regex: exporter + replacement: $1 + separator: ; + source_labels: + - __meta_kubernetes_pod_container_port_name \ No newline at end of file diff --git a/config/monitoring/prometheus/kustomization.yaml b/config/monitoring/prometheus/kustomization.yaml new file mode 100644 index 000000000..b75eeaee4 --- /dev/null +++ b/config/monitoring/prometheus/kustomization.yaml @@ -0,0 +1,28 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +labels: + - includeSelectors: true + pairs: + app.kubernetes.io/name: aerospike-prometheus + app.kubernetes.io/component: prometheus + +resources: + - statefulset.yaml + - pvc.yaml + - service.yaml + - serviceaccount.yaml + - clusterrole.yaml + - clusterrolebinding.yaml + +configMapGenerator: + - name: prometheus-config + files: + - config/prometheus.yml + - name: alert-rules-config + files: + - config/alert-rules/aerospike_rules.yml + - config/alert-rules/node_exporter_alerts.yml + +generatorOptions: + disableNameSuffixHash: true \ No newline at end of file diff --git a/config/monitoring/prometheus/pvc.yaml b/config/monitoring/prometheus/pvc.yaml new file mode 100644 index 000000000..d722303c3 --- /dev/null +++ b/config/monitoring/prometheus/pvc.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: prometheus-data +spec: + 
accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi diff --git a/config/monitoring/prometheus/service.yaml b/config/monitoring/prometheus/service.yaml new file mode 100644 index 000000000..e25d1ac2b --- /dev/null +++ b/config/monitoring/prometheus/service.yaml @@ -0,0 +1,11 @@ +kind: Service +apiVersion: v1 +metadata: + name: prometheus +spec: + ports: + - name: http + port: 9090 + protocol: TCP + targetPort: 9090 + sessionAffinity: ClientIP \ No newline at end of file diff --git a/config/monitoring/prometheus/serviceaccount.yaml b/config/monitoring/prometheus/serviceaccount.yaml new file mode 100644 index 000000000..f671fc5ab --- /dev/null +++ b/config/monitoring/prometheus/serviceaccount.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus \ No newline at end of file diff --git a/config/monitoring/prometheus/statefulset.yaml b/config/monitoring/prometheus/statefulset.yaml new file mode 100644 index 000000000..094f9508d --- /dev/null +++ b/config/monitoring/prometheus/statefulset.yaml @@ -0,0 +1,61 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: prometheus +spec: + replicas: 1 + podManagementPolicy: Parallel + updateStrategy: + type: RollingUpdate + template: + spec: + serviceAccountName: aerospike-monitoring-stack-prometheus + containers: + - name: prometheus-server + image: "prom/prometheus:latest" + imagePullPolicy: "IfNotPresent" + args: + - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.path=/data + - --web.listen-address=:9090 + ports: + - containerPort: 9090 + readinessProbe: + httpGet: + path: /-/ready + port: 9090 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 10 + successThreshold: 1 + failureThreshold: 3 + livenessProbe: + httpGet: + path: /-/healthy + port: 9090 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 10 + successThreshold: 1 + failureThreshold: 3 + volumeMounts: + - name: config-volume + mountPath: /etc/prometheus + - name: prometheus-data + mountPath: /data + - mountPath: /etc/prometheus/alert-rules.d + name: alert-manager-rules + terminationGracePeriodSeconds: 120 + securityContext: + fsGroup: 65534 + volumes: + - name: config-volume + configMap: + name: aerospike-monitoring-stack-prometheus-config + - name: prometheus-data + persistentVolumeClaim: + claimName: aerospike-monitoring-stack-prometheus-data + - name: alert-manager-rules + configMap: + defaultMode: 420 + name: aerospike-monitoring-stack-alert-rules-config \ No newline at end of file diff --git a/helm-charts/aerospike-kubernetes-operator/values.yaml b/helm-charts/aerospike-kubernetes-operator/values.yaml index faea89650..6cd9c0b73 100644 --- a/helm-charts/aerospike-kubernetes-operator/values.yaml +++ b/helm-charts/aerospike-kubernetes-operator/values.yaml @@ -28,7 +28,7 @@ certs: webhookServerCertSecretName: "webhook-server-cert" ## Operator configurations -watchNamespaces: "default" +watchNamespaces: "default,aerospike" # Registry used to pull aerospike-init image aerospikeKubernetesInitRegistry: "docker.io"
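The new alert rule files can be sanity-checked locally before building the kustomize overlays. A minimal sketch, assuming promtool (bundled with Prometheus releases) is on the PATH and the commands are run from the repository root:

promtool check rules config/monitoring/prometheus/config/alert-rules/aerospike_rules.yml
promtool check rules config/monitoring/prometheus/config/alert-rules/node_exporter_alerts.yml

Note that promtool builds without UTF-8 metric/label name support may reject the parenthesised alert names (for example HostMemoryFillingUpWarn(Rate)), so the check can fail on older Prometheus releases even when the YAML itself is well-formed.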