From 57c8e19eab9d00524c52dc43bfdeee9e54b834c5 Mon Sep 17 00:00:00 2001 From: Tanmay Jain <103629776+tanmayja@users.noreply.github.com> Date: Wed, 28 Aug 2024 20:34:50 +0530 Subject: [PATCH 1/2] [KO-344] Monitoring stack (#300) * Basic monitoring stack deployment --- .../alertmanager/config/alertmanager.yml | 12 + .../alertmanager/kustomization.yaml | 21 + config/monitoring/alertmanager/pvc.yaml | 10 + config/monitoring/alertmanager/service.yaml | 8 + .../monitoring/alertmanager/statefulset.yaml | 43 + .../aerospike_grafana_dashboards_config.yaml | 11 + .../config/aerospike_grafana_datasource.yaml | 8 + .../grafana/config/download_files.sh | 40 + config/monitoring/grafana/config/grafana.ini | 13 + config/monitoring/grafana/kustomization.yaml | 30 + config/monitoring/grafana/pvc.yaml | 10 + config/monitoring/grafana/service.yaml | 10 + config/monitoring/grafana/statefulset.yaml | 88 ++ config/monitoring/kustomization.yaml | 16 + config/monitoring/prometheus/clusterrole.yaml | 23 + .../prometheus/clusterrolebinding.yaml | 12 + .../config/alert-rules/aerospike_rules.yml | 566 +++++++++++++ .../alert-rules/node_exporter_alerts.yml | 777 ++++++++++++++++++ .../prometheus/config/prometheus.yml | 88 ++ .../monitoring/prometheus/kustomization.yaml | 28 + config/monitoring/prometheus/pvc.yaml | 10 + config/monitoring/prometheus/service.yaml | 11 + .../monitoring/prometheus/serviceaccount.yaml | 4 + config/monitoring/prometheus/statefulset.yaml | 61 ++ .../aerospike-kubernetes-operator/values.yaml | 2 +- 25 files changed, 1901 insertions(+), 1 deletion(-) create mode 100644 config/monitoring/alertmanager/config/alertmanager.yml create mode 100644 config/monitoring/alertmanager/kustomization.yaml create mode 100644 config/monitoring/alertmanager/pvc.yaml create mode 100644 config/monitoring/alertmanager/service.yaml create mode 100644 config/monitoring/alertmanager/statefulset.yaml create mode 100644 config/monitoring/grafana/config/aerospike_grafana_dashboards_config.yaml create mode 100644 config/monitoring/grafana/config/aerospike_grafana_datasource.yaml create mode 100644 config/monitoring/grafana/config/download_files.sh create mode 100644 config/monitoring/grafana/config/grafana.ini create mode 100644 config/monitoring/grafana/kustomization.yaml create mode 100644 config/monitoring/grafana/pvc.yaml create mode 100644 config/monitoring/grafana/service.yaml create mode 100644 config/monitoring/grafana/statefulset.yaml create mode 100644 config/monitoring/kustomization.yaml create mode 100644 config/monitoring/prometheus/clusterrole.yaml create mode 100644 config/monitoring/prometheus/clusterrolebinding.yaml create mode 100644 config/monitoring/prometheus/config/alert-rules/aerospike_rules.yml create mode 100644 config/monitoring/prometheus/config/alert-rules/node_exporter_alerts.yml create mode 100644 config/monitoring/prometheus/config/prometheus.yml create mode 100644 config/monitoring/prometheus/kustomization.yaml create mode 100644 config/monitoring/prometheus/pvc.yaml create mode 100644 config/monitoring/prometheus/service.yaml create mode 100644 config/monitoring/prometheus/serviceaccount.yaml create mode 100644 config/monitoring/prometheus/statefulset.yaml diff --git a/config/monitoring/alertmanager/config/alertmanager.yml b/config/monitoring/alertmanager/config/alertmanager.yml new file mode 100644 index 000000000..4908f2377 --- /dev/null +++ b/config/monitoring/alertmanager/config/alertmanager.yml @@ -0,0 +1,12 @@ +# This is an example alertmanager.yml which sends alert notifications to a
slack channel. + +global: + slack_api_url: "https://hooks.slack.com/services/TXXXXXXXX/XXXXXXXXXXX/XXXXXXXXXXXXXXXXXXXXXXXX" +route: + group_by: ['cluster', 'service'] + receiver: slack_user + +receivers: + - name: slack_user + slack_configs: + - text: "summary: {{ .CommonAnnotations.summary }}\ndescription: {{ .CommonAnnotations.description }}" \ No newline at end of file diff --git a/config/monitoring/alertmanager/kustomization.yaml b/config/monitoring/alertmanager/kustomization.yaml new file mode 100644 index 000000000..bba6090ed --- /dev/null +++ b/config/monitoring/alertmanager/kustomization.yaml @@ -0,0 +1,21 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +labels: + - includeSelectors: true + pairs: + app.kubernetes.io/name: aerospike-alertmanager + app.kubernetes.io/component: alertmanager + +resources: + - statefulset.yaml + - pvc.yaml + - service.yaml + +configMapGenerator: + - name: alertmanager-config + files: + - config/alertmanager.yml + +generatorOptions: + disableNameSuffixHash: true \ No newline at end of file diff --git a/config/monitoring/alertmanager/pvc.yaml b/config/monitoring/alertmanager/pvc.yaml new file mode 100644 index 000000000..c577bf954 --- /dev/null +++ b/config/monitoring/alertmanager/pvc.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: alertmanager-data +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi \ No newline at end of file diff --git a/config/monitoring/alertmanager/service.yaml b/config/monitoring/alertmanager/service.yaml new file mode 100644 index 000000000..a2958c64a --- /dev/null +++ b/config/monitoring/alertmanager/service.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Service +metadata: + name: alertmanager +spec: + ports: + - name: http + port: 9093 \ No newline at end of file diff --git a/config/monitoring/alertmanager/statefulset.yaml b/config/monitoring/alertmanager/statefulset.yaml new file mode 100644 index 000000000..3dc30efac --- /dev/null +++ b/config/monitoring/alertmanager/statefulset.yaml @@ -0,0 +1,43 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: alertmanager +spec: + template: + spec: + containers: + - name: alertmanager + image: prom/alertmanager:latest + args: + - --config.file=/etc/alertmanager/alertmanager.yml + - --storage.path=/alertmanager + - --log.level=info + - --cluster.advertise-address=0.0.0.0:9093 + livenessProbe: + httpGet: + path: /-/healthy + port: 9093 + initialDelaySeconds: 25 + periodSeconds: 20 + ports: + - containerPort: 9093 + readinessProbe: + httpGet: + path: /-/ready + port: 9093 + volumeMounts: + - mountPath: /etc/alertmanager + name: alertmanager-conf + - mountPath: /alertmanager + name: alertmanager-data + securityContext: + fsGroup: 26 + serviceAccountName: aerospike-monitoring-stack-prometheus + volumes: + - name: alertmanager-data + persistentVolumeClaim: + claimName: aerospike-monitoring-stack-alertmanager-data + - name: alertmanager-conf + configMap: + defaultMode: 420 + name: aerospike-monitoring-stack-alertmanager-config \ No newline at end of file diff --git a/config/monitoring/grafana/config/aerospike_grafana_dashboards_config.yaml b/config/monitoring/grafana/config/aerospike_grafana_dashboards_config.yaml new file mode 100644 index 000000000..c7a75fe6d --- /dev/null +++ b/config/monitoring/grafana/config/aerospike_grafana_dashboards_config.yaml @@ -0,0 +1,11 @@ +apiVersion: 1 +providers: + - name: 'default' + folder: 'Aerospike' + folderUid: 'aerospike1' + type: file + disableDeletion: 
false + editable: true + updateIntervalSeconds: 10 + options: + path: /var/lib/grafana/dashboards \ No newline at end of file diff --git a/config/monitoring/grafana/config/aerospike_grafana_datasource.yaml b/config/monitoring/grafana/config/aerospike_grafana_datasource.yaml new file mode 100644 index 000000000..968b2a660 --- /dev/null +++ b/config/monitoring/grafana/config/aerospike_grafana_datasource.yaml @@ -0,0 +1,8 @@ +apiVersion: 1 +datasources: + - name: "Aerospike Prometheus" + type: prometheus + access: proxy + url: http://aerospike-monitoring-stack-prometheus:9090 + editable: true + isDefault: false \ No newline at end of file diff --git a/config/monitoring/grafana/config/download_files.sh b/config/monitoring/grafana/config/download_files.sh new file mode 100644 index 000000000..eaced64e7 --- /dev/null +++ b/config/monitoring/grafana/config/download_files.sh @@ -0,0 +1,40 @@ +#!/bin/sh + +# Check if curl and jq are installed; if not, install curl and jq +if ! command -v curl >/dev/null 2>&1 || ! command -v jq >/dev/null 2>&1; then + echo "curl or jq not found. Installing..." + apk add --no-cache curl jq +else + echo "curl and jq are already installed." +fi + +# Define the dashboards to download in the format ID:REVISION or ID +DASHBOARDS="16119:10 16115:7 20279" + +# Directory where the dashboards will be saved +TARGET_DIR="/mnt/data" +mkdir -p "$TARGET_DIR" + +DELIMITER=':' + +# Loop through each dashboard identifier in DASHBOARDS +for DASHBOARD in $DASHBOARDS; do + if echo "$DASHBOARD" | grep -q "$DELIMITER"; then + # If the delimiter ':' exists, split into ID and REVISION + ID=$(echo "$DASHBOARD" | cut -d"$DELIMITER" -f1) + REVISION=$(echo "$DASHBOARD" | cut -d"$DELIMITER" -f2) + FILENAME="$ID-rev$REVISION.json" + URL="https://grafana.com/api/dashboards/$ID/revisions/$REVISION/download" + curl -o "$TARGET_DIR/$FILENAME" "$URL" + else + # No delimiter, only the ID is provided + ID="$DASHBOARD" + FILENAME="$ID.json" + URL="https://grafana.com/api/dashboards/$ID" + curl -s "$URL" | jq '.json' > "$TARGET_DIR/$FILENAME" + fi +done + +# List the downloaded files + echo "Downloaded dashboard files:" +ls -l "$TARGET_DIR" \ No newline at end of file diff --git a/config/monitoring/grafana/config/grafana.ini b/config/monitoring/grafana/config/grafana.ini new file mode 100644 index 000000000..bc0daea62 --- /dev/null +++ b/config/monitoring/grafana/config/grafana.ini @@ -0,0 +1,13 @@ +[analytics] + check_for_updates = true + [grafana_net] + url = https://grafana.net + [log] + mode = console + level = debug + [paths] + data = /var/lib/grafana/data + logs = /var/log/grafana + plugins = /var/lib/grafana/plugins + [server] + http_port = 3000 \ No newline at end of file diff --git a/config/monitoring/grafana/kustomization.yaml b/config/monitoring/grafana/kustomization.yaml new file mode 100644 index 000000000..cfd14ad2a --- /dev/null +++ b/config/monitoring/grafana/kustomization.yaml @@ -0,0 +1,30 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +labels: + - includeSelectors: true + pairs: + app.kubernetes.io/name: aerospike-grafana + app.kubernetes.io/component: grafana + +resources: + - statefulset.yaml + - pvc.yaml + - service.yaml + +configMapGenerator: + - name: grafana-provisioning-datasources + files: + - config/aerospike_grafana_datasource.yaml + - name: grafana-config + files: + - config/grafana.ini + - name: grafana-dashboard-config + files: + - config/aerospike_grafana_dashboards_config.yaml + - name: download-script + files: + - config/download_files.sh + +generatorOptions: 
disableNameSuffixHash: true diff --git a/config/monitoring/grafana/pvc.yaml b/config/monitoring/grafana/pvc.yaml new file mode 100644 index 000000000..3bc9acc12 --- /dev/null +++ b/config/monitoring/grafana/pvc.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: grafana-data +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi \ No newline at end of file diff --git a/config/monitoring/grafana/service.yaml b/config/monitoring/grafana/service.yaml new file mode 100644 index 000000000..8068ffb02 --- /dev/null +++ b/config/monitoring/grafana/service.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: Service +metadata: + name: grafana +spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 3000 \ No newline at end of file diff --git a/config/monitoring/grafana/statefulset.yaml b/config/monitoring/grafana/statefulset.yaml new file mode 100644 index 000000000..26914c6eb --- /dev/null +++ b/config/monitoring/grafana/statefulset.yaml @@ -0,0 +1,88 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: grafana +spec: + replicas: 1 + template: + spec: + serviceAccountName: aerospike-monitoring-stack-prometheus + terminationGracePeriodSeconds: 120 + initContainers: + - name: download-dashboards + image: alpine:latest + command: ["/bin/sh"] + args: [ "-c", "/bin/sh -x /mnt/scripts/download_files.sh" ] + volumeMounts: + - name: dashboards + mountPath: /mnt/data + - name: script-volume + mountPath: /mnt/scripts + containers: + - name: grafana + image: "grafana/grafana:latest" + imagePullPolicy: "IfNotPresent" + volumeMounts: + - name: grafana-config + mountPath: "/etc/grafana/" + - name: grafana-provisioning-datasources + mountPath: "/etc/grafana/provisioning/datasources" + - name: grafana-dashboard-config + mountPath: "/etc/grafana/provisioning/dashboards" + - name: grafana-data + mountPath: "/data" + - name: dashboards + mountPath: "/var/lib/grafana/dashboards" + ports: + - name: service + containerPort: 80 + protocol: TCP + - name: grafana + containerPort: 3000 + protocol: TCP + livenessProbe: + httpGet: + path: /api/health + port: 3000 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 10 + successThreshold: 1 + failureThreshold: 10 + readinessProbe: + httpGet: + path: /api/health + port: 3000 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 10 + successThreshold: 1 + failureThreshold: 10 + env: + - name: GF_SECURITY_ADMIN_USER + value: "admin" + - name: GF_SECURITY_ADMIN_PASSWORD + value: "admin" + - name: GF_PATHS_DATA + value: /data/grafana/data + securityContext: + fsGroup: 472 + volumes: + - name: grafana-config + configMap: + name: aerospike-monitoring-stack-grafana-config + - name: grafana-provisioning-datasources + configMap: + name: aerospike-monitoring-stack-grafana-provisioning-datasources + - name: grafana-dashboard-config + configMap: + defaultMode: 420 + name: aerospike-monitoring-stack-grafana-dashboard-config + - name: script-volume + configMap: + name: aerospike-monitoring-stack-download-script + - name: grafana-data + persistentVolumeClaim: + claimName: aerospike-monitoring-stack-grafana-data + - name: dashboards + emptyDir: {} \ No newline at end of file diff --git a/config/monitoring/kustomization.yaml b/config/monitoring/kustomization.yaml new file mode 100644 index 000000000..3499301bb --- /dev/null +++ b/config/monitoring/kustomization.yaml @@ -0,0 +1,16 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: monitoring + +labels: + - 
includeSelectors: false + pairs: + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/part-of: aerospike-monitoring-stack + +namePrefix: aerospike-monitoring-stack- + +resources: + - grafana + - prometheus + - alertmanager \ No newline at end of file diff --git a/config/monitoring/prometheus/clusterrole.yaml b/config/monitoring/prometheus/clusterrole.yaml new file mode 100644 index 000000000..6674295f1 --- /dev/null +++ b/config/monitoring/prometheus/clusterrole.yaml @@ -0,0 +1,23 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: + - apiGroups: + - "" + resources: + - namespaces + - pods + - statefulsets + - configmaps + - secrets + - services + - nodes + - nodes/metrics + - endpoints + verbs: + - list + - watch + - get + - nonResourceURLs: ["/metrics"] + verbs: ["get"] \ No newline at end of file diff --git a/config/monitoring/prometheus/clusterrolebinding.yaml b/config/monitoring/prometheus/clusterrolebinding.yaml new file mode 100644 index 000000000..2ff72f96b --- /dev/null +++ b/config/monitoring/prometheus/clusterrolebinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: aerospike-monitoring-stack-prometheus +subjects: + - kind: ServiceAccount + name: aerospike-monitoring-stack-prometheus + namespace: monitoring \ No newline at end of file diff --git a/config/monitoring/prometheus/config/alert-rules/aerospike_rules.yml b/config/monitoring/prometheus/config/alert-rules/aerospike_rules.yml new file mode 100644 index 000000000..b0e2d7107 --- /dev/null +++ b/config/monitoring/prometheus/config/alert-rules/aerospike_rules.yml @@ -0,0 +1,566 @@ +groups: + - name: aerospike.rules + rules: + - alert: AerospikeExporterAgentDown + expr: up{job="aerospike"} == 0 + for: 30s + labels: + severity: warn + annotations: + summary: "Aerospike Prometheus exporter job {{ $labels.instance }} down" + description: "{{ $labels.instance }} has been down for more than 30s." + + - alert: AerospikeNodeDown + expr: aerospike_node_up{job="aerospike"} == 0 + for: 30s + labels: + severity: warn + annotations: + summary: "Node {{ $labels.instance }} down" + description: "{{ $labels.instance }} node is down." + + - name: aerospike_aerospike.rules > NAMESPACE + rules: + - alert: NamespaceStopWrites + expr: aerospike_namespace_stop_writes{job="aerospike" } == 1 + for: 30s + labels: + severity: critical + annotations: + summary: "Stop writes for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Used disk space for namespace {{ $labels.ns }} in node {{ $labels.instance }} is above the stop writes limit." + + - alert: AerospikeAllFlashAverageObjectsPerSprig + expr: ( ((aerospike_namespace_master_objects { job="aerospike" }/4096)/aerospike_namespace_partition_tree_sprigs{ job="aerospike" } ) and ignoring (index, sindex) ((aerospike_namespace_index_type_mounts_size_limit { job="aerospike" }) or (aerospike_namespace_sindex_type_mounts_size_limit { job="aerospike" }) ))> 50 + for: 30s + labels: + severity: warn + annotations: + summary: "Average Objects per sprig in {{ $labels.instance }}/{{ $labels.ns }}" + description: "Average objects per sprig has been breached for namespace {{ $labels.ns }} in node {{ $labels.instance }}. 
" + + - alert: AerospikeAverageObjectsPerSprig + expr: ( ((aerospike_namespace_master_objects { job="aerospike" }/4096)/aerospike_namespace_partition_tree_sprigs{ job="aerospike" } ) unless ignoring (index, sindex) ((aerospike_namespace_index_type_mounts_size_limit { job="aerospike" }) or (aerospike_namespace_sindex_type_mounts_size_limit { job="aerospike" }) ))> 5000 + for: 30s + labels: + severity: warn + annotations: + summary: "Average Objects per sprig in {{ $labels.instance }}/{{ $labels.ns }}" + description: "Average objects per sprig has been breached for namespace {{ $labels.ns }} in node {{ $labels.instance }}. " + + - alert: AerospikeIndexStageSizeWarn + # Check here: https://docs.aerospike.com/reference/configuration#index-stage-size + # <128mb or >4gb -- send warn alert + expr: (aerospike_namespace_index_stage_size{job="aerospike" }>4000000000) + for: 1m + labels: + severity: warn + annotations: + summary: "Index stage size configuration is not configured according to documentation in {{ $labels.instance }}/{{ $labels.ns }}" + description: "Index stage size configuration is not configured according to documentation in {{ $labels.ns }} in node {{ $labels.instance }}. " + + - alert: AerospikeSIndexStageSizeWarn + # Check here: https://docs.aerospike.com/reference/configuration#sindex-stage-size + # <128mb or >4gb -- send warn alert + expr: (aerospike_namespace_sindex_stage_size{job="aerospike" }>4000000000) + for: 1m + labels: + severity: warn + annotations: + summary: "SIndex stage size configuration is not configured according to documentation in {{ $labels.instance }}/{{ $labels.ns }}" + description: "SIndex stage size configuration is not configured according to documentation in {{ $labels.ns }} in node {{ $labels.instance }}. " + + - alert: AerospikeIndexPressureDirtyMemoryWarn + # Check here: https://docs.aerospike.com/reference/info#index-pressure + expr: (((aerospike_namespace_index_pressure_dirty_memory{ job="aerospike" })/(aerospike_namespace_index_pressure_total_memory{ job="aerospike" })*100)>10000000) + for: 1m + labels: + severity: warn + annotations: + summary: "Dirty memory ratio against the total memory is above configured limit in node {{ $labels.instance }}" + description: "Dirty memory ration against the total memory is above configured limit in node {{ $labels.instance }}" + + - alert: NamespaceDiskCloseToStopWrites + expr: (aerospike_namespace_device_available_pct{job="aerospike" } - aerospike_namespace_storage_engine_min_avail_pct{job="aerospike" }) <= 10 + for: 30s + labels: + severity: warn + annotations: + summary: "Close to stop writes for {{ $labels.instance }}/{{ $labels.ns }} due to device_available_pct" + description: "device_available_pct for namespace {{ $labels.ns }} in node {{ $labels.instance }} is close to min-avail-pct (stop writes) limit." + + - alert: NamespaceMemoryCloseToStopWrites + expr: (aerospike_namespace_stop_writes_pct{job="aerospike" } - (100 - aerospike_namespace_memory_free_pct{job="aerospike" })) <= 10 + for: 30s + labels: + severity: warn + annotations: + summary: "Close to stop writes for {{ $labels.instance }}/{{ $labels.ns }} due to memory " + description: "Free memory for namespace {{ $labels.ns }} in node {{ $labels.instance }} is close to stop writes limit." 
+ + - alert: NamespacePmemCloseToStopWrites + expr: (aerospike_namespace_pmem_available_pct{job="aerospike" } - aerospike_namespace_storage_engine_min_avail_pct{job="aerospike" }) <= 10 + for: 30s + labels: + severity: warn + annotations: + summary: "Close to stop writes for {{ $labels.instance }}/{{ $labels.ns }} due to pmem_available_pct" + description: "pmem_available_pct for namespace {{ $labels.ns }} in node {{ $labels.instance }} is close to min-avail-pct (stop writes) limit." + + - alert: NamespaceFreeMemoryCloseToStopWrites + expr: (aerospike_namespace_stop_writes_sys_memory_pct{job="aerospike" } - scalar(100 - (aerospike_node_stats_system_free_mem_pct{job="aerospike" }))) <= 10 + for: 30s + labels: + severity: critical + annotations: + summary: "Close to stop writes for {{ $labels.instance }}/{{ $labels.ns }} due to memory" + description: "Free memory for namespace {{ $labels.ns }} in node {{ $labels.instance }} is close to stop writes limit." + + - alert: ActiveProxies + expr: (increase(aerospike_namespace_client_proxy_complete{job="aerospike" }[2m]) + increase(aerospike_namespace_client_proxy_timeout{job="aerospike" }[2m]) + increase(aerospike_namespace_client_proxy_error{job="aerospike" }[2m]) + increase(aerospike_namespace_batch_sub_proxy_complete{job="aerospike" }[2m]) + increase(aerospike_namespace_batch_sub_proxy_timeout{job="aerospike" }[2m]) + increase(aerospike_namespace_batch_sub_proxy_error{job="aerospike" }[2m])) > 0 + for: 30s + labels: + severity: warn + annotations: + summary: "Node is proxying transactions. Proxies can happen during cluster change / migrations or if there are any network issues." + description: "Active proxies detected for {{ $labels.ns }} on node {{ $labels.instance }}" + + - alert: NamespaceSupervisorFallingBehind + expr: aerospike_namespace_objects{job="aerospike"}>0 and aerospike_namespace_nsup_cycle_deleted_pct{job="aerospike" } > 1 # (Aerospike 6.3 and later) + for: 30s + labels: + severity: critical + annotations: + summary: "NSUP is falling behind; check how long the most recent NSUP cycle lasted" + description: "The namespace supervisor (NSUP) appears to be falling behind for namespace {{ $labels.ns }} in node {{ $labels.instance }}; check the length of time the most recent NSUP cycle lasted" + + - alert: HwmBreached + expr: aerospike_namespace_hwm_breached{job="aerospike" } == 1 + for: 30s + labels: + severity: warn + annotations: + summary: "High water mark breached for {{ $labels.instance }}/{{ $labels.ns }}" + description: "high-water-disk-pct or high-water-memory-pct has been breached for namespace {{ $labels.ns }} in node {{ $labels.instance }}. Eviction may start to recover disk space." + + - alert: LowDeviceAvailWarning + expr: aerospike_namespace_device_available_pct{job="aerospike" } < 55 + for: 30s + labels: + severity: warn + annotations: + summary: "Device available warning for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Device available has dropped below 55% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate that defrag is unable to keep up with the current load, and may result in stop writes if it continues to drop." + + - alert: LowDeviceAvailCritical + expr: aerospike_namespace_device_available_pct{job="aerospike" } < 25 + for: 30s + labels: + severity: critical + annotations: + summary: "Device available critically low for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Device available has dropped below 25% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. 
May indicate that defrag is unable to keep up with the current load, and may result in stop writes if it continues to drop." + + - alert: ClientTimeouts + expr: rate(aerospike_namespace_client_read_timeout{job="aerospike" }[1m]) > 1 or rate(aerospike_namespace_client_write_timeout{job="aerospike" }[1m]) > 1 or rate(aerospike_namespace_client_tsvc_timeout{job="aerospike" }[1m]) > 1 + for: 1m + labels: + severity: critical + annotations: + summary: "Client transactions are timing out" + description: "Client connections timing out at a rate greater than 1/s. Timeouts can occur during network issues or resource contention on the client and/or server nodes." + + - alert: LowMemoryNamespaceWarning + expr: aerospike_namespace_memory_free_pct{job="aerospike" } < 20 + for: 30s + labels: + severity: warn + annotations: + summary: "Memory available warning for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Memory free has dropped below 20% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate a need to reduce the object count or increase capacity." + + - alert: LowMemoryNamespaceCritical + expr: aerospike_namespace_memory_free_pct{job="aerospike" } < 15 + for: 30s + labels: + severity: critical + annotations: + summary: "Memory available critically low for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Memory free has dropped below 15% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate a need to reduce the object count or increase capacity." + + - alert: DeviceWriteQWarning + expr: aerospike_namespace_storage_engine_device_write_q{job="aerospike" } > 1 + for: 30s + labels: + severity: warn + annotations: + summary: "Device write queue high for {{ $labels.instance }}/{{ $labels.ns }}/{{ $labels.device_index }}" + description: "Device write queue is greater than 1 for namespace {{ $labels.ns }} on device {{ $labels.device_index }} in node {{ $labels.instance }}. May indicate underperforming storage subsystem or hotkeys." + + - alert: ShadowDeviceWriteQWarning + expr: aerospike_namespace_storage_engine_device_shadow_write_q{job="aerospike" } > 1 + for: 30s + labels: + severity: warn + annotations: + summary: "Shadow device write queue high for {{ $labels.instance }}/{{ $labels.ns }}/{{ $labels.device_index }}" + description: "Shadow device write queue is greater than 1 for namespace {{ $labels.ns }} on device {{ $labels.device_index }} in node {{ $labels.instance }}. May indicate underperforming storage subsystem or hotkeys." + + - alert: DeviceDefragQWarning + expr: aerospike_namespace_storage_engine_device_defrag_q{job="aerospike" }> 1000 + for: 5m + labels: + severity: warn + annotations: + summary: "Device defrag queue high for {{ $labels.instance }}/{{ $labels.ns }}/{{ $labels.device_index }}" + description: "Device defrag queue has been above 1000 for more than 5m for namespace {{ $labels.ns }} on device {{ $labels.device_index }} in node {{ $labels.instance }}. May indicate underperforming storage subsystem or hotkeys." 
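+ + # The rule files in this directory can be validated offline before the ConfigMap is + # applied. A minimal sketch, assuming promtool as shipped in the prom/prometheus image: + # docker run --rm -v "$PWD:/rules" --entrypoint promtool prom/prometheus:latest check rules /rules/aerospike_rules.yml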
+ + - alert: ClockSkewStopWrites + expr: aerospike_namespace_clock_skew_stop_writes{job="aerospike" } == 1 + for: 30s + labels: + severity: critical + annotations: + summary: "Clock skew stop writes" + description: "Clock has skewed for namespace {{ $labels.ns }} in node {{ $labels.instance }}" + + - alert: UnavailablePartitions + expr: aerospike_namespace_unavailable_partitions{job="aerospike" } > 0 + for: 30s + labels: + severity: critical + annotations: + summary: "Some partitions are inaccessible, and roster nodes are missing from the cluster." + description: "Some partitions are not available for namespace {{ $labels.ns }} on node {{ $labels.instance }}. Check for network issues and make sure the cluster forms properly." + + - alert: DeadPartitions + expr: aerospike_namespace_dead_partitions{job="aerospike" } > 2 + for: 30s + labels: + severity: critical + annotations: + summary: "There are unavailable partitions, even though all roster nodes are present in the cluster." + description: "Some partitions are dead for namespace {{ $labels.ns }} on node {{ $labels.instance }}. More than replication-factor nodes had an unclean shutdown, and there may be data loss. The revive command will be required to make the partitions available again." + + - alert: NamespaceDataCloseToStopWrites + expr: (aerospike_namespace_data_avail_pct{job="aerospike" } - aerospike_namespace_storage_engine_stop_writes_avail_pct{job="aerospike" }) <= 10 + for: 30s + labels: + severity: warn + annotations: + summary: "Close to stop writes for {{ $labels.instance }}/{{ $labels.ns }} due to data_avail_pct" + description: "data_avail_pct for namespace {{ $labels.ns }} in node {{ $labels.instance }} is close to stop-writes-avail-pct limit." + + - alert: LowDataAvailWarning + expr: aerospike_namespace_data_avail_pct{job="aerospike" } < 55 + for: 30s + labels: + severity: warn + annotations: + summary: "Data available warning for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Data available has dropped below 55% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate that defrag is unable to keep up with the current load, and may result in stop writes if it continues to drop." + + - alert: LowDataAvailCritical + expr: aerospike_namespace_data_avail_pct{job="aerospike" } < 25 + for: 30s + labels: + severity: critical + annotations: + summary: "Data available critically low for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Data available has dropped below 25% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate that defrag is unable to keep up with the current load, and may result in stop writes if it continues to drop." + + - alert: HighDataUseNamespaceWarning + expr: aerospike_namespace_data_used_pct{job="aerospike" , storage_engine="memory" } > 80 + for: 30s + labels: + severity: warn + annotations: + summary: "Data utilization warning for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Data used has crossed above 80% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate a need to reduce the object count or increase capacity." + + - alert: HighDataUseNamespaceCritical + expr: aerospike_namespace_data_used_pct{job="aerospike" , storage_engine="memory" } > 85 + for: 30s + labels: + severity: critical + annotations: + summary: "Data utilization critically high for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Data used has crossed above 85% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. 
May indicate a need to reduce the object count or increase capacity." + + - name: aerospike_aerospike.rules > NODE + rules: + - alert: PrometheusNodeExporterNotPresent + expr: absent(node_cpu_seconds_total) == 1 + for: 30s + labels: + severity: warn + annotations: + summary: "Prometheus Node Exporter is not configured" + description: "Prometheus Node Exporter is not configured in {{ $labels.instance }}" + + - alert: BestPracticesFailure + expr: aerospike_node_stats_failed_best_practices{job="aerospike" } > 0 + for: 30s + labels: + severity: warn + annotations: + summary: "Best Practices check failed on {{ $labels.instance }} in cluster {{ $labels.cluster_name }}" + description: "Best Practices check failed on {{ $labels.instance }} in cluster {{ $labels.cluster_name }}" + + - alert: ClusterSize + expr: aerospike_node_stats_cluster_size{job="aerospike" } < 3 + for: 30s + labels: + severity: critical + annotations: + summary: "Cluster size lower than expected" + description: "Cluster size mismatch for node {{ $labels.instance }}" + + - alert: ClientConnectionsWarning + expr: aerospike_node_stats_client_connections{job="aerospike" } > 11 + for: 30s + labels: + severity: warn + annotations: + summary: "Client connections warning" + description: "Client connections are greater than 11. Connections will fail if they exceed proto-fd-max." + - alert: ClientConnectionsCritical + expr: aerospike_node_stats_client_connections{job="aerospike" } > 10000 + for: 30s + labels: + severity: critical + annotations: + summary: "Client connections critical" + description: "Client connections are greater than expected peak of 10000." + + - alert: ClientConnectionChurn + expr: rate(aerospike_node_stats_client_connections_opened{job="aerospike" }[1m]) > 100 or rate(aerospike_node_stats_client_connections_closed{job="aerospike" }[1m]) > 100 + for: 1m + labels: + severity: critical + annotations: + summary: "Clients are churning connections at a high rate" + description: "Client connections are being opened or closed at a rate greater than 100/s. Connection churn can increase latency and client timeouts which in turn cause the client to open more connections." + + - alert: ClockSkewWarning + expr: aerospike_node_stats_cluster_clock_skew_ms{job="aerospike" } > 2000 + for: 30s + labels: + severity: warn + annotations: + summary: "Cluster clock skew warning" + description: "Current maximum clock skew between nodes is above 2000 ms - stop writes will trigger when skew exceeds 20 seconds if nsup-period is non-zero." + + - alert: ClockSkewCritical + expr: aerospike_node_stats_cluster_clock_skew_ms{job="aerospike" } > 20000 + for: 30s + labels: + severity: critical + annotations: + summary: "Cluster clock skew critical alert" + description: "Current maximum clock skew between nodes is above 20000 ms - stop writes will trigger when skew exceeds 20 seconds if nsup-period is non-zero." + + - alert: LowMemorySystemWarning + expr: aerospike_node_stats_system_free_mem_pct{job="aerospike" } < 20 + for: 30s + labels: + severity: warn + annotations: + summary: "Memory available warning for {{ $labels.instance }}" + description: "Total memory free has dropped below 20% for node {{ $labels.instance }}." + + - alert: LowMemorySystemCritical + expr: aerospike_node_stats_system_free_mem_pct{job="aerospike" } < 10 + for: 30s + labels: + severity: critical + annotations: + summary: "Memory available critically low for {{ $labels.instance }}" + description: "Total memory free has dropped below 10% for node {{ $labels.instance }}."
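+ + # During planned maintenance the node-level alerts above can be silenced through the + # bundled Alertmanager. A hedged sketch using amtool (shipped in the prom/alertmanager + # image) and the Service name this kustomization generates: + # amtool silence add alertname=LowMemorySystemWarning --alertmanager.url=http://aerospike-monitoring-stack-alertmanager:9093 --duration=2h --comment="planned node maintenance"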
+ + - alert: HeapEfficiencyWarning + #expr: aerospike_node_stats_heap_efficiency_pct{job="aerospike" } < 60 + expr: (100 - aerospike_node_stats_system_free_mem_pct{job="aerospike" }) > 70 and aerospike_node_stats_heap_efficiency_pct{job="aerospike" } < 60 + for: 30s + labels: + severity: warn + annotations: + summary: "Heap efficiency warning for {{ $labels.instance }}" + description: "Heap efficiency for node for {{ $labels.instance }} has dropped below 60%." + + - alert: RwInProgressWarning + expr: aerospike_node_stats_rw_in_progress{job="aerospike" }> 100 + for: 30s + labels: + severity: warn + annotations: + summary: "Read/write queue too high for {{ $labels.instance }}/{{ $labels.ns }}/{{ $labels.device_index }}" + description: "Read/write queue is greater than 100 for namespace {{ $labels.ns }} on device {{ $labels.device_index }} in node {{ $labels.instance }}. May indicate underperforming storage subsystem or hotkeys." + + - name: aerospike_aerospike.rules > SET + rules: + - alert: pre7x_NamespaceSetQuotaWarning + expr: (((aerospike_sets_device_data_bytes{job="aerospike" } + aerospike_sets_memory_data_bytes{job="aerospike" }) / (aerospike_sets_stop_writes_size{job="aerospike" } != 0)) * 100) > 80 + for: 30s + labels: + severity: warn + annotations: + description: "Nearing memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}." + summary: "One of your nodes is at % of the quota you have configured on the set." + + - alert: pre7x_NamespaceSetQuotaAlertCritical + expr: (((aerospike_sets_device_data_bytes{job="aerospike" } + aerospike_sets_memory_data_bytes{job="aerospike" }) / (aerospike_sets_stop_writes_size{job="aerospike" } != 0)) * 100) > 99 + for: 30s + labels: + severity: critical + annotations: + description: "At or Above memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}." + summary: "One of your nodes is at % of the quota you have configured on the set." + + - alert: NamespaceSetQuotaWarning + expr: (((aerospike_sets_data_used_bytes{job="aerospike" } ) / (aerospike_sets_stop_writes_size{job="aerospike" } != 0)) * 100) > 80 + for: 30s + labels: + severity: warn + annotations: + description: "Nearing memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}." + summary: "One of your nodes is at % of the quota you have configured on the set." + + - alert: NamespaceSetQuotaAlertCritical + expr: (((aerospike_sets_data_used_bytes{job="aerospike" } ) / (aerospike_sets_stop_writes_size{job="aerospike" } != 0)) * 100) > 99 + for: 30s + labels: + severity: critical + annotations: + description: "At or Above memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}." + summary: "One of your nodes is at % of the quota you have configured on the set." + + - name: aerospike_aerospike.rules > LATENCIES + rules: + - alert: ReadLatencyP95Warning + expr: histogram_quantile(0.95, (aerospike_latencies_read_ms_bucket{job="aerospike" })) > 2 + for: 2m + labels: + severity: warn + annotations: + summary: "Read latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "95th percentile read latency breached 2ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}." 
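+ + # The latency rules in this group compute histogram_quantile per node. If a single + # cluster-wide percentile is preferred, the buckets can be aggregated first -- a + # sketch, not part of the shipped rules: + # histogram_quantile(0.95, sum by (le, ns) (aerospike_latencies_read_ms_bucket{job="aerospike"})) > 2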
+ + - alert: ReadLatencyP99Warning + expr: histogram_quantile(0.99, (aerospike_latencies_read_ms_bucket{job="aerospike" })) > 4 + for: 2m + labels: + severity: warn + annotations: + summary: "Read latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "99th percentile read latency breached 4ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}." + + - alert: ReadLatencyP999Warning + expr: histogram_quantile(0.999, (aerospike_latencies_read_ms_bucket{job="aerospike" })) > 16 + for: 2m + labels: + severity: warn + annotations: + summary: "Read latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "99.9th percentile read latency breached 16ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}." + + - alert: WriteLatencyP95Warning + expr: histogram_quantile(0.95, (aerospike_latencies_write_ms_bucket{job="aerospike" })) > 4 + for: 2m + labels: + severity: warn + annotations: + summary: "Write latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "95th percentile write latency breached 4ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}." + + - alert: WriteLatencyP99Warning + expr: histogram_quantile(0.99, (aerospike_latencies_write_ms_bucket{job="aerospike" })) > 16 + for: 2m + labels: + severity: warn + annotations: + summary: "Write latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "99th percentile write latency breached 16ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}." + + - alert: WriteLatencyP999Warning + expr: histogram_quantile(0.999, (aerospike_latencies_write_ms_bucket{job="aerospike" })) > 64 + for: 2m + labels: + severity: warn + annotations: + summary: "Write latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "99.9th percentile write latency breached 64ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}." + + + - name: aerospike_aerospike.rules > XDR + rules: + + - alert: XDRTimelag + expr: aerospike_xdr_lag{job="aerospike" } > 5 + for: 2m + labels: + severity: warn + annotations: + summary: "XDR lag for namespace {{ $labels.ns }} exceeding 5 second(s) from node {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "XDR lag may be due to network connectivity issues, inability for the source to keep up with incoming writes, or write failures at the destination." + - alert: XDRAbandonedRecords + expr: rate(aerospike_xdr_abandoned{job="aerospike" }[1m]) > 0 + for: 30s + labels: + severity: warn + annotations: + summary: "Abandoned records detected for XDR on node {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "Records abandoned at a destination cluster may indicate a configuration mismatch for the namespace between source and destination." + - alert: XDRRetryNoNode + expr: rate(aerospike_xdr_retry_no_node{job="aerospike" }[1m]) > 0 + for: 30s + labels: + severity: warn + annotations: + summary: "XDR retries occurring on node {{ $labels.instance }} to DC {{ $labels.dc }} due to unknown master node destination" + description: "XDR cannot determine which destination node is the master." + + - alert: XDRRetryConnReset + expr: rate(aerospike_xdr_retry_conn_reset{job="aerospike" }[1m]) > 2 + for: 2m + labels: + severity: warn + annotations: + summary: "Rate of XDR connection resets greater than 2/s from {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "XDR retries occurring due to timeouts, network problems, or destination node restarts."
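+ + # When any of the XDR alerts in this group fire, the per-DC lag can be inspected + # directly in the Prometheus console with a query such as (sketch): + # max by (dc, ns) (aerospike_xdr_lag{job="aerospike"})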
+ + - alert: XDRRetryDest + expr: rate(aerospike_xdr_retry_dest{job="aerospike" }[1m]) > 5 + for: 2m + labels: + severity: warn + annotations: + summary: "Increase in XDR write retries is greater than 5/s from {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "XDR retries due to errors returned by the destination node, i.e. key busy or device overload." + + - alert: XDRLatencyWarning + expr: aerospike_xdr_latency_ms{job="aerospike" } > 100 + for: 30s + labels: + severity: warn + annotations: + summary: "XDR latency above 100ms from {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "Network latency between XDR source and destination over the last 30s is higher than expected." + + - alert: XDRLap + expr: aerospike_xdr_lap_us{job="aerospike" } > 75000 + for: 30s + labels: + severity: warn + annotations: + summary: "XDR lap time greater than 75000 microseconds from {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "The XDR processing cycle time (lap_us) is approaching the configured period-ms value." + + - alert: XDRRecoveries + expr: increase(aerospike_xdr_recoveries{job="aerospike" }[1m]) > 0 + for: 2m + labels: + severity: critical + annotations: + summary: "XDR recoveries increasing on {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "XDR recoveries happen during rewind or may indicate that the in-memory transaction queue is full (the transaction-queue-limit may be too small)." \ No newline at end of file diff --git a/config/monitoring/prometheus/config/alert-rules/node_exporter_alerts.yml b/config/monitoring/prometheus/config/alert-rules/node_exporter_alerts.yml new file mode 100644 index 000000000..ce2298672 --- /dev/null +++ b/config/monitoring/prometheus/config/alert-rules/node_exporter_alerts.yml @@ -0,0 +1,777 @@ +groups: + - name: node_exporter_alerts + rules: + - alert: HostNodeExporterDownCritical + expr: up{job="node-exporter"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host ({{ $labels.instance }}) is down in cluster {{ $labels.cluster_name }}" + description: "Failed to scrape {{ $labels.job }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} for more than 1 minute. node-exporter seems down." + + - alert: HostMemoryFillingUpWarn + expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100 ) > 70 + for: 1m + labels: + severity: warn + annotations: + summary: "Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Memory is filling up (> 70%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostMemoryFillingUpCritical + expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100 ) > 90 + for: 1m + labels: + severity: critical + annotations: + summary: "Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Node memory is filling up (> 90%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }} " + + - alert: HostDiskSpaceFillingUpWarn + expr: 100 - (node_filesystem_avail_bytes{job="node-exporter"} * 100) / node_filesystem_size_bytes{job="node-exporter"} > 70 and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host disk space is filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is crossing (> 70%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostDiskSpaceFillingUpCritical + expr: 100 - (node_filesystem_avail_bytes{job="node-exporter"} * 100) / node_filesystem_size_bytes{job="node-exporter"} > 90 and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host disk space is filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is crossing (> 90%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }} " + + - alert: HostInodesFillingUpWarn + expr: 100 - node_filesystem_files_free{job="node-exporter"} / node_filesystem_files{job="node-exporter"} * 100 > 70 and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host inodes filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is running out of available inodes (> 70%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostInodesFillingUpCritical + expr: 100 - node_filesystem_files_free{job="node-exporter"} / node_filesystem_files{job="node-exporter"} * 100 > 90 and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host inodes filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is running out of available inodes (> 90%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }} " + + - alert: HostUnusualDiskReadLatencyWarn + expr: (node_disk_read_time_seconds_total{job="node-exporter"}) / (node_disk_reads_completed_total{job="node-exporter"}) > 0.1 and (node_disk_reads_completed_total{job="node-exporter"}) > 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read latency is increasing (read operations > 0.1s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskReadLatencyCritical + expr: (node_disk_read_time_seconds_total{job="node-exporter"}) / (node_disk_reads_completed_total{job="node-exporter"}) > 0.5 and (node_disk_reads_completed_total{job="node-exporter"}) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read latency is increasing (read operations > 0.5s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }} " + + - alert: HostUnusualDiskWriteLatencyWarn + expr: (node_disk_write_time_seconds_total{job="node-exporter"}) / (node_disk_writes_completed_total{job="node-exporter"}) > 0.1 and (node_disk_writes_completed_total{job="node-exporter"}) > 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual disk write latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write latency is increasing (write operations > 0.1s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteLatencyCritical + expr: (node_disk_write_time_seconds_total{job="node-exporter"}) / (node_disk_writes_completed_total{job="node-exporter"}) > 0.5 and (node_disk_writes_completed_total{job="node-exporter"}) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual disk write latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write latency is increasing (write operations > 0.5s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostHighCpuUtilizationWarnHost + expr: sum by (instance) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 70 + for: 30s + labels: + severity: warn + annotations: + summary: "Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is crossing (> 70%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostHighCpuUtilizationCriticalHost + expr: sum by (instance) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 90 + for: 30s + labels: + severity: critical + annotations: + summary: "Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is crossing (> 90%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostHighCpuUtilizationWarnCore + expr: sum by (instance, cpu) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 70 + for: 30s + labels: + severity: warn + annotations: + summary: "Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is crossing (> 70%) for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostHighCpuUtilizationCriticalCore + expr: sum by (instance, cpu) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 90 + for: 30s + labels: + severity: critical + annotations: + summary: "Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is crossing (> 90%) for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }}" + + - alert: HostCpuStealWarnHost + expr: sum by (instance)(node_cpu_seconds_total{mode="steal", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 3 + for: 30s + labels: + severity: warn + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is (> 3%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}. A noisy neighbor is killing VM performance or a spot instance may be out of credit. " + + - alert: HostCpuStealCriticalHost + expr: sum by (instance)(node_cpu_seconds_total{mode="steal", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 5 + for: 30s + labels: + severity: critical + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is (> 5%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}. A noisy neighbor is killing VM performance or a spot instance may be out of credit. " + + - alert: HostCpuStealWarnCore + expr: sum by (instance, cpu)(node_cpu_seconds_total{mode="steal", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 3 + for: 30s + labels: + severity: warn + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is (> 3%) for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}. A noisy neighbor is killing VM performance or a spot instance may be out of credit. " + + - alert: HostCpuStealCriticalCore + expr: sum by (instance, cpu)(node_cpu_seconds_total{mode="steal", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 5 + for: 30s + labels: + severity: critical + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is (> 5%) for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}. A noisy neighbor is killing VM performance or a spot instance may be out of credit. " + + - alert: HostNetworkReceiveErrorsWarn + expr: ((node_network_receive_errs_total{job="node-exporter"}) / (node_network_receive_packets_total{job="node-exporter"})) * 100 > 3 + for: 1m + labels: + severity: warn + annotations: + summary: "Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance interface has encountered {{ $value }} receive errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }} " + + - alert: HostNetworkReceiveErrorsCritical + expr: ((node_network_receive_errs_total{job="node-exporter"}) / (node_network_receive_packets_total{job="node-exporter"})) * 100 > 5 + for: 1m + labels: + severity: critical + annotations: + summary: "Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance interface has encountered {{ $value }} receive errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }} " + + - alert: HostNetworkTransmitErrorsWarn + expr: ((node_network_transmit_errs_total{job="node-exporter"}) / (node_network_transmit_packets_total{job="node-exporter"})) * 100 > 3 + for: 1m + labels: + severity: warn + annotations: + summary: "Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance has encountered {{ $value }} transmit errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostNetworkTransmitErrorsCritical + expr: ((node_network_transmit_errs_total{job="node-exporter"}) / (node_network_transmit_packets_total{job="node-exporter"})) * 100 > 5 + for: 1m + labels: + severity: critical + annotations: + summary: "Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance has encountered {{ $value }} transmit errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostNetworkInterfaceSaturatedWarn + expr: ((node_network_receive_bytes_total{job="node-exporter"}) + (node_network_transmit_bytes_total{job="node-exporter"})) / (node_network_speed_bytes{job="node-exporter"}) > 0.8 + for: 1m + labels: + severity: warn + annotations: + summary: "Host Network Interface Saturated ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "The network interface is getting overloaded (> 0.8) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostNetworkInterfaceSaturatedCritical + expr: ((node_network_receive_bytes_total{job="node-exporter"}) + (node_network_transmit_bytes_total{job="node-exporter"})) / (node_network_speed_bytes{job="node-exporter"}) > 0.9 + for: 1m + labels: + severity: critical + annotations: + summary: "Host Network Interface Saturated on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "The network interface is getting overloaded (> 0.9) on host {{ $labels.instance }}:{{ $labels.device }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostClockNotSynchronisingWarn + expr: min_over_time(node_timex_sync_status{job="node-exporter"}[2m]) == 0 and node_timex_maxerror_seconds{job="node-exporter"} >= 16 + for: 2m + labels: + severity: warn + annotations: + summary: "Host clock not synchronising on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Clock not synchronising on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostSwapInWarn + expr: (node_vmstat_pswpin{job="node-exporter"}) > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host PageSwapIn value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapIn (moving data from swap space on disk back into physical memory (RAM)) value exceeds 5 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}."
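+ + # node_vmstat_pswpin mirrors the pswpin counter from /proc/vmstat. Swap activity can + # be confirmed on the node itself -- a sketch, where <node-name> is a placeholder and + # kubectl debug mounts the host filesystem under /host: + # kubectl debug node/<node-name> -it --image=busybox -- sh -c 'grep pswp /host/proc/vmstat'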
+ + - alert: HostSwapInCritical + expr: (node_vmstat_pswpin{job="node-exporter"}) > 10 + for: 1m + labels: + severity: critical + annotations: + summary: "Host PageSwapIn value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapIn (data from swap space on disk back into the physical memory (RAM)) value exceeds 10 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostSwapOutWarn + expr: (node_vmstat_pswpout{job="node-exporter"}) > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host PageSwapOut value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapOut (move data from RAM to swap space on disk to free up space in memory) value exceeds 5 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostSwapOutCritical + expr: (node_vmstat_pswpout{job="node-exporter"}) > 10 + for: 1m + labels: + severity: critical + annotations: + summary: "Host PageSwapOut value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapOut (move data from RAM to swap space on disk to free up space in physical memory) value exceeds 10 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostMemoryFillingUpWarn(Rate) + expr: (rate(node_memory_MemAvailable_bytes{job="node-exporter"}[1m]) / node_memory_MemTotal_bytes{job="node-exporter"}) * 100 > 15 + for: 1m + labels: + severity: warn + annotations: + summary: "Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Node memory is filling up (> 15%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostMemoryFillingUpCritical(Rate) + expr: (rate(node_memory_MemAvailable_bytes{job="node-exporter"}[1m]) / node_memory_MemTotal_bytes{job="node-exporter"}) * 100 > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Node memory is filling up (> 30%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostMemoryUnderMemoryPressureWarn(Rate) + expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) > 3 + for: 1m + labels: + severity: warn + annotations: + summary: "Host memory under memory pressure on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "High rate of major page faults on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostMemoryUnderMemoryPressureCritical(Rate) + expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) > 5 + for: 1m + labels: + severity: critical + annotations: + summary: "Host memory under memory pressure on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "High rate of major page faults on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostDiskSpaceFillingUpWarn(Rate) + expr: avg by (instance)(rate(node_filesystem_avail_bytes{job="node-exporter"}[1m]) * 100 / node_filesystem_size_bytes{job="node-exporter"}) > 15 + for: 1m + labels: + severity: warn + annotations: + summary: "Host disk space is filling up on ({{ $labels.instance }}) of cluster {{
$labels.cluster_name }}" + description: "Disk usage is crossing the threshold (> 15%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostDiskSpaceFillingUpCritical(Rate) + expr: avg by (instance)(rate(node_filesystem_avail_bytes{job="node-exporter"}[1m]) * 100 / node_filesystem_size_bytes{job="node-exporter"}) > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host disk space is filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk usage is crossing the threshold (> 30%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostInodesFillingUpWarn(Rate) + expr: (rate(node_filesystem_files_free{job="node-exporter"}[1m])) / node_filesystem_files{job="node-exporter"} * 100 > 20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host inodes filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is running out of available inodes (> 20%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostInodesFillingUpCritical(Rate) + expr: (rate(node_filesystem_files_free{job="node-exporter"}[1m])) / node_filesystem_files{job="node-exporter"} * 100 > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host inodes filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is running out of available inodes (> 30%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostUnusualDiskReadLatencyWarn(Rate) + expr: rate(node_disk_read_time_seconds_total{job="node-exporter"}[1m]) / rate(node_disk_reads_completed_total{job="node-exporter"}[1m]) > 0.05 and rate(node_disk_reads_completed_total{job="node-exporter"}[1m]) > 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read latency is increasing (read operations > 0.05s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostUnusualDiskReadLatencyCritical(Rate) + expr: rate(node_disk_read_time_seconds_total{job="node-exporter"}[1m]) / rate(node_disk_reads_completed_total{job="node-exporter"}[1m]) > 0.1 and rate(node_disk_reads_completed_total{job="node-exporter"}[1m]) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read latency is increasing (read operations > 0.1s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteLatencyWarn(Rate) + expr: rate(node_disk_write_time_seconds_total{job="node-exporter"}[1m]) / rate(node_disk_writes_completed_total{job="node-exporter"}[1m]) > 0.05 and rate(node_disk_writes_completed_total{job="node-exporter"}[1m]) > 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual disk write latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write latency is increasing (write operations > 0.05s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteLatencyCritical(Rate) + expr: rate(node_disk_write_time_seconds_total{job="node-exporter"}[1m]) /
rate(node_disk_writes_completed_total{job="node-exporter"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{job="node-exporter"}[1m]) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual disk write latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write latency is increasing (write operations > 0.1s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateCPUutilizationWarn(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host high CPU utilization on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is > 20% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateCPUutilizationCritical(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host high CPU utilization on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is > 30% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateCPUutilizationWarn(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host high CPU utilization on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is > 20% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateCPUutilizationCritical(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host high CPU utilization on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is > 30% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostCpuStealRateWarn(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 5% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor may be degrading VM performance, or a spot instance may be out of credit.
VALUE = {{ $value }}" + + - alert: HostCpuStealRateCritical(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 8 + for: 1m + labels: + severity: critical + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 8% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor may be degrading VM performance, or a spot instance may be out of credit. VALUE = {{ $value }}" + + - alert: HostCpuStealRateWarn(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 5% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor may be degrading VM performance, or a spot instance may be out of credit. VALUE = {{ $value }}" + + - alert: HostCpuStealRateCritical(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 8 + for: 1m + labels: + severity: critical + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 8% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor may be degrading VM performance, or a spot instance may be out of credit.
VALUE = {{ $value }}" + + - alert: HostContextSwitchingWarn(Rate) + expr: (rate(node_context_switches_total{job="node-exporter"}[1m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle", job="node-exporter"})) > 1000 + for: 1m + labels: + severity: warn + annotations: + summary: "Host context switching on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Context switching is increasing (> 1000 /s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostContextSwitchingCritical(Rate) + expr: (rate(node_context_switches_total{job="node-exporter"}[1m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle", job="node-exporter"})) > 2000 + for: 1m + labels: + severity: critical + annotations: + summary: "Host context switching on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Context switching is increasing (> 2000 /s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostNetworkReceiveErrorsWarn(Rate) + expr: (rate(node_network_receive_errs_total{job="node-exporter"}[30s]) / rate(node_network_receive_packets_total{job="node-exporter"}[30s])) * 100 > 3 + for: 30s + labels: + severity: warn + annotations: + summary: "Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance interface has a receive error ratio of {{ $value }}% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostNetworkReceiveErrorsCritical(Rate) + expr: (rate(node_network_receive_errs_total{job="node-exporter"}[30s]) / rate(node_network_receive_packets_total{job="node-exporter"}[30s])) * 100 > 5 + for: 30s + labels: + severity: critical + annotations: + summary: "Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance interface has a receive error ratio of {{ $value }}% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostNetworkTransmitErrorsWarn(Rate) + expr: (rate(node_network_transmit_errs_total{job="node-exporter"}[30s]) / rate(node_network_transmit_packets_total{job="node-exporter"}[30s])) * 100 > 3 + for: 30s + labels: + severity: warn + annotations: + summary: "Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance has a transmit error ratio of {{ $value }}% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostNetworkTransmitErrorsCritical(Rate) + expr: (rate(node_network_transmit_errs_total{job="node-exporter"}[30s]) / rate(node_network_transmit_packets_total{job="node-exporter"}[30s])) * 100 > 5 + for: 30s + labels: + severity: critical + annotations: + summary: "Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance has a transmit error ratio of {{ $value }}% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostNetworkInterfaceSaturatedWarn(Rate) + expr: ((rate(node_network_receive_bytes_total{job="node-exporter"}[1m]) + rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])) / (node_network_speed_bytes{job="node-exporter"})) * 100 > 80 + for: 1m + labels: + severity: warn + annotations: +
summary: "Host Network Interface Saturated on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "The network interface is getting overloaded (> 80%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostNetworkInterfaceSaturatedCritical(Rate) + expr: ((rate(node_network_receive_bytes_total{job="node-exporter"}[1m]) + rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])) / (node_network_speed_bytes{job="node-exporter"})) * 100 > 90 + for: 1m + labels: + severity: critical + annotations: + summary: "Host Network Interface Saturated on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "The network interface is getting overloaded (> 90%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostSwapInRateWarn + expr: rate(node_vmstat_pswpin{job="node-exporter"}[1m]) > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host PageSwapIn rate is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapIn rate exceeds 5 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current rate is {{ $value }}." + + - alert: HostSwapInRateCritical + expr: rate(node_vmstat_pswpin{job="node-exporter"}[1m]) > 10 + for: 1m + labels: + severity: critical + annotations: + summary: "Host PageSwapIn rate is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapIn rate exceeds 10 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current rate is {{ $value }}." + + - alert: HostSwapOutRateWarn + expr: rate(node_vmstat_pswpout{job="node-exporter"}[1m]) > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host PageSwapOut rate is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapOut rate exceeds 5 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current rate is {{ $value }}." + + - alert: HostSwapOutRateCritical + expr: rate(node_vmstat_pswpout{job="node-exporter"}[1m]) > 10 + for: 1m + labels: + severity: critical + annotations: + summary: "Host PageSwapOut rate is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapOut rate exceeds 10 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current rate is {{ $value }}." + + - alert: HostDiskReadIOPSWarn(Host) + expr: sum by(instance) (rate(node_disk_reads_completed_total{job="node-exporter"}[1m])) > 300 + for: 1m + labels: + severity: warn + annotations: + summary: "High disk read IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk read IOPS rate (> 300) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskReadIOPSCritical(Host) + expr: sum by(instance) (rate(node_disk_reads_completed_total{job="node-exporter"}[1m])) > 500 + for: 1m + labels: + severity: critical + annotations: + summary: "High disk read IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk read IOPS rate (> 500) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}."
+ + - alert: HostDiskReadIOPSWarn(Device) + expr: sum by(instance, device) (rate(node_disk_reads_completed_total{job="node-exporter"}[1m])) > 100 + for: 1m + labels: + severity: warn + annotations: + summary: "High disk read IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk read IOPS rate (> 100) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskReadIOPSCritical(Device) + expr: sum by(instance, device) (rate(node_disk_reads_completed_total{job="node-exporter"}[1m])) > 250 + for: 1m + labels: + severity: critical + annotations: + summary: "High disk read IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk read IOPS rate (> 250) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskWriteIOPSWarn(Host) + expr: sum by(instance) (rate(node_disk_writes_completed_total{job="node-exporter"}[1m])) > 300 + for: 1m + labels: + severity: warn + annotations: + summary: "High disk write IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk write IOPS rate (> 300) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskWriteIOPSCritical(Host) + expr: sum by(instance) (rate(node_disk_writes_completed_total{job="node-exporter"}[1m])) > 500 + for: 1m + labels: + severity: critical + annotations: + summary: "High disk write IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk write IOPS rate (> 500) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskWriteIOPSWarn(Device) + expr: sum by(instance, device) (rate(node_disk_writes_completed_total{job="node-exporter"}[1m])) > 100 + for: 1m + labels: + severity: warn + annotations: + summary: "High disk write IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk write IOPS rate (> 100) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskWriteIOPSCritical(Device) + expr: sum by(instance, device) (rate(node_disk_writes_completed_total{job="node-exporter"}[1m])) > 250 + for: 1m + labels: + severity: critical + annotations: + summary: "High disk write IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk write IOPS rate (> 250) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." 
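During planned bulk loads or migrations the IOPS alerts above are expected to fire, so silencing them for the maintenance window may be preferable to ignoring them. A hedged sketch using `amtool` (the Alertmanager CLI); the Alertmanager URL and instance matcher below are illustrative values, not part of this patch:

```sh
# Silence the device-level write IOPS warning for two hours on a single node.
amtool silence add \
  --alertmanager.url=http://localhost:9093 \
  --duration=2h \
  --comment='planned bulk load' \
  'alertname=HostDiskWriteIOPSWarn(Device)' 'instance=10.0.0.5:9100'
```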
+ + - alert: HostRateUnusualNetworkThroughputInWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual network throughput in rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Network receive throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 20% (or less than -20%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputInCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual network throughput in rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Network receive throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 30% (or less than -30%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputInWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual network throughput in rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Network receive throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 20% (or less than -20%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputInCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual network throughput in rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Network receive throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 30% (or less than -30%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputOutWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by
(instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual network throughput out rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Network transmit throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 20% (or less than -20%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputOutCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual network throughput out rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Network transmit throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 30% (or less than -30%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputOutWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual network throughput out rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Network transmit throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 20% (or less than -20%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputOutCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual network throughput out rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Network transmit throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 30% (or less than -30%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostUnusualDiskReadRateWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 30s + labels: + severity:
warn + annotations: + summary: "Host unusual disk read rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 20% (or less than -20%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostUnusualDiskReadRateCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 30s + labels: + severity: critical + annotations: + summary: "Host unusual disk read rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 30% (or less than -30%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostUnusualDiskReadRateWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 30s + labels: + severity: warn + annotations: + summary: "Host unusual disk read rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 20% (or less than -20%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostUnusualDiskReadRateCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 30s + labels: + severity: critical + annotations: + summary: "Host unusual disk read rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 30% (or less than -30%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteRateWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 30s + labels: + severity: warn + annotations: + summary: "Host unusual disk write rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 20% (or less than -20%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteRateCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) *
100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 30s + labels: + severity: critical + annotations: + summary: "Host unusual disk write rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 30% (or less than -30%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteRateWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 30s + labels: + severity: warn + annotations: + summary: "Host unusual disk write rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 20% (or less than -20%) from its 1m average. VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteRateCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 30s + labels: + severity: critical + annotations: + summary: "Host unusual disk write rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write throughput on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} has deviated more than 30% (or less than -30%) from its 1m average. VALUE = {{ $value }}" diff --git a/config/monitoring/prometheus/config/prometheus.yml b/config/monitoring/prometheus/config/prometheus.yml new file mode 100644 index 000000000..c5b45d07a --- /dev/null +++ b/config/monitoring/prometheus/config/prometheus.yml @@ -0,0 +1,88 @@ +global: + scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. + evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
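The scrape and alerting configuration that follows can also be checked offline before it is packaged into the ConfigMap. A minimal sketch, assuming the file is saved locally as `prometheus.yml`; note that `promtool` also tries to load the files listed under `rule_files`, so the check is easiest to run inside the Prometheus container, where `/etc/prometheus/alert-rules.d/` is mounted:

```sh
# Parse the Prometheus configuration and load the referenced rule files.
promtool check config prometheus.yml
```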
+alerting: + alertmanagers: + - kubernetes_sd_configs: + - role: pod + selectors: + - role: pod + label: app.kubernetes.io/component=alertmanager +rule_files: + - "/etc/prometheus/alert-rules.d/aerospike_rules.yml" + - "/etc/prometheus/alert-rules.d/node_exporter_alerts.yml" +scrape_configs: + - job_name: "aerospike-kubernetes-operator" + honor_timestamps: true + scrape_interval: 15s + scrape_timeout: 10s + metrics_path: /metrics + scheme: https + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + insecure_skip_verify: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_control_plane, __meta_kubernetes_service_labelpresent_control_plane] + separator: ; + regex: (controller-manager);true + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: https + replacement: $1 + action: keep + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - default + - aerospike +# - job_name: "kubernetes-cadvisor" +# scheme: https +# metrics_path: /metrics/cadvisor +# kubernetes_sd_configs: +# - role: node +# tls_config: +# ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt +# authorization: +# credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token +# relabel_configs: +# - action: labelmap +# regex: __meta_kubernetes_node_label_(.+) + - job_name: 'event_exporter' # https://github.com/caicloud/event_exporter/blob/master/deploy/README.md + static_configs: + - targets: ['event-exporter:9102'] + - job_name: 'node-exporter' # https://devopscube.com/node-exporter-kubernetes/ + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [ __meta_kubernetes_endpoints_name ] + regex: 'node-exporter' + action: keep + - job_name: 'aerospike' + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - default + - aerospike + relabel_configs: + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: keep + regex: aerospike-cluster + replacement: $1 + separator: ; + source_labels: + - __meta_kubernetes_pod_label_app + - action: keep + regex: exporter + replacement: $1 + separator: ; + source_labels: + - __meta_kubernetes_pod_container_port_name \ No newline at end of file diff --git a/config/monitoring/prometheus/kustomization.yaml b/config/monitoring/prometheus/kustomization.yaml new file mode 100644 index 000000000..b75eeaee4 --- /dev/null +++ b/config/monitoring/prometheus/kustomization.yaml @@ -0,0 +1,28 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +labels: + - includeSelectors: true + pairs: + app.kubernetes.io/name: aerospike-prometheus + app.kubernetes.io/component: prometheus + +resources: + - statefulset.yaml + - pvc.yaml + - service.yaml + - serviceaccount.yaml + - clusterrole.yaml + - clusterrolebinding.yaml + +configMapGenerator: + - name: prometheus-config + files: + - config/prometheus.yml + - name: alert-rules-config + files: + - config/alert-rules/aerospike_rules.yml + - config/alert-rules/node_exporter_alerts.yml + +generatorOptions: + disableNameSuffixHash: true \ No newline at end of file diff --git a/config/monitoring/prometheus/pvc.yaml b/config/monitoring/prometheus/pvc.yaml new file mode 100644 index 000000000..d722303c3 --- /dev/null +++ b/config/monitoring/prometheus/pvc.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: prometheus-data +spec: + 
accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi diff --git a/config/monitoring/prometheus/service.yaml b/config/monitoring/prometheus/service.yaml new file mode 100644 index 000000000..e25d1ac2b --- /dev/null +++ b/config/monitoring/prometheus/service.yaml @@ -0,0 +1,11 @@ +kind: Service +apiVersion: v1 +metadata: + name: prometheus +spec: + ports: + - name: http + port: 9090 + protocol: TCP + targetPort: 9090 + sessionAffinity: ClientIP \ No newline at end of file diff --git a/config/monitoring/prometheus/serviceaccount.yaml b/config/monitoring/prometheus/serviceaccount.yaml new file mode 100644 index 000000000..f671fc5ab --- /dev/null +++ b/config/monitoring/prometheus/serviceaccount.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus \ No newline at end of file diff --git a/config/monitoring/prometheus/statefulset.yaml b/config/monitoring/prometheus/statefulset.yaml new file mode 100644 index 000000000..094f9508d --- /dev/null +++ b/config/monitoring/prometheus/statefulset.yaml @@ -0,0 +1,61 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: prometheus +spec: + replicas: 1 + podManagementPolicy: Parallel + updateStrategy: + type: RollingUpdate + template: + spec: + serviceAccountName: aerospike-monitoring-stack-prometheus + containers: + - name: prometheus-server + image: "prom/prometheus:latest" + imagePullPolicy: "IfNotPresent" + args: + - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.path=/data + - --web.listen-address=:9090 + ports: + - containerPort: 9090 + readinessProbe: + httpGet: + path: /-/ready + port: 9090 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 10 + successThreshold: 1 + failureThreshold: 3 + livenessProbe: + httpGet: + path: /-/healthy + port: 9090 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 10 + successThreshold: 1 + failureThreshold: 3 + volumeMounts: + - name: config-volume + mountPath: /etc/prometheus + - name: prometheus-data + mountPath: /data + - mountPath: /etc/prometheus/alert-rules.d + name: alert-manager-rules + terminationGracePeriodSeconds: 120 + securityContext: + fsGroup: 65534 + volumes: + - name: config-volume + configMap: + name: aerospike-monitoring-stack-prometheus-config + - name: prometheus-data + persistentVolumeClaim: + claimName: aerospike-monitoring-stack-prometheus-data + - name: alert-manager-rules + configMap: + defaultMode: 420 + name: aerospike-monitoring-stack-alert-rules-config \ No newline at end of file diff --git a/helm-charts/aerospike-kubernetes-operator/values.yaml b/helm-charts/aerospike-kubernetes-operator/values.yaml index faea89650..6cd9c0b73 100644 --- a/helm-charts/aerospike-kubernetes-operator/values.yaml +++ b/helm-charts/aerospike-kubernetes-operator/values.yaml @@ -28,7 +28,7 @@ certs: webhookServerCertSecretName: "webhook-server-cert" ## Operator configurations -watchNamespaces: "default" +watchNamespaces: "default,aerospike" # Registry used to pull aerospike-init image aerospikeKubernetesInitRegistry: "docker.io" From fbd9d181c6a06865fc6a97d90c0a33d4b3dfd4df Mon Sep 17 00:00:00 2001 From: Abhisek Dwivedi Date: Tue, 3 Sep 2024 12:29:48 +0530 Subject: [PATCH 2/2] KO-328: Helm charts for AerospikeBackupService, AerospikeBackup and AerospikeRestore CRs (#309) * Added helm charts for backup/restore --- .../aerospike-backup-service/.helmignore | 23 +++++++ .../aerospike-backup-service/Chart.yaml | 17 +++++ .../aerospike-backup-service/README.md | 59 ++++++++++++++++ .../templates/NOTES.txt | 21 
++++++ .../templates/_helpers.tpl | 44 ++++++++++++ .../aerospike-backup-service-cr.yaml | 34 +++++++++ .../templates/serviceaccount.yaml | 13 ++++ .../aerospike-backup-service/values.yaml | 69 +++++++++++++++++++ helm-charts/aerospike-backup/.helmignore | 23 +++++++ helm-charts/aerospike-backup/Chart.yaml | 17 +++++ helm-charts/aerospike-backup/README.md | 56 +++++++++++++++ .../aerospike-backup/templates/NOTES.txt | 19 +++++ .../aerospike-backup/templates/_helpers.tpl | 44 ++++++++++++ .../templates/aerospike-backup-cr.yaml | 23 +++++++ helm-charts/aerospike-backup/values.yaml | 43 ++++++++++++ helm-charts/aerospike-restore/.helmignore | 23 +++++++ helm-charts/aerospike-restore/Chart.yaml | 17 +++++ helm-charts/aerospike-restore/README.md | 55 +++++++++++++++ .../aerospike-restore/templates/NOTES.txt | 19 +++++ .../aerospike-restore/templates/_helpers.tpl | 44 ++++++++++++ .../templates/aerospike-restore-cr.yaml | 24 +++++++ helm-charts/aerospike-restore/values.yaml | 44 ++++++++++++ 22 files changed, 731 insertions(+) create mode 100644 helm-charts/aerospike-backup-service/.helmignore create mode 100644 helm-charts/aerospike-backup-service/Chart.yaml create mode 100644 helm-charts/aerospike-backup-service/README.md create mode 100644 helm-charts/aerospike-backup-service/templates/NOTES.txt create mode 100644 helm-charts/aerospike-backup-service/templates/_helpers.tpl create mode 100644 helm-charts/aerospike-backup-service/templates/aerospike-backup-service-cr.yaml create mode 100644 helm-charts/aerospike-backup-service/templates/serviceaccount.yaml create mode 100644 helm-charts/aerospike-backup-service/values.yaml create mode 100644 helm-charts/aerospike-backup/.helmignore create mode 100644 helm-charts/aerospike-backup/Chart.yaml create mode 100644 helm-charts/aerospike-backup/README.md create mode 100644 helm-charts/aerospike-backup/templates/NOTES.txt create mode 100644 helm-charts/aerospike-backup/templates/_helpers.tpl create mode 100644 helm-charts/aerospike-backup/templates/aerospike-backup-cr.yaml create mode 100644 helm-charts/aerospike-backup/values.yaml create mode 100644 helm-charts/aerospike-restore/.helmignore create mode 100644 helm-charts/aerospike-restore/Chart.yaml create mode 100644 helm-charts/aerospike-restore/README.md create mode 100644 helm-charts/aerospike-restore/templates/NOTES.txt create mode 100644 helm-charts/aerospike-restore/templates/_helpers.tpl create mode 100644 helm-charts/aerospike-restore/templates/aerospike-restore-cr.yaml create mode 100644 helm-charts/aerospike-restore/values.yaml diff --git a/helm-charts/aerospike-backup-service/.helmignore b/helm-charts/aerospike-backup-service/.helmignore new file mode 100644 index 000000000..0e8a0eb36 --- /dev/null +++ b/helm-charts/aerospike-backup-service/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/helm-charts/aerospike-backup-service/Chart.yaml b/helm-charts/aerospike-backup-service/Chart.yaml new file mode 100644 index 000000000..8a774e72f --- /dev/null +++ b/helm-charts/aerospike-backup-service/Chart.yaml @@ -0,0 +1,17 @@ +apiVersion: v2 +type: application +name: aerospike-backup-service + +# version tracks chart changes +version: 3.3.1 +# appVersion tracks operator version +appVersion: 3.3.1 + +description: A Helm chart for Aerospike Backup Service Custom Resource +icon: https://avatars0.githubusercontent.com/u/2214313?s=200&v=4 + +sources: + - https://github.com/aerospike/aerospike-kubernetes-operator +maintainers: + - name: Aerospike + email: developers@aerospike.com diff --git a/helm-charts/aerospike-backup-service/README.md b/helm-charts/aerospike-backup-service/README.md new file mode 100644 index 000000000..81a16e16e --- /dev/null +++ b/helm-charts/aerospike-backup-service/README.md @@ -0,0 +1,59 @@ +# Aerospike Backup Service (Custom Resource) Helm Chart + +A Helm chart for `AerospikeBackupService` custom resource to be used with the Aerospike Kubernetes Operator. + +## Prerequisites + +- Kubernetes 1.19+ +- Aerospike Kubernetes Operator + +## Usage + +### Add Helm Repository + +```sh +helm repo add aerospike https://aerospike.github.io/aerospike-kubernetes-enterprise +helm repo update +``` + +### Deploy Aerospike Backup Service + +#### Install the chart + +The `<namespace>` used to install the Aerospike backup service chart must be included in the `watchNamespaces` value of +aerospike-kubernetes-operator's `values.yaml` + +```sh +# helm install aerospike-backup-service aerospike/aerospike-backup-service --namespace <namespace> +helm install aerospike-backup-service aerospike/aerospike-backup-service +``` + +It is recommended to create a separate YAML file with configurations as per your requirements and use it +with `helm install`. + +```sh +helm install aerospike-backup-service aerospike/aerospike-backup-service \ + -f <customized-values-yaml-file> +``` + +## Configurations + +| Name | Description | Default | +|------------------------------|-------------------------------------------------------------------------------|------------------------------------------------------------------------------| +| `image.repository` | Aerospike backup service container image repository | `aerospike.jfrog.io/ecosystem-container-prod-local/aerospike-backup-service` | +| `image.tag` | Aerospike backup service container image tag | `1.0.0` | +| `customLabels` | Custom labels to add on the AerospikeBackupService resource | `{}` (nil) | +| `serviceAccount.create` | Enable ServiceAccount creation for Aerospike backup service. | true | +| `serviceAccount.annotations` | ServiceAccount annotations | `{}` (nil) | +| `backupServiceConfig` | Aerospike backup service configuration | `{}` (nil) | +| `secrets` | Secrets to be mounted in the Aerospike Backup Service pod, e.g. AWS creds | `[]` (nil) | +| `resources` | Aerospike backup service pod resource requirements | `{}` (nil) | +| `service` | Kubernetes service configuration for Aerospike backup service | `{}` (nil) | + + +### Configurations Explained + +[//]: # (TODO: Update below link when the documentation is available.)
+Refer +to [AerospikeBackupService Custom Resource Spec](https://docs.aerospike.com/cloud/kubernetes/operator/cluster-configuration-settings#spec) +for details on the above [configuration fields](#Configurations) diff --git a/helm-charts/aerospike-backup-service/templates/NOTES.txt b/helm-charts/aerospike-backup-service/templates/NOTES.txt new file mode 100644 index 000000000..8090fc502 --- /dev/null +++ b/helm-charts/aerospike-backup-service/templates/NOTES.txt @@ -0,0 +1,21 @@ +Thank you for installing {{ .Chart.Name }}-{{ .Chart.AppVersion }}. +Release Name - {{ .Release.Name }}. + + + /\ + + .' '. * + * /======\ + + ;:. _ ; + |:. (_) | + |:. _ | + + |:. (_) | * + ;:. ; + .' \:. / `. + / .-'':._.'`-. \ + |/ /||\ \| + +Run the following commands to get more information about deployment: + +$ helm status {{ .Release.Name }} --namespace {{ .Release.Namespace }} +$ helm get all {{ .Release.Name }} --namespace {{ .Release.Namespace }} + +$ kubectl get all --namespace {{ .Release.Namespace }} -l "release={{ .Release.Name }}, chart={{ $.Chart.Name }}" diff --git a/helm-charts/aerospike-backup-service/templates/_helpers.tpl b/helm-charts/aerospike-backup-service/templates/_helpers.tpl new file mode 100644 index 000000000..dc9ee07a1 --- /dev/null +++ b/helm-charts/aerospike-backup-service/templates/_helpers.tpl @@ -0,0 +1,44 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "aerospike-backup-service.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "aerospike-backup-service.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Aerospike Backup Service common name. +*/}} +{{- define "aerospike-backup-service.commonName" -}} +{{- if .Values.commonName -}} +{{- .Values.commonName -}} +{{- else -}} +{{- .Release.Name | trunc 63 | replace "-" "" -}} +{{- end -}} +{{- end -}} + +{{/* +Selector labels +*/}} +{{- define "aerospike-backup-service.selectorLabels" -}} +app.kubernetes.io/name: {{ include "aerospike-backup-service.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "aerospike-backup-service.labels" -}} +helm.sh/chart: {{ include "aerospike-backup-service.chart" . }} +{{ include "aerospike-backup-service.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} \ No newline at end of file diff --git a/helm-charts/aerospike-backup-service/templates/aerospike-backup-service-cr.yaml b/helm-charts/aerospike-backup-service/templates/aerospike-backup-service-cr.yaml new file mode 100644 index 000000000..75b3c2841 --- /dev/null +++ b/helm-charts/aerospike-backup-service/templates/aerospike-backup-service-cr.yaml @@ -0,0 +1,34 @@ +apiVersion: asdb.aerospike.com/v1beta1 +kind: AerospikeBackupService +metadata: + name: {{ template "aerospike-backup-service.commonName" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "aerospike-backup-service.labels" . | nindent 4 }} + {{- with .Values.customLabels }} + {{- toYaml .
| nindent 4 }} + {{- end }} +spec: + # Aerospike Backup Service image + image: {{ .Values.image.repository }}:{{ .Values.image.tag }} + + # Aerospike Backup Service configuration + config: + {{- .Values.backupServiceConfig | toYaml | nindent 4 }} + + # Secrets to be mounted in the Aerospike Backup Service pod like aws creds etc + {{- with .Values.secrets }} + secrets: {{- toYaml . | nindent 4 }} + {{- end }} + + # Resources for the Aerospike Backup Service pod + {{- if .Values.resources }} + resources: + {{- .Values.resources | toYaml | nindent 4 }} + {{- end }} + + # Kubernetes service configuration for the Aerospike Backup Service + {{- if .Values.service }} + service: + {{- .Values.service | toYaml | nindent 4 }} + {{- end }} diff --git a/helm-charts/aerospike-backup-service/templates/serviceaccount.yaml b/helm-charts/aerospike-backup-service/templates/serviceaccount.yaml new file mode 100644 index 000000000..b62d09c3d --- /dev/null +++ b/helm-charts/aerospike-backup-service/templates/serviceaccount.yaml @@ -0,0 +1,13 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: aerospike-backup-service + namespace: {{ .Release.Namespace }} + labels: + {{- include "aerospike-backup-service.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} diff --git a/helm-charts/aerospike-backup-service/values.yaml b/helm-charts/aerospike-backup-service/values.yaml new file mode 100644 index 000000000..5b7b7cf0f --- /dev/null +++ b/helm-charts/aerospike-backup-service/values.yaml @@ -0,0 +1,69 @@ +## Default values for aerospike-backup-service. +## This is a YAML-formatted file. +## Declare variables to be passed into your templates. + +## Aerospike Backup Service common name +## Defaults to release name truncated to 63 characters (with hyphens removed) +# commonName: aerobackupservice + +nameOverride: "" + +## Image is the image for the backup service. +image: + repository: aerospike.jfrog.io/ecosystem-container-prod-local/aerospike-backup-service + tag: "1.0.0" + +## Custom labels that will be applied on the AerospikeBackupService resource +customLabels: {} + +## ServiceAccount to be used for the Aerospike Backup Service pod +serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + +## Config is the free form configuration for the backup service in YAML format. +## This config is used to start the backup service. The config is passed as a file to the backup service. +backupServiceConfig: {} +# service: +# http: +# port: 8080 +# backup-policies: +# test-policy: +# parallel: 3 +# remove-files: KeepAll +# type: 1 +# storage: +# local: +# path: /localStorage +# type: local +# s3Storage: +# type: aws-s3 +# path: "s3://test-bucket" +# s3-region: us-east-1 +# s3-profile: default + +## SecretMounts is the list of secrets to be mounted in the backup service. +secrets: [] +# - secretName: aws-secret +# volumeMount: +# name: aws-secret +# mountPath: /root/.aws/credentials +# subPath: credentials + +## Resources define the requests and limits for the backup service container. +## Resources.Limits should be more than Resources.Requests. +resources: {} +# limits: +# cpu: 100m +# memory: 128Mi +# requests: +# cpu: 100m +# memory: 128Mi + +## Service defines the Kubernetes service configuration for the backup service. +## It is used to expose the backup service deployment.
By default, the service type is ClusterIP. +service: {} +# type: ClusterIP + diff --git a/helm-charts/aerospike-backup/.helmignore b/helm-charts/aerospike-backup/.helmignore new file mode 100644 index 000000000..0e8a0eb36 --- /dev/null +++ b/helm-charts/aerospike-backup/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/helm-charts/aerospike-backup/Chart.yaml b/helm-charts/aerospike-backup/Chart.yaml new file mode 100644 index 000000000..b9f088165 --- /dev/null +++ b/helm-charts/aerospike-backup/Chart.yaml @@ -0,0 +1,17 @@ +apiVersion: v2 +type: application +name: aerospike-backup + +# version tracks chart changes +version: 3.3.1 +# appVersion tracks operator version +appVersion: 3.3.1 + +description: A Helm chart for Aerospike Backup Custom Resource +icon: https://avatars0.githubusercontent.com/u/2214313?s=200&v=4 + +sources: + - https://github.com/aerospike/aerospike-kubernetes-operator +maintainers: + - name: Aerospike + email: developers@aerospike.com diff --git a/helm-charts/aerospike-backup/README.md b/helm-charts/aerospike-backup/README.md new file mode 100644 index 000000000..fbc268aaf --- /dev/null +++ b/helm-charts/aerospike-backup/README.md @@ -0,0 +1,56 @@ +# Aerospike Backup (Custom Resource) Helm Chart + +A Helm chart for `AerospikeBackup` custom resource to be used with the Aerospike Kubernetes Operator. + +## Prerequisites + +- Kubernetes 1.19+ +- Aerospike Kubernetes Operator + +## Usage + +### Add Helm Repository + +```sh +helm repo add aerospike https://aerospike.github.io/aerospike-kubernetes-enterprise +helm repo update +``` + +### Create Aerospike Backup + +#### Install the chart + +The `<namespace>` used to install the Aerospike backup helm chart must be included in the `watchNamespaces` value of +aerospike-kubernetes-operator's `values.yaml` + +```sh +# helm install aerospike-backup aerospike/aerospike-backup --namespace <namespace> +helm install aerospike-backup aerospike/aerospike-backup +``` + +It is recommended to create a separate YAML file with configurations as per your requirements and use it +with `helm install`. + +```sh +helm install aerospike-backup aerospike/aerospike-backup \ + -f <customized-values-yaml-file> +``` + +## Configurations + +| Name | Description | Default | +|----------------------------------|------------------------------------------------------|------------| +| `customLabels` | Custom labels to add on the AerospikeBackup resource | `{}` (nil) | +| `backupService.name` | Aerospike backup service name | | +| `backupService.namespace` | Aerospike backup service namespace | | +| `backupConfig` | Aerospike backup configuration | `{}` (nil) | +| `onDemandBackups[*].id` | Unique identifier for the on-demand backup | | +| `onDemandBackups[*].routineName` | Routine name used to trigger on-demand backup | | +| `onDemandBackups[*].delay` | Delay interval before starting the on-demand backup | | + +### Configurations Explained + +[//]: # (TODO: Update below link when the documentation is available.)
+
+### Configurations Explained
+
+[//]: # (TODO: Update below link when the documentation is available.)
+Refer
+to the [AerospikeBackup Custom Resource Spec](https://docs.aerospike.com/cloud/kubernetes/operator/cluster-configuration-settings#spec)
+for details on the above [configuration fields](#Configurations).
diff --git a/helm-charts/aerospike-backup/templates/NOTES.txt b/helm-charts/aerospike-backup/templates/NOTES.txt
new file mode 100644
index 000000000..373c78e81
--- /dev/null
+++ b/helm-charts/aerospike-backup/templates/NOTES.txt
@@ -0,0 +1,19 @@
+Thank you for installing {{ .Chart.Name }}-{{ .Chart.AppVersion }}.
+Release Name - {{ .Release.Name }}.
+
+     +       /\
+            .'  '.   *
+     *     /======\      +
+           ;:.  _   ;
+           |:. (_)  |
+           |:.  _   |
+     +     |:. (_)  |    *
+           ;:.      ;
+         .' \:.    / `.
+        / .-'':._.'`-.  \
+        |/    /||\    \|
+
+Run the following commands to get more information about deployment:
+
+$ helm status {{ .Release.Name }} --namespace {{ .Release.Namespace }}
+$ helm get all {{ .Release.Name }} --namespace {{ .Release.Namespace }}
\ No newline at end of file
diff --git a/helm-charts/aerospike-backup/templates/_helpers.tpl b/helm-charts/aerospike-backup/templates/_helpers.tpl
new file mode 100644
index 000000000..d6cdda835
--- /dev/null
+++ b/helm-charts/aerospike-backup/templates/_helpers.tpl
@@ -0,0 +1,44 @@
+{{/*
+Expand the name of the chart.
+*/}}
+{{- define "aerospike-backup.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "aerospike-backup.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Aerospike Backup common name.
+*/}}
+{{- define "aerospike-backup.commonName" -}}
+{{- if .Values.commonName -}}
+{{- .Values.commonName -}}
+{{- else -}}
+{{- .Release.Name | trunc 63 | replace "-" "" -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+Selector labels
+*/}}
+{{- define "aerospike-backup.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "aerospike-backup.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end }}
+
+{{/*
+Common labels
+*/}}
+{{- define "aerospike-backup.labels" -}}
+helm.sh/chart: {{ include "aerospike-backup.chart" . }}
+{{ include "aerospike-backup.selectorLabels" . }}
+{{- if .Chart.AppVersion }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
\ No newline at end of file
diff --git a/helm-charts/aerospike-backup/templates/aerospike-backup-cr.yaml b/helm-charts/aerospike-backup/templates/aerospike-backup-cr.yaml
new file mode 100644
index 000000000..2290397c2
--- /dev/null
+++ b/helm-charts/aerospike-backup/templates/aerospike-backup-cr.yaml
@@ -0,0 +1,23 @@
+apiVersion: asdb.aerospike.com/v1beta1
+kind: AerospikeBackup
+metadata:
+  name: {{ template "aerospike-backup.commonName" . }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "aerospike-backup.labels" . | nindent 4 }}
+  {{- with .Values.customLabels }}
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  # Aerospike Backup Service reference
+  backupService:
+    {{- .Values.backupService | toYaml | nindent 4 }}
+
+  # Aerospike Backup configuration
+  config:
+    {{- .Values.backupConfig | toYaml | nindent 4 }}
+
+  # On-demand backups configuration
+  {{- with .Values.onDemandBackups }}
+  onDemandBackups: {{- toYaml . | nindent 4 }}
+  {{- end }}
\ No newline at end of file
diff --git a/helm-charts/aerospike-backup/values.yaml b/helm-charts/aerospike-backup/values.yaml
new file mode 100644
index 000000000..064aa8196
--- /dev/null
+++ b/helm-charts/aerospike-backup/values.yaml
@@ -0,0 +1,43 @@
+# Default values for aerospike-backup.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+nameOverride: ""
+
+## Custom labels that will be applied on the AerospikeBackup resource
+customLabels: {}
+
+## BackupService is the backup service reference, i.e., name and namespace.
+## It is used to communicate with the backup service to trigger backups. This field is immutable.
+backupService: {}
+# name: aerospikebackupservice
+# namespace: aerospike
+
+## Config is the free-form configuration for the backup in YAML format.
+## This config is used to trigger backups. It includes: aerospike-cluster, backup-routines
+backupConfig: {}
+# aerospike-cluster:
+#   aerospike-aerospikebackup-test-cluster: # Name format: <backup-namespace>-<backup-name>-<cluster-name>
+#     credentials:
+#       password: admin123
+#       user: admin
+#     seed-nodes:
+#       - host-name: aerocluster.aerospike.svc.cluster.local
+#         port: 3000
+# backup-routines:
+#   aerospike-aerospikebackup-test-routine: # Name format: <backup-namespace>-<backup-name>-<routine-name>
+#     backup-policy: test-policy
+#     interval-cron: "@daily"
+#     incr-interval-cron: "@hourly"
+#     namespaces: ["test"]
+#     source-cluster: aerospike-aerospikebackup-test-cluster
+#     storage: local
+
+
+## OnDemandBackups is the configuration for on-demand backups.
+onDemandBackups: []
+# - id: on-demand-backup-1
+#   routineName: aerospike-aerospikebackup-test-routine
+#   delay: 10ms
+
+
diff --git a/helm-charts/aerospike-restore/.helmignore b/helm-charts/aerospike-restore/.helmignore
new file mode 100644
index 000000000..0e8a0eb36
--- /dev/null
+++ b/helm-charts/aerospike-restore/.helmignore
@@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/helm-charts/aerospike-restore/Chart.yaml b/helm-charts/aerospike-restore/Chart.yaml
new file mode 100644
index 000000000..ef766f049
--- /dev/null
+++ b/helm-charts/aerospike-restore/Chart.yaml
@@ -0,0 +1,17 @@
+apiVersion: v2
+type: application
+name: aerospike-restore
+
+# version tracks chart changes
+version: 3.3.1
+# appVersion tracks operator version
+appVersion: 3.3.1
+
+description: A Helm chart for Aerospike Restore Custom Resource
+icon: https://avatars0.githubusercontent.com/u/2214313?s=200&v=4
+
+sources:
+  - https://github.com/aerospike/aerospike-kubernetes-operator
+maintainers:
+  - name: Aerospike
+    email: developers@aerospike.com
diff --git a/helm-charts/aerospike-restore/README.md b/helm-charts/aerospike-restore/README.md
new file mode 100644
index 000000000..06966f8cb
--- /dev/null
+++ b/helm-charts/aerospike-restore/README.md
@@ -0,0 +1,55 @@
+# Aerospike Restore (Custom Resource) Helm Chart
+
+A Helm chart for the `AerospikeRestore` custom resource, to be used with the Aerospike Kubernetes Operator.
+
+## Prerequisites
+
+- Kubernetes 1.19+
+- Aerospike Kubernetes Operator
+
+## Usage
+
+### Add Helm Repository
+
+```sh
+helm repo add aerospike https://aerospike.github.io/aerospike-kubernetes-enterprise
+helm repo update
+```
+
+### Create Aerospike Restore
+
+#### Install the chart
+
+The namespace used to install the Aerospike restore helm chart must be included in the `watchNamespaces` value of
+the aerospike-kubernetes-operator's `values.yaml`.
+
+```sh
+# helm install <release-name> <chart> --namespace <namespace>
+helm install aerospike-restore aerospike/aerospike-restore
+```
+
+It is recommended to create a separate YAML file with your configuration overrides and pass it to
+`helm install`.
+
+```sh
+helm install aerospike-restore aerospike/aerospike-restore \
+  -f <custom-values-file>
+```
+
+## Configurations
+
+| Name                       | Description                                               | Default    |
+|----------------------------|-----------------------------------------------------------|------------|
+| `customLabels`             | Custom labels to add on the AerospikeRestore resource     | `{}` (nil) |
+| `backupService.name`       | Aerospike backup service name                             |            |
+| `backupService.namespace`  | Aerospike backup service namespace                        |            |
+| `type`                     | Type of restore. One of Full, Incremental, or Timestamp.  | `Full`     |
+| `restoreConfig`            | Aerospike restore configuration                           | `{}` (nil) |
+| `pollingPeriod`            | Polling period for restore operation status               | `60s`      |
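+
+For illustration, a custom values file for a full restore from local storage might look like the
+following sketch. The cluster name, credentials, and backup path mirror the commented samples in
+this chart's `values.yaml` and are placeholders, not defaults; the file name `restore-values.yaml`
+is likewise hypothetical.
+
+```yaml
+# restore-values.yaml (illustrative example)
+backupService:
+  name: aerospikebackupservice
+  namespace: aerospike
+
+type: Full
+
+restoreConfig:
+  destination:
+    label: destinationCluster
+    credentials:
+      user: admin
+      password: admin123
+    seed-nodes:
+      - host-name: aerocluster.test.svc.cluster.local
+        port: 3000
+  policy:
+    parallel: 3
+    no-generation: true
+    no-indexes: true
+  source:
+    path: "/localStorage/aerospike-aerospikebackup-test-routine/backup/1722326391329/data/test"
+    type: local
+
+pollingPeriod: 60s
+```
+
+```sh
+helm install aerospike-restore aerospike/aerospike-restore -f restore-values.yaml
+```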
+
+### Configurations Explained
+
+[//]: # (TODO: Update below link when the documentation is available.)
+Refer
+to the [AerospikeRestore Custom Resource Spec](https://docs.aerospike.com/cloud/kubernetes/operator/cluster-configuration-settings#spec)
+for details on the above [configuration fields](#Configurations).
diff --git a/helm-charts/aerospike-restore/templates/NOTES.txt b/helm-charts/aerospike-restore/templates/NOTES.txt
new file mode 100644
index 000000000..373c78e81
--- /dev/null
+++ b/helm-charts/aerospike-restore/templates/NOTES.txt
@@ -0,0 +1,19 @@
+Thank you for installing {{ .Chart.Name }}-{{ .Chart.AppVersion }}.
+Release Name - {{ .Release.Name }}.
+
+     +       /\
+            .'  '.   *
+     *     /======\      +
+           ;:.  _   ;
+           |:. (_)  |
+           |:.  _   |
+     +     |:. (_)  |    *
+           ;:.      ;
+         .' \:.    / `.
+        / .-'':._.'`-.  \
+        |/    /||\    \|
+
+Run the following commands to get more information about deployment:
+
+$ helm status {{ .Release.Name }} --namespace {{ .Release.Namespace }}
+$ helm get all {{ .Release.Name }} --namespace {{ .Release.Namespace }}
\ No newline at end of file
diff --git a/helm-charts/aerospike-restore/templates/_helpers.tpl b/helm-charts/aerospike-restore/templates/_helpers.tpl
new file mode 100644
index 000000000..473f4f7fc
--- /dev/null
+++ b/helm-charts/aerospike-restore/templates/_helpers.tpl
@@ -0,0 +1,44 @@
+{{/*
+Expand the name of the chart.
+*/}}
+{{- define "aerospike-restore.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "aerospike-restore.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Aerospike Restore common name.
+*/}}
+{{- define "aerospike-restore.commonName" -}}
+{{- if .Values.commonName -}}
+{{- .Values.commonName -}}
+{{- else -}}
+{{- .Release.Name | trunc 63 | replace "-" "" -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+Selector labels
+*/}}
+{{- define "aerospike-restore.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "aerospike-restore.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end }}
+
+{{/*
+Common labels
+*/}}
+{{- define "aerospike-restore.labels" -}}
+helm.sh/chart: {{ include "aerospike-restore.chart" . }}
+{{ include "aerospike-restore.selectorLabels" . }}
+{{- if .Chart.AppVersion }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
\ No newline at end of file
diff --git a/helm-charts/aerospike-restore/templates/aerospike-restore-cr.yaml b/helm-charts/aerospike-restore/templates/aerospike-restore-cr.yaml
new file mode 100644
index 000000000..3b7d3c8f1
--- /dev/null
+++ b/helm-charts/aerospike-restore/templates/aerospike-restore-cr.yaml
@@ -0,0 +1,24 @@
+apiVersion: asdb.aerospike.com/v1beta1
+kind: AerospikeRestore
+metadata:
+  name: {{ template "aerospike-restore.commonName" . }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "aerospike-restore.labels" . | nindent 4 }}
+  {{- with .Values.customLabels }}
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  # Aerospike Backup Service reference
+  backupService:
+    {{- .Values.backupService | toYaml | nindent 4 }}
+
+  # Aerospike Restore type
+  type: {{ .Values.type }}
+
+  # Aerospike Restore configuration
+  config:
+    {{- .Values.restoreConfig | toYaml | nindent 4 }}
+
+  # Polling period for restore operation status
+  pollingPeriod: {{ .Values.pollingPeriod }}
diff --git a/helm-charts/aerospike-restore/values.yaml b/helm-charts/aerospike-restore/values.yaml
new file mode 100644
index 000000000..831737553
--- /dev/null
+++ b/helm-charts/aerospike-restore/values.yaml
@@ -0,0 +1,44 @@
+# Default values for aerospike-restore.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+nameOverride: ""
+
+## Custom labels that will be applied on the AerospikeRestore resource
+customLabels: {}
+
+## BackupService is the backup service reference, i.e., name and namespace.
+## It is used to communicate with the backup service to trigger restores. This field is immutable.
+backupService: {}
+# name: aerospikebackupservice
+# namespace: aerospike
+
+## Type is the type of restore. It can be Full, Incremental, or Timestamp.
+## Based on the restore type, the relevant restore config is given.
+type: Full
+
+## Config is the free-form configuration for the restore in YAML format.
+## This config is used to trigger restores. It includes: destination, policy, source, secret-agent, time, and routine.
+restoreConfig: {}
+# destination:
+#   label: destinationCluster
+#   credentials:
+#     password: admin123
+#     user: admin
+#   seed-nodes:
+#     - host-name: aerocluster.test.svc.cluster.local
+#       port: 3000
+# policy:
+#   parallel: 3
+#   no-generation: true
+#   no-indexes: true
+# source:
+#   path: "/localStorage/aerospike-aerospikebackup-test-routine/backup/1722326391329/data/test"
+#   type: local
+
+
+## Polling period for restore operation status.
+## It is used to poll the restore service to fetch restore operation status. Default is 60 seconds.
+pollingPeriod: 60s
+