diff --git a/config/monitoring/alertmanager/config/alertmanager.yml b/config/monitoring/alertmanager/config/alertmanager.yml new file mode 100644 index 000000000..4908f2377 --- /dev/null +++ b/config/monitoring/alertmanager/config/alertmanager.yml @@ -0,0 +1,12 @@ +# This is an example alertmanager.yml which sends alert notifications to a slack channel. + +global: + slack_api_url: "https://hooks.slack.com/services/TXXXXXXXX/XXXXXXXXXXX/XXXXXXXXXXXXXXXXXXXXXXXX" +route: + group_by: ['cluster', 'service'] + receiver: slack_user + +receivers: + - name: slack_user + slack_configs: + - text: "summary: {{ .CommonAnnotations.summary }}\ndescription: {{ .CommonAnnotations.description }}" \ No newline at end of file diff --git a/config/monitoring/alertmanager/kustomization.yaml b/config/monitoring/alertmanager/kustomization.yaml new file mode 100644 index 000000000..bba6090ed --- /dev/null +++ b/config/monitoring/alertmanager/kustomization.yaml @@ -0,0 +1,21 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +labels: + - includeSelectors: true + pairs: + app.kubernetes.io/name: aerospike-alertmanager + app.kubernetes.io/component: alertmanager + +resources: + - statefulset.yaml + - pvc.yaml + - service.yaml + +configMapGenerator: + - name: alertmanager-config + files: + - config/alertmanager.yml + +generatorOptions: + disableNameSuffixHash: true \ No newline at end of file diff --git a/config/monitoring/alertmanager/pvc.yaml b/config/monitoring/alertmanager/pvc.yaml new file mode 100644 index 000000000..c577bf954 --- /dev/null +++ b/config/monitoring/alertmanager/pvc.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: alertmanager-data +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi \ No newline at end of file diff --git a/config/monitoring/alertmanager/service.yaml b/config/monitoring/alertmanager/service.yaml new file mode 100644 index 000000000..a2958c64a --- /dev/null +++ b/config/monitoring/alertmanager/service.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Service +metadata: + name: alertmanager +spec: + ports: + - name: http + port: 9093 \ No newline at end of file diff --git a/config/monitoring/alertmanager/statefulset.yaml b/config/monitoring/alertmanager/statefulset.yaml new file mode 100644 index 000000000..3dc30efac --- /dev/null +++ b/config/monitoring/alertmanager/statefulset.yaml @@ -0,0 +1,43 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: alertmanager +spec: + template: + spec: + containers: + - name: alertmanager + image: prom/alertmanager:latest + args: + - --config.file=/etc/alertmanager/alertmanager.yml + - --storage.path=/alertmanager + - --log.level=info + - --cluster.advertise-address=0.0.0.0:9093 + livenessProbe: + httpGet: + path: /-/healthy + port: 9093 + initialDelaySeconds: 25 + periodSeconds: 20 + ports: + - containerPort: 9093 + readinessProbe: + httpGet: + path: /-/ready + port: 9093 + volumeMounts: + - mountPath: /etc/alertmanager + name: alertmanager-conf + - mountPath: /alertmanager + name: alertmanager-data + securityContext: + fsGroup: 26 + serviceAccountName: aerospike-monitoring-stack-prometheus + volumes: + - name: alertmanager-data + persistentVolumeClaim: + claimName: aerospike-monitoring-stack-alertmanager-data + - name: alertmanager-conf + configMap: + defaultMode: 420 + name: aerospike-monitoring-stack-alertmanager-config \ No newline at end of file diff --git a/config/monitoring/grafana/config/aerospike_grafana_dashboards_config.yaml 
b/config/monitoring/grafana/config/aerospike_grafana_dashboards_config.yaml new file mode 100644 index 000000000..c7a75fe6d --- /dev/null +++ b/config/monitoring/grafana/config/aerospike_grafana_dashboards_config.yaml @@ -0,0 +1,11 @@ +apiVersion: 1 +providers: + - name: 'default' + folder: 'Aerospike' + folderUid: 'aerospike1' + type: file + disableDeletion: false + editable: true + updateIntervalSeconds: 10 + options: + path: /var/lib/grafana/dashboards \ No newline at end of file
diff --git a/config/monitoring/grafana/config/aerospike_grafana_datasource.yaml b/config/monitoring/grafana/config/aerospike_grafana_datasource.yaml new file mode 100644 index 000000000..968b2a660 --- /dev/null +++ b/config/monitoring/grafana/config/aerospike_grafana_datasource.yaml @@ -0,0 +1,8 @@ +apiVersion: 1 +datasources: + - name: "Aerospike Prometheus" + type: prometheus + access: proxy + url: http://aerospike-monitoring-stack-prometheus:9090 + editable: true + isDefault: false \ No newline at end of file
diff --git a/config/monitoring/grafana/config/download_files.sh b/config/monitoring/grafana/config/download_files.sh new file mode 100644 index 000000000..eaced64e7 --- /dev/null +++ b/config/monitoring/grafana/config/download_files.sh @@ -0,0 +1,40 @@ +#!/bin/sh + +# Check if curl and jq are installed; if not, install curl and jq +if ! command -v curl >/dev/null 2>&1 || ! command -v jq >/dev/null 2>&1; then + echo "curl or jq not found. Installing..." + apk add --no-cache curl jq +else + echo "curl and jq are already installed." +fi + +# Define the dashboards to download in the format ID:REVISION or ID +DASHBOARDS="16119:10 16115:7 20279" + +# Directory where the dashboards will be saved +TARGET_DIR="/mnt/data" +mkdir -p "$TARGET_DIR" + +DELIMITER=':' + +# Loop through each dashboard identifier in DASHBOARDS +for DASHBOARD in $DASHBOARDS; do + if echo "$DASHBOARD" | grep -q "$DELIMITER"; then + # If the delimiter ':' exists, split into ID and REVISION + ID=$(echo "$DASHBOARD" | cut -d"$DELIMITER" -f1) + REVISION=$(echo "$DASHBOARD" | cut -d"$DELIMITER" -f2) + FILENAME="$ID-rev$REVISION.json" + URL="https://grafana.com/api/dashboards/$ID/revisions/$REVISION/download" + curl -o "$TARGET_DIR/$FILENAME" "$URL" + else + # No delimiter, only the ID is provided + ID="$DASHBOARD" + FILENAME="$ID.json" + URL="https://grafana.com/api/dashboards/$ID" + curl -s "$URL" | jq '.json' > "$TARGET_DIR/$FILENAME" + fi +done + +# List the downloaded files +echo "Downloaded dashboard files:" +ls -l "$TARGET_DIR" \ No newline at end of file
diff --git a/config/monitoring/grafana/config/grafana.ini b/config/monitoring/grafana/config/grafana.ini new file mode 100644 index 000000000..bc0daea62 --- /dev/null +++ b/config/monitoring/grafana/config/grafana.ini @@ -0,0 +1,13 @@ +[analytics] + check_for_updates = true + [grafana_net] + url = https://grafana.net + [log] + mode = console + level = debug + [paths] + data = /var/lib/grafana/data + logs = /var/log/grafana + plugins = /var/lib/grafana/plugins + [server] + http_port = 3000 \ No newline at end of file
diff --git a/config/monitoring/grafana/kustomization.yaml b/config/monitoring/grafana/kustomization.yaml new file mode 100644 index 000000000..cfd14ad2a --- /dev/null +++ b/config/monitoring/grafana/kustomization.yaml @@ -0,0 +1,30 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +labels: + - includeSelectors: true + pairs: + app.kubernetes.io/name: aerospike-grafana + app.kubernetes.io/component: grafana + +resources: + - statefulset.yaml + - pvc.yaml + - 
service.yaml + +configMapGenerator: + - name: grafana-provisioning-datasources + files: + - config/aerospike_grafana_datasource.yaml + - name: grafana-config + files: + - config/grafana.ini + - name: grafana-dashboard-config + files: + - config/aerospike_grafana_dashboards_config.yaml + - name: download-script + files: + - config/download_files.sh + +generatorOptions: + disableNameSuffixHash: true diff --git a/config/monitoring/grafana/pvc.yaml b/config/monitoring/grafana/pvc.yaml new file mode 100644 index 000000000..3bc9acc12 --- /dev/null +++ b/config/monitoring/grafana/pvc.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: grafana-data +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi \ No newline at end of file diff --git a/config/monitoring/grafana/service.yaml b/config/monitoring/grafana/service.yaml new file mode 100644 index 000000000..8068ffb02 --- /dev/null +++ b/config/monitoring/grafana/service.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: Service +metadata: + name: grafana +spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 3000 \ No newline at end of file diff --git a/config/monitoring/grafana/statefulset.yaml b/config/monitoring/grafana/statefulset.yaml new file mode 100644 index 000000000..26914c6eb --- /dev/null +++ b/config/monitoring/grafana/statefulset.yaml @@ -0,0 +1,88 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: grafana +spec: + replicas: 1 + template: + spec: + serviceAccountName: aerospike-monitoring-stack-prometheus + terminationGracePeriodSeconds: 120 + initContainers: + - name: download-dashboards + image: alpine:latest + command: ["/bin/sh"] + args: [ "-c", "/bin/sh -x /mnt/scripts/download_files.sh" ] + volumeMounts: + - name: dashboards + mountPath: /mnt/data + - name: script-volume + mountPath: /mnt/scripts + containers: + - name: grafana + image: "grafana/grafana:latest" + imagePullPolicy: "IfNotPresent" + volumeMounts: + - name: grafana-config + mountPath: "/etc/grafana/" + - name: grafana-provisioning-datasources + mountPath: "/etc/grafana/provisioning/datasources" + - name: grafana-dashboard-config + mountPath: "/etc/grafana/provisioning/dashboards" + - name: grafana-data + mountPath: "/data" + - name: dashboards + mountPath: "/var/lib/grafana/dashboards" + ports: + - name: service + containerPort: 80 + protocol: TCP + - name: grafana + containerPort: 3000 + protocol: TCP + livenessProbe: + httpGet: + path: /api/health + port: 3000 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 10 + successThreshold: 1 + failureThreshold: 10 + readinessProbe: + httpGet: + path: /api/health + port: 3000 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 10 + successThreshold: 1 + failureThreshold: 10 + env: + - name: GF_SECURITY_ADMIN_USER + value: "admin" + - name: GF_SECURITY_ADMIN_PASSWORD + value: "admin" + - name: GF_PATHS_DATA + value: /data/grafana/data + securityContext: + fsGroup: 472 + volumes: + - name: grafana-config + configMap: + name: aerospike-monitoring-stack-grafana-config + - name: grafana-provisioning-datasources + configMap: + name: aerospike-monitoring-stack-grafana-provisioning-datasources + - name: grafana-dashboard-config + configMap: + defaultMode: 420 + name: aerospike-monitoring-stack-grafana-dashboard-config + - name: script-volume + configMap: + name: aerospike-monitoring-stack-download-script + - name: grafana-data + persistentVolumeClaim: + claimName: aerospike-monitoring-stack-grafana-data + - name: dashboards 
+ emptyDir: {} \ No newline at end of file
diff --git a/config/monitoring/kustomization.yaml b/config/monitoring/kustomization.yaml new file mode 100644 index 000000000..3499301bb --- /dev/null +++ b/config/monitoring/kustomization.yaml @@ -0,0 +1,16 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: monitoring + +labels: + - includeSelectors: false + pairs: + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/part-of: aerospike-monitoring-stack + +namePrefix: aerospike-monitoring-stack- + +resources: + - grafana + - prometheus + - alertmanager \ No newline at end of file
diff --git a/config/monitoring/prometheus/clusterrole.yaml b/config/monitoring/prometheus/clusterrole.yaml new file mode 100644 index 000000000..6674295f1 --- /dev/null +++ b/config/monitoring/prometheus/clusterrole.yaml @@ -0,0 +1,23 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: + - apiGroups: + - "" + resources: + - namespaces + - pods + - statefulsets + - configmaps + - secrets + - services + - nodes + - nodes/metrics + - endpoints + verbs: + - list + - watch + - get + - nonResourceURLs: ["/metrics"] + verbs: ["get"] \ No newline at end of file
diff --git a/config/monitoring/prometheus/clusterrolebinding.yaml b/config/monitoring/prometheus/clusterrolebinding.yaml new file mode 100644 index 000000000..2ff72f96b --- /dev/null +++ b/config/monitoring/prometheus/clusterrolebinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: aerospike-monitoring-stack-prometheus +subjects: + - kind: ServiceAccount + name: aerospike-monitoring-stack-prometheus + namespace: monitoring \ No newline at end of file
diff --git a/config/monitoring/prometheus/config/alert-rules/aerospike_rules.yml b/config/monitoring/prometheus/config/alert-rules/aerospike_rules.yml new file mode 100644 index 000000000..b0e2d7107 --- /dev/null +++ b/config/monitoring/prometheus/config/alert-rules/aerospike_rules.yml @@ -0,0 +1,566 @@ +groups: + - name: aerospike.rules + rules: + - alert: AerospikeExporterAgentDown + expr: up{job="aerospike"} == 0 + for: 30s + labels: + severity: warn + annotations: + summary: "Aerospike Prometheus exporter job {{ $labels.instance }} down" + description: "{{ $labels.instance }} has been down for more than 30s." + + - alert: AerospikeNodeDown + expr: aerospike_node_up{job="aerospike"} == 0 + for: 30s + labels: + severity: warn + annotations: + summary: "Node {{ $labels.instance }} down" + description: "{{ $labels.instance }} node is down." + + - name: aerospike_aerospike.rules > NAMESPACE + rules: + - alert: NamespaceStopWrites + expr: aerospike_namespace_stop_writes{job="aerospike" } == 1 + for: 30s + labels: + severity: critical + annotations: + summary: "Stop writes for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Used disk space for namespace {{ $labels.ns }} in node {{ $labels.instance }} is above stop writes limit."
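+ # An Aerospike namespace always has 4096 partitions, so master_objects/4096 is the per-partition object count and dividing again by partition-tree-sprigs estimates objects per sprig; the mounts_size_limit matchers below separate the All Flash case (threshold 50) from the default case (threshold 5000).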
+ + - alert: AerospikeAllFlashAverageObjectsPerSprig + expr: ( ((aerospike_namespace_master_objects { job="aerospike" }/4096)/aerospike_namespace_partition_tree_sprigs{ job="aerospike" } ) and ignoring (index, sindex) ((aerospike_namespace_index_type_mounts_size_limit { job="aerospike" }) or (aerospike_namespace_sindex_type_mounts_size_limit { job="aerospike" }) ))> 50 + for: 30s + labels: + severity: warn + annotations: + summary: "Average Objects per sprig in {{ $labels.instance }}/{{ $labels.ns }}" + description: "The average objects per sprig threshold has been breached for namespace {{ $labels.ns }} in node {{ $labels.instance }}."
+ + - alert: AerospikeAverageObjectsPerSprig + expr: ( ((aerospike_namespace_master_objects { job="aerospike" }/4096)/aerospike_namespace_partition_tree_sprigs{ job="aerospike" } ) unless ignoring (index, sindex) ((aerospike_namespace_index_type_mounts_size_limit { job="aerospike" }) or (aerospike_namespace_sindex_type_mounts_size_limit { job="aerospike" }) ))> 5000 + for: 30s + labels: + severity: warn + annotations: + summary: "Average Objects per sprig in {{ $labels.instance }}/{{ $labels.ns }}" + description: "The average objects per sprig threshold has been breached for namespace {{ $labels.ns }} in node {{ $labels.instance }}."
+ + - alert: AerospikeIndexStageSizeWarn + # Check here: https://docs.aerospike.com/reference/configuration#index-stage-size + # <128mb or >4gb -- send warn alert + expr: (aerospike_namespace_index_stage_size{job="aerospike" }>4000000000) + for: 1m + labels: + severity: warn + annotations: + summary: "Index stage size is not configured according to the documentation in {{ $labels.instance }}/{{ $labels.ns }}" + description: "Index stage size is not configured according to the documentation for namespace {{ $labels.ns }} in node {{ $labels.instance }}."
+ + - alert: AerospikeSIndexStageSizeWarn + # Check here: https://docs.aerospike.com/reference/configuration#sindex-stage-size + # <128mb or >4gb -- send warn alert + expr: (aerospike_namespace_sindex_stage_size{job="aerospike" }>4000000000) + for: 1m + labels: + severity: warn + annotations: + summary: "SIndex stage size is not configured according to the documentation in {{ $labels.instance }}/{{ $labels.ns }}" + description: "SIndex stage size is not configured according to the documentation for namespace {{ $labels.ns }} in node {{ $labels.instance }}."
+ + - alert: AerospikeIndexPressureDirtyMemoryWarn + # Check here: https://docs.aerospike.com/reference/info#index-pressure
+ expr: (((aerospike_namespace_index_pressure_dirty_memory{ job="aerospike" })/(aerospike_namespace_index_pressure_total_memory{ job="aerospike" })*100)>10000000) + for: 1m + labels: + severity: warn + annotations: + summary: "Dirty memory ratio against the total memory is above configured limit in node {{ $labels.instance }}" + description: "Dirty memory ratio against the total memory is above the configured limit in node {{ $labels.instance }}"
+ + - alert: NamespaceDiskCloseToStopWrites + expr: (aerospike_namespace_device_available_pct{job="aerospike" } - aerospike_namespace_storage_engine_min_avail_pct{job="aerospike" }) <= 10 + for: 30s + labels: + severity: warn + annotations: + summary: "Close to stop writes for {{ $labels.instance }}/{{ $labels.ns }} due to device_available_pct" + description: "device_available_pct for namespace {{ $labels.ns }} in node {{ $labels.instance }} is close to min-avail-pct (stop writes) limit."
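+ # Like the device rule above, the CloseToStopWrites rules that follow fire when the headroom between the current value and its stop-writes threshold shrinks to 10 percentage points or less.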
+ + - alert: NamespaceMemoryCloseToStopWrites + expr: (aerospike_namespace_stop_writes_pct{job="aerospike" } - (100 - aerospike_namespace_memory_free_pct{job="aerospike" })) <= 10 + for: 30s + labels: + severity: warn + annotations: + summary: "Close to stop writes for {{ $labels.instance }}/{{ $labels.ns }} due to memory " + description: "Free memory for namespace {{ $labels.ns }} in node {{ $labels.instance }} is close to stop writes limit." + + - alert: NamespacePmemCloseToStopWrites + expr: (aerospike_namespace_pmem_available_pct{job="aerospike" } - aerospike_namespace_storage_engine_min_avail_pct{job="aerospike" }) <= 10 + for: 30s + labels: + severity: warn + annotations: + summary: "Close to stop writes for {{ $labels.instance }}/{{ $labels.ns }} due to pmem_available_pct" + description: "pmem_available_pct for namespace {{ $labels.ns }} in node {{ $labels.instance }} is close to min-avail-pct (stop writes) limit." + + - alert: NamespaceFreeMemoryCloseToStopWrites + expr: (aerospike_namespace_stop_writes_sys_memory_pct{job="aerospike" } - scalar(100 - (aerospike_node_stats_system_free_mem_pct{job="aerospike" }))) <= 10 + for: 30s + labels: + severity: critical + annotations: + summary: "Close to stop writes for {{ $labels.instance }}/{{ $labels.ns }} due to memory" + description: "Free memory for namespace {{ $labels.ns }} in node {{ $labels.instance }} is close to stop writes limit." + + - alert: ActiveProxies + expr: (increase(aerospike_namespace_client_proxy_complete{job="aerospike" }[2m]) + increase(aerospike_namespace_client_proxy_timeout{job="aerospike" }[2m]) + increase(aerospike_namespace_client_proxy_error{job="aerospike" }[2m]) + increase(aerospike_namespace_batch_sub_proxy_complete{job="aerospike" }[2m]) + increase(aerospike_namespace_batch_sub_proxy_timeout{job="aerospike" }[2m]) + increase(aerospike_namespace_batch_sub_proxy_error{job="aerospike" }[2m])) > 0 + for: 30s + labels: + severity: warn + annotations: + summary: "Node is doing proxy. Proxies can happen during cluster change / migrations or if there are any network issues." + description: "Active proxies detected for {{ $labels.ns }} on node {{ $labels.instance }}" + + - alert: NamespaceSupervisorFallingBehind + expr: aerospike_namespace_objects{job="aerospike"}>0 and aerospike_namespace_nsup_cycle_deleted_pct{job="aerospike" } > 1 # (Aerospike 6.3 and later) + for: 30s + labels: + severity: critical + annotations: + summary: "NSUP is falling behind and/or display the length of time the most recent NSUP cycle lasted" + description: "There seems some lag falling behind and/or display the length of time the most recent NSUP cycle lasted {{ $labels.ns }} in node {{ $labels.instance }}" + + - alert: HwmBreached + expr: aerospike_namespace_hwm_breached{job="aerospike" } == 1 + for: 30s + labels: + severity: warn + annotations: + summary: "High water mark breached for {{ $labels.instance }}/{{ $labels.ns }}" + description: "high-water-disk-pct or high-water-memory-pct has been breached for namespace {{ $labels.ns }} in node {{ $labels.instance }}. Eviction may start to recover disk space." + + - alert: LowDeviceAvailWarning + expr: aerospike_namespace_device_available_pct{job="aerospike" } < 55 + for: 30s + labels: + severity: warn + annotations: + summary: "Device available warning for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Device available has dropped below 55% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. 
May indicate that defrag is unable to keep up with the current load, and may result in stop writes if it continues to drop." + + - alert: LowDeviceAvailCritical + expr: aerospike_namespace_device_available_pct{job="aerospike" } < 25 + for: 30s + labels: + severity: critical + annotations: + summary: "Device available critically low for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Device available has dropped below 25% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate that defrag is unable to keep up with the current load, and may result in stop writes if it continues to drop." + + - alert: ClientTimeouts + expr: rate(aerospike_namespace_client_read_timeout{job="aerospike" }[1m]) > 1 or rate(aerospike_namespace_client_write_timeout{job="aerospike" }[1m]) > 1 or rate(aerospike_namespace_client_tsvc_timeout{job="aerospike" }[1m]) > 1 + for: 1m + labels: + severity: critical + annotations: + summary: "Client transactions are timing out" + description: "Client connections timing out at a rate greater than 1/s. Timeouts can occur during network issues or resource contention on the client and/or server nodes." + + - alert: LowMemoryNamespaceWarning + expr: aerospike_namespace_memory_free_pct{job="aerospike" } < 20 + for: 30s + labels: + severity: warn + annotations: + summary: "Memory available warning for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Memory free has dropped below 20% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate a need to reduce the object count or increase capacity." + + - alert: LowMemoryNamespaceCritical + expr: aerospike_namespace_memory_free_pct{job="aerospike" } < 15 + for: 30s + labels: + severity: critical + annotations: + summary: "Memory available critically low for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Memory free has dropped below 15% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate a need to reduce the object count or increase capacity." + + - alert: DeviceWriteQWarning + expr: aerospike_namespace_storage_engine_device_write_q{job="aerospike" } > 1 + for: 30s + labels: + severity: warn + annotations: + summary: "Device write queue high for {{ $labels.instance }}/{{ $labels.ns }}/{{ $labels.device_index }}" + description: "Device write queue is greater than 1 for namespace {{ $labels.ns }} on device {{ $labels.device_index }} in node {{ $labels.instance }}. May indicate underperforming storage subsystem or hotkeys." + + - alert: ShadowDeviceWriteQWarning + expr: aerospike_namespace_storage_engine_device_shadow_write_q{job="aerospike" } > 1 + for: 30s + labels: + severity: warn + annotations: + summary: "Shadow device write queue high for {{ $labels.instance }}/{{ $labels.ns }}/{{ $labels.device_index }}" + description: "Shadow device write queue is greater than 1 for namespace {{ $labels.ns }} on device {{ $labels.device_index }} in node {{ $labels.instance }}. May indicate underperforming storage subsystem or hotkeys." + + - alert: DeviceDefragQWarning + expr: aerospike_namespace_storage_engine_device_defrag_q{job="aerospike" }> 1000 + for: 5m + labels: + severity: warn + annotations: + summary: "Device defrag queue high for {{ $labels.instance }}/{{ $labels.ns }}/{{ $labels.device_index }}" + description: "Device defrag queue has been above 1000 for more than 5m for namespace {{ $labels.ns }} on device {{ $labels.device_index }} in node {{ $labels.instance }}. May indicate underperforming storage subsystem or hotkeys." 
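+ # clock_skew_stop_writes is set to 1 by the server once clock skew has already exceeded what the namespace can tolerate, so the rule below reports an active stop-writes condition rather than an approaching one.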
+ + - alert: ClockSkewStopWrites + expr: aerospike_namespace_clock_skew_stop_writes{job="aerospike" } == 1 + for: 30s + labels: + severity: critical + annotations: + summary: "Clock skew stop writes" + description: "Clock has skewed for namespace {{ $labels.ns }} in node {{ $labels.instance }}" + + - alert: UnavailablePartitions + expr: aerospike_namespace_unavailable_partitions{job="aerospike" } > 0 + for: 30s + labels: + severity: critical + annotations: + summary: "Some partitions are inaccessible, and roster nodes are missing from the cluster." + description: "Some partitions are not available for namespace {{ $labels.ns }} on node {{ $labels.instance }}. Check for network issues and make sure the cluster forms properly." + + - alert: DeadPartitions + expr: aerospike_namespace_dead_partitions{job="aerospike" } > 2 + for: 30s + labels: + severity: critical + annotations: + summary: "There are unavailable partition, but all roster nodes are present in the cluster." + description: "Some partitions are dead for namespace {{ $labels.ns }} on node {{ $labels.instance }}. Greater than replication-factor number nodes had an unclean shutdown, and there may be data loss. Will require the use of the revive command to make the partitions available again." + + - alert: NamespaceDataCloseToStopWrites + expr: (aerospike_namespace_data_avail_pct{job="aerospike" } - aerospike_namespace_storage_engine_stop_writes_avail_pct{job="aerospike" }) <= 10 + for: 30s + labels: + severity: warn + annotations: + summary: "Close to stop writes for {{ $labels.instance }}/{{ $labels.ns }} due to data_avail_pct" + description: "data_avail_pct for namespace {{ $labels.ns }} in node {{ $labels.instance }} is close to stop-writes-avail-pct limit." + + - alert: LowDataAvailWarning + expr: aerospike_namespace_data_avail_pct{job="aerospike" } < 55 + for: 30s + labels: + severity: warn + annotations: + summary: "Device available warning for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Device available has dropped below 55% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate that defrag is unable to keep up with the current load, and may result in stop writes if it continues to drop." + + - alert: LowDataAvailCritical + expr: aerospike_namespace_data_avail_pct{job="aerospike" } < 25 + for: 30s + labels: + severity: critical + annotations: + summary: "Device available critically low for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Device available has dropped below 25% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate that defrag is unable to keep up with the current load, and may result in stop writes if it continues to drop." + + - alert: HighDataUseNamespaceWarning + expr: aerospike_namespace_data_used_pct{job="aerospike" , storage_engine="memory" } > 80 + for: 30s + labels: + severity: warn + annotations: + summary: "Data utilization warning for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Data used has crossed above 80% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. May indicate a need to reduce the object count or increase capacity." + + - alert: HighDataUseNamespaceCritical + expr: aerospike_namespace_data_used_pct{job="aerospike" , storage_engine="memory" } > 85 + for: 30s + labels: + severity: critical + annotations: + summary: "Data utilization critically high for {{ $labels.instance }}/{{ $labels.ns }}" + description: "Data used has crossed above 85% for namespace {{ $labels.ns }} in node {{ $labels.instance }}. 
May indicate a need to reduce the object count or increase capacity."
+ + - name: aerospike_aerospike.rules > NODE + rules: + - alert: PrometheusNodeExporterNotPresent + expr: absent(node_cpu_seconds_total) == 1 + for: 30s + labels: + severity: warn + annotations: + summary: "Prometheus Node Exporter is not configured" + description: "Prometheus Node Exporter is not configured in {{ $labels.instance }}"
+ + - alert: BestPracticesFailure + expr: aerospike_node_stats_failed_best_practices{job="aerospike" } > 0 + for: 30s + labels: + severity: warn + annotations: + summary: "Best Practices check failed on {{ $labels.instance }} in cluster {{ $labels.cluster_name }}" + description: "Best Practices check failed on {{ $labels.instance }} in cluster {{ $labels.cluster_name }}"
+ + - alert: ClusterSize + expr: aerospike_node_stats_cluster_size{job="aerospike" } < 3 + for: 30s + labels: + severity: critical + annotations: + summary: "Cluster size lower than expected" + description: "Cluster size reported by node {{ $labels.instance }} is below the expected minimum of 3 nodes."
+ + - alert: ClientConnectionsWarning + expr: aerospike_node_stats_client_connections{job="aerospike" } > 11 + for: 30s + labels: + severity: warn + annotations: + summary: "Client connections warning" + description: "Client connections are greater than 11. Connections will fail if they exceed proto-fd-max." + - alert: ClientConnectionsCritical + expr: aerospike_node_stats_client_connections{job="aerospike" } > 10000 + for: 30s + labels: + severity: critical + annotations: + summary: "Client connections critical" + description: "Client connections are greater than the expected peak of 10000."
+ + - alert: ClientConnectionChurn + expr: rate(aerospike_node_stats_client_connections_opened{job="aerospike" }[1m]) > 100 or rate(aerospike_node_stats_client_connections_closed{job="aerospike" }[1m]) > 100 + for: 1m + labels: + severity: critical + annotations: + summary: "Clients are churning connections at a high rate" + description: "Client connections are being opened or closed at a rate greater than 100/s. Connection churn can increase latency and client timeouts, which in turn cause the client to open more connections."
+ + - alert: ClockSkewWarning + expr: aerospike_node_stats_cluster_clock_skew_ms{job="aerospike" } > 2000 + for: 30s + labels: + severity: warn + annotations: + summary: "Cluster clock skew warning" + description: "Current maximum clock skew between nodes is above 2000 ms - stop writes will trigger if skew reaches 20 seconds while nsup-period is non-zero."
+ + - alert: ClockSkewCritical + expr: aerospike_node_stats_cluster_clock_skew_ms{job="aerospike" } > 20000 + for: 30s + labels: + severity: critical + annotations: + summary: "Cluster clock skew critical alert" + description: "Current maximum clock skew between nodes is above 20000 ms - this triggers stop writes if nsup-period is non-zero."
+ + - alert: LowMemorySystemWarning + expr: aerospike_node_stats_system_free_mem_pct{job="aerospike" } < 20 + for: 30s + labels: + severity: warn + annotations: + summary: "Memory available warning for {{ $labels.instance }}" + description: "Total memory free has dropped below 20% for node {{ $labels.instance }}."
+ + - alert: LowMemorySystemCritical + expr: aerospike_node_stats_system_free_mem_pct{job="aerospike" } < 10 + for: 30s + labels: + severity: critical + annotations: + summary: "Memory available critically low for {{ $labels.instance }}" + description: "Total memory free has dropped below 10% for node {{ $labels.instance }}."
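+ # In the HeapEfficiencyWarning rule below, the commented-out expression alerts on heap efficiency alone; the active expression also requires overall system memory use above 70%, so lightly loaded nodes with naturally low heap efficiency stay quiet.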
+ + - alert: HeapEfficiencyWarning + #expr: aerospike_node_stats_heap_efficiency_pct{job="aerospike" } < 60 + expr: (100 - aerospike_node_stats_system_free_mem_pct{job="aerospike" }) > 70 and aerospike_node_stats_heap_efficiency_pct{job="aerospike" } < 60 + for: 30s + labels: + severity: warn + annotations: + summary: "Heap efficiency warning for {{ $labels.instance }}" + description: "Heap efficiency for node for {{ $labels.instance }} has dropped below 60%." + + - alert: RwInProgressWarning + expr: aerospike_node_stats_rw_in_progress{job="aerospike" }> 100 + for: 30s + labels: + severity: warn + annotations: + summary: "Read/write queue too high for {{ $labels.instance }}/{{ $labels.ns }}/{{ $labels.device_index }}" + description: "Read/write queue is greater than 100 for namespace {{ $labels.ns }} on device {{ $labels.device_index }} in node {{ $labels.instance }}. May indicate underperforming storage subsystem or hotkeys." + + - name: aerospike_aerospike.rules > SET + rules: + - alert: pre7x_NamespaceSetQuotaWarning + expr: (((aerospike_sets_device_data_bytes{job="aerospike" } + aerospike_sets_memory_data_bytes{job="aerospike" }) / (aerospike_sets_stop_writes_size{job="aerospike" } != 0)) * 100) > 80 + for: 30s + labels: + severity: warn + annotations: + description: "Nearing memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}." + summary: "One of your nodes is at % of the quota you have configured on the set." + + - alert: pre7x_NamespaceSetQuotaAlertCritical + expr: (((aerospike_sets_device_data_bytes{job="aerospike" } + aerospike_sets_memory_data_bytes{job="aerospike" }) / (aerospike_sets_stop_writes_size{job="aerospike" } != 0)) * 100) > 99 + for: 30s + labels: + severity: critical + annotations: + description: "At or Above memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}." + summary: "One of your nodes is at % of the quota you have configured on the set." + + - alert: NamespaceSetQuotaWarning + expr: (((aerospike_sets_data_used_bytes{job="aerospike" } ) / (aerospike_sets_stop_writes_size{job="aerospike" } != 0)) * 100) > 80 + for: 30s + labels: + severity: warn + annotations: + description: "Nearing memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}." + summary: "One of your nodes is at % of the quota you have configured on the set." + + - alert: NamespaceSetQuotaAlertCritical + expr: (((aerospike_sets_data_used_bytes{job="aerospike" } ) / (aerospike_sets_stop_writes_size{job="aerospike" } != 0)) * 100) > 99 + for: 30s + labels: + severity: critical + annotations: + description: "At or Above memory quota for {{ $labels.set }} in namespace {{ $labels.ns }} in node {{ $labels.instance }}." + summary: "One of your nodes is at % of the quota you have configured on the set." + + - name: aerospike_aerospike.rules > LATENCIES + rules: + - alert: ReadLatencyP95Warning + expr: histogram_quantile(0.95, (aerospike_latencies_read_ms_bucket{job="aerospike" })) > 2 + for: 2m + labels: + severity: warn + annotations: + summary: "Read latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "95th percentile read latency breached 2ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}." 
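+ # As with the P95 rule above, the latency rules below estimate percentiles with histogram_quantile() over the exporter's cumulative _ms_bucket histograms, so reported values are interpolations bounded by the bucket boundaries.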
+ + - alert: ReadLatencyP99Warning + expr: histogram_quantile(0.99, (aerospike_latencies_read_ms_bucket{job="aerospike" })) > 4 + for: 2m + labels: + severity: warn + annotations: + summary: "Read latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "99th percentile read latency breached 4ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}."
+ + - alert: ReadLatencyP999Warning + expr: histogram_quantile(0.999, (aerospike_latencies_read_ms_bucket{job="aerospike" })) > 16 + for: 2m + labels: + severity: warn + annotations: + summary: "Read latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "99.9th percentile read latency breached 16ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}."
+ + - alert: WriteLatencyP95Warning + expr: histogram_quantile(0.95, (aerospike_latencies_write_ms_bucket{job="aerospike" })) > 4 + for: 2m + labels: + severity: warn + annotations: + summary: "Write latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "95th percentile write latency breached 4ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}."
+ + - alert: WriteLatencyP99Warning + expr: histogram_quantile(0.99, (aerospike_latencies_write_ms_bucket{job="aerospike" })) > 16 + for: 2m + labels: + severity: warn + annotations: + summary: "Write latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "99th percentile write latency breached 16ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}."
+ + - alert: WriteLatencyP999Warning + expr: histogram_quantile(0.999, (aerospike_latencies_write_ms_bucket{job="aerospike" })) > 64 + for: 2m + labels: + severity: warn + annotations: + summary: "Write latency breached for {{ $labels.ns }} on {{ $labels.instance }}" + description: "99.9th percentile write latency breached 64ms for namespace {{ $labels.ns }} on host {{ $labels.instance }}."
+ + + - name: aerospike_aerospike.rules > XDR + rules: + + - alert: XDRTimelag + expr: aerospike_xdr_lag{job="aerospike" } > 5 + for: 2m + labels: + severity: warn + annotations: + summary: "XDR lag for namespace {{ $labels.ns }} exceeding 5 second(s) from node {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "XDR lag may be due to network connectivity issues, inability for the source to keep up with incoming writes, or write failures at the destination." + - alert: XDRAbandonedRecords + expr: rate(aerospike_xdr_abandoned{job="aerospike" }[1m]) > 0 + for: 30s + labels: + severity: warn + annotations: + summary: "Abandoned records detected for XDR on node {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "Records abandoned at a destination cluster may indicate a configuration mismatch for the namespace between source and destination." + - alert: XDRRetryNoNode + expr: rate(aerospike_xdr_retry_no_node{job="aerospike" }[1m]) > 0 + for: 30s + labels: + severity: warn + annotations: + summary: "XDR retries occurring on node {{ $labels.instance }} to DC {{ $labels.dc }} due to unknown master node destination" + description: "XDR cannot determine which destination node is the master."
+ + - alert: XDRRetryConnReset + expr: rate(aerospike_xdr_retry_conn_reset{job="aerospike" }[1m]) > 2 + for: 2m + labels: + severity: warn + annotations: + summary: "Rate of XDR connection resets greater than 2/s from {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "XDR retries occurring due to timeouts, network problems, or destination node restarts."
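+ # retry_dest counts retries caused by error responses from the destination node itself (for example key busy or device overload), as opposed to the connection-level resets tracked above.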
+ + - alert: XDRRetryDest + expr: rate(aerospike_xdr_retry_dest{job="aerospike" }[1m]) > 5 + for: 2m + labels: + severity: warn + annotations: + summary: "Increase in XDR write retries is greater than 5/s from {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "XDR retries due to errors returned by the destination node, i.e. key busy or device overload."
+ + - alert: XDRLatencyWarning + expr: aerospike_xdr_latency_ms{job="aerospike" } > 100 + for: 30s + labels: + severity: warn + annotations: + summary: "XDR latency above 100ms from {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "Network latency between XDR source and destination over the last 30s is higher than expected."
+ + - alert: XDRLap + expr: aerospike_xdr_lap_us{job="aerospike" } > 75000 + for: 30s + labels: + severity: warn + annotations: + summary: "XDR lap time greater than 75000 microseconds from {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "The XDR processing cycle time (lap_us) is approaching the configured period-ms value."
+ + - alert: XDRRecoveries + expr: increase(aerospike_xdr_recoveries{job="aerospike" }[1m]) > 0 + for: 2m + labels: + severity: critical + annotations: + summary: "XDR recoveries increasing on {{ $labels.instance }} to DC {{ $labels.dc }}" + description: "XDR recoveries happen during a rewind or may indicate that the in-memory transaction queue is full (the transaction-queue-limit may be too small)." \ No newline at end of file
diff --git a/config/monitoring/prometheus/config/alert-rules/node_exporter_alerts.yml b/config/monitoring/prometheus/config/alert-rules/node_exporter_alerts.yml new file mode 100644 index 000000000..ce2298672 --- /dev/null +++ b/config/monitoring/prometheus/config/alert-rules/node_exporter_alerts.yml @@ -0,0 +1,777 @@ +groups: + - name: node_exporter_alerts + rules: + - alert: HostNodeExporterDownCritical + expr: up{job="node-exporter"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host ({{ $labels.instance }}) is down in cluster {{ $labels.cluster_name }}" + description: "Failed to scrape {{ $labels.job }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} for more than 1 minute. node-exporter seems down."
+ + - alert: HostMemoryFillingUpWarn + expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100 ) > 70 + for: 1m + labels: + severity: warn + annotations: + summary: "Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Memory is filling up (> 70%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} "
+ + - alert: HostMemoryFillingUpCritical + expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100 ) > 90 + for: 1m + labels: + severity: critical + annotations: + summary: "Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Node memory is filling up (> 90%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }} " + + - alert: HostDiskSpaceFillingUpWarn + expr: 100 - (node_filesystem_avail_bytes{job="node-exporter"} * 100) / node_filesystem_size_bytes{job="node-exporter"} > 70 and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host disk space is filling up on ({{ $labels.instance }})of cluster {{ $labels.cluster_name }}" + description: "Disk is crossing (> 70% ) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostDiskSpaceFillingUpCritical + expr: 100 - (node_filesystem_avail_bytes{job="node-exporter"} * 100) / node_filesystem_size_bytes{job="node-exporter"} > 90 and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host disk space is filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is crossing (> 90% ) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }} " + + - alert: HostInodesFillingUpWarn + expr: 100 - node_filesystem_files_free{job="node-exporter"} / node_filesystem_files{job="node-exporter"} * 100 > 70 and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host inodes filling Up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is running out of available inodes (> 70%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostInodesFillingUpCritical + expr: 100 - node_filesystem_files_free{job="node-exporter"} / node_filesystem_files{job="node-exporter"} * 100 > 90 and ON (instance, device, mountpoint) node_filesystem_readonly{job="node-exporter"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host inodes filling Up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is running out of available inodes (> 90%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }} " + + - alert: HostUnusualDiskReadLatencyWarn + expr: (node_disk_read_time_seconds_total{job="node-exporter"}) / (node_disk_reads_completed_total{job="node-exporter"}) > 0.1 and (node_disk_reads_completed_total{job="node-exporter"}) > 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read latency is increasing (read operations > 0.1s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskReadLatencyCritical + expr: (node_disk_read_time_seconds_total{job="node-exporter"}) / (node_disk_reads_completed_total{job="node-exporter"}) > 0.5 and (node_disk_reads_completed_total{job="node-exporter"}) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read latency is increasing (read operations > 0.5s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }} " + + - alert: HostUnusualDiskWriteLatencyWarn + expr: (node_disk_write_time_seconds_total{job="node-exporter"}) / (node_disk_writes_completed_total{job="node-exporter"}) > 0.1 and (node_disk_writes_completed_total{job="node-exporter"}) > 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual disk write latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write latency is increasing (write operations > 0.1s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteLatencyCritical + expr: (node_disk_write_time_seconds_total{job="node-exporter"}) / (node_disk_writes_completed_total{job="node-exporter"}) > 0.5 and (node_disk_writes_completed_total{job="node-exporter"}) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual disk write latency ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write latency is increasing (write operations > 0.5s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostHighCpuUtilizationWarn(Host) + expr: sum by (instance) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 70 + for: 30s + labels: + severity: warn + annotations: + summary: "Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is crossing (> 70%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostHighCpuUtilizationCritical(Host) + expr: sum by (instance) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 90 + for: 30s + labels: + severity: critical + annotations: + summary: "Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is crossing (> 90%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostHighCpuUtilizationWarn(Core) + expr: sum by (instance, cpu) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 70 + for: 30s + labels: + severity: warn + annotations: + summary: "Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is crossing (> 70%) for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostHighCpuUtilizationCritical(Core) + expr: sum by (instance, cpu) (node_cpu_seconds_total{mode!="idle", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 90 + for: 30s + labels: + severity: critical + annotations: + summary: "Host high CPU load on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is crossing (> 90%) for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }}" + + - alert: HostCpuStealWarn(Host) + expr: sum by (instance)(node_cpu_seconds_total{mode="steal", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 3 + for: 30s + labels: + severity: warn + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 3% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. " + + - alert: HostCpuStealCritical(Host) + expr: sum by (instance)(node_cpu_seconds_total{mode="steal", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 5 + for: 30s + labels: + severity: critical + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is (> 5%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. " + + - alert: HostCpuStealWarn(Core) + expr: sum by (instance, cpu)(node_cpu_seconds_total{mode="steal", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 3 + for: 30s + labels: + severity: warn + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is (> 3%) for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. " + + - alert: HostCpuStealCritical(Core) + expr: sum by (instance ,cpu)(node_cpu_seconds_total{mode="steal", job="node-exporter"}) / on(instance) group_left sum by (instance)((node_cpu_seconds_total{job="node-exporter"})) * 100 > 5 + for: 30s + labels: + severity: critical + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is (> 5%) for ({{ $labels.cpu }}) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. " + + - alert: HostNetworkReceiveErrorsWarn + expr: ((node_network_receive_errs_total{job="node-exporter"}) / (node_network_receive_packets_total{job="node-exporter"})) * 100 > 3 + for: 1m + labels: + severity: warn + annotations: + summary: "Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance interface has encountered {{ $value }} receive errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }} " + + - alert: HostNetworkReceiveErrorsCritical + expr: ((node_network_receive_errs_total{job="node-exporter"}) / (node_network_receive_packets_total{job="node-exporter"})) * 100 > 5 + for: 1m + labels: + severity: critical + annotations: + summary: "Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance interface has encountered {{ $value }} receive errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. 
VALUE = {{ $value }} " + + - alert: HostNetworkTransmitErrorsWarn + expr: ((node_network_transmit_errs_total{job="node-exporter"}) / (node_network_transmit_packets_total{job="node-exporter"})) * 100 > 3 + for: 1m + labels: + severity: warn + annotations: + summary: "Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance has encountered {{ $value }} transmit errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostNetworkTransmitErrorsCritical + expr: ((node_network_transmit_errs_total{job="node-exporter"}) / (node_network_transmit_packets_total{job="node-exporter"})) * 100 > 5 + for: 1m + labels: + severity: critical + annotations: + summary: "Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance has encountered {{ $value }} transmit errors for {{ $labels.device }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostNetworkInterfaceSaturatedWarn + expr: ((node_network_receive_bytes_total{job="node-exporter"}) + (node_network_transmit_bytes_total{job="node-exporter"})) / (node_network_speed_bytes{job="node-exporter"}) > 0.8 + for: 1m + labels: + severity: warn + annotations: + summary: "Host Network Interface Saturated ({{ $labels.instance }}:{{ $labels.interface }}) of cluster {{ $labels.cluster_name }}" + description: "The network interface is getting overloaded (> 0.8) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} {{ $value }}. VALUE = {{ $value }}" + + - alert: HostNetworkInterfaceSaturatedCritical + expr: ((node_network_receive_bytes_total{job="node-exporter"}) + (node_network_transmit_bytes_total{job="node-exporter"})) / (node_network_speed_bytes{job="node-exporter"}) > 0.9 + for: 1m + labels: + severity: critical + annotations: + summary: "Host Network Interface Saturated on ({{ $labels.instance }}:{{ $labels.interface }}) of cluster {{ $labels.cluster_name }}" + description: "The network interface is getting overloaded (> 0.9) {{ $value }} on host {{ $labels.instance }}:{{ $labels.interface }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostClockNotSynchronisingWarn + expr: min_over_time(node_timex_sync_status{job="node-exporter"}[2m]) == 0 and node_timex_maxerror_seconds{job="node-exporter"} >= 16 + for: 2m + labels: + severity: warn + annotations: + summary: "Host clock not synchronising on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Clock not synchronising on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostSwapInWarn + expr: (node_vmstat_pswpin{job="node-exporter"}) > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host PageSwapIn value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapIn(data from swap space on disk back into the physical memory (RAM)) value exceeds 5 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." 
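+ # Sustained swap activity on a database host usually signals memory pressure; the swap-in and swap-out rules share the same warn/critical thresholds (5 and 10).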
+ + - alert: HostSwapInCritical + expr: (node_vmstat_pswpin{job="node-exporter"}) > 10 + for: 1m + labels: + severity: critical + annotations: + summary: "Host PageSwapIn value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapIn (data from swap space on disk back into the physical memory (RAM)) value exceeds 10 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostSwapOutWarn + expr: (node_vmstat_pswpout{job="node-exporter"}) > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host PageSwapOut value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapOut (move data from RAM to swap space on disk to free up space in memory) value exceeds 5 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostSwapOutCritical + expr: (node_vmstat_pswpout{job="node-exporter"}) > 10 + for: 1m + labels: + severity: critical + annotations: + summary: "Host PageSwapOut value is too high on {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapOut (move data from RAM to swap space on disk to free up space in physical memory) value exceeds 10 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + + - alert: HostMemoryFillingUpWarn(Rate) + expr: (rate(node_memory_MemAvailable_bytes{job="node-exporter"}[1m]) / node_memory_MemTotal_bytes{job="node-exporter"}) * 100 > 15 + for: 1m + labels: + severity: warn + annotations: + summary: "Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Node memory is filling up (> 15%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostMemoryFillingUpCritical(Rate) + expr: (rate(node_memory_MemAvailable_bytes{job="node-exporter"}[1m]) / node_memory_MemTotal_bytes{job="node-exporter"}) * 100 > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host memory filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Node memory is filling up (> 30%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostMemoryUnderMemoryPressureWarn(Rate) + expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) > 3 + for: 1m + labels: + severity: warn + annotations: + summary: "Host memory under memory pressure on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "High rate of major page faults on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostMemoryUnderMemoryPressureCritical(Rate) + expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) > 5 + for: 1m + labels: + severity: critical + annotations: + summary: "Host memory under memory pressure on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "High rate of major page faults on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostDiskSpaceFillingUpWarn(Rate) + expr: avg by (instance)(rate(node_filesystem_avail_bytes{job="node-exporter"}[1m]) * 100 / node_filesystem_size_bytes{job="node-exporter"}) > 15 + for: 1m + labels: + severity: warn + annotations: + summary: "Host disk space is filling up on ({{ $labels.instance }}) of cluster {{
$labels.cluster_name }}" + description: "Disk is crossing the threshold (> 15%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostDiskSpaceFillingUpCritical(Rate) + expr: avg by (instance)(rate(node_filesystem_avail_bytes{job="node-exporter"}[1m]) * 100 / node_filesystem_size_bytes{job="node-exporter"}) > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host disk space is filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is crossing the threshold (> 30%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostInodesFillingUpWarn(Rate) + expr: (rate(node_filesystem_files_free{job="node-exporter"}[1m])) / node_filesystem_files{job="node-exporter"} * 100 > 20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host inodes filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is running out of available inodes (> 20%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostInodesFillingUpCritical(Rate) + expr: (rate(node_filesystem_files_free{job="node-exporter"}[1m])) / node_filesystem_files{job="node-exporter"} * 100 > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host inodes filling up on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk is running out of available inodes (> 30%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskReadLatencyWarn(Rate) + expr: rate(node_disk_read_time_seconds_total{job="node-exporter"}[1m]) / rate(node_disk_reads_completed_total{job="node-exporter"}[1m]) > 0.05 and rate(node_disk_reads_completed_total{job="node-exporter"}[1m]) > 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read latency is increasing (read operations > 0.05s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostUnusualDiskReadLatencyCritical(Rate) + expr: rate(node_disk_read_time_seconds_total{job="node-exporter"}[1m]) / rate(node_disk_reads_completed_total{job="node-exporter"}[1m]) > 0.1 and rate(node_disk_reads_completed_total{job="node-exporter"}[1m]) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual disk read latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk read latency is increasing (read operations > 0.1s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteLatencyWarn(Rate) + expr: rate(node_disk_write_time_seconds_total{job="node-exporter"}[1m]) / rate(node_disk_writes_completed_total{job="node-exporter"}[1m]) > 0.05 and rate(node_disk_writes_completed_total{job="node-exporter"}[1m]) > 0 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual disk write latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write latency is increasing (write operations > 0.05s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostUnusualDiskWriteLatencyCritical(Rate) + expr: rate(node_disk_write_time_seconds_total{job="node-exporter"}[1m]) /
rate(node_disk_writes_completed_total{job="node-exporter"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{job="node-exporter"}[1m]) > 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual disk write latency on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk write latency is increasing (write operations > 0.1s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateCPUutilizationWarn(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host high CPU utilization on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is > 20% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateCPUutilizationCritical(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host high CPU utilization on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is > 30% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateCPUutilizationWarn(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host high CPU utilization on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is > 20% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateCPUutilizationCritical(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host high CPU utilization on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}" + description: "CPU utilization is > 30% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostCpuStealRateWarn(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 5% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. 
VALUE = {{ $value }}" + + - alert: HostCpuStealRateCritical(Host) + expr: (sum by(instance) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 8 + for: 1m + labels: + severity: critical + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 8% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. VALUE = {{ $value }}" + + - alert: HostCpuStealRateWarn(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 5% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. VALUE = {{ $value }}" + + - alert: HostCpuStealRateCritical(Core) + expr: (sum by(instance, cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode="steal"}[1m])) / on(instance) group_left sum by (instance)((rate(node_cpu_seconds_total{job="node-exporter"}[1m])))) * 100 > 8 + for: 1m + labels: + severity: critical + annotations: + summary: "Host CPU steal on ({{ $labels.instance }}:{{ $labels.cpu }}) of cluster {{ $labels.cluster_name }}" + description: "CPU steal is > 8% on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. A noisy neighbor is killing VM performances or a spot instance may be out of credit. 
VALUE = {{ $value }}" + + - alert: HostContextSwitchingWarn(Rate) + expr: (rate(node_context_switches_total{job="node-exporter"}[1m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle", job="node-exporter"})) > 1000 + for: 1m + labels: + severity: warn + annotations: + summary: "Host context switching on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Context switching is increasing (> 1000 /s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostContextSwitchingCritical(Rate) + expr: (rate(node_context_switches_total{job="node-exporter"}[1m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle", job="node-exporter"})) > 2000 + for: 1m + labels: + severity: critical + annotations: + summary: "Host context switching on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Context switching is increasing (> 2000 /s) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostNetworkReceiveErrorsWarn(Rate) + expr: rate(node_network_receive_errs_total{job="node-exporter"}[30s]) / rate(node_network_receive_packets_total{job="node-exporter"}[30s]) > 3 + for: 30s + labels: + severity: warn + annotations: + summary: "Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance interface has encountered {{ $value }} receive errors on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostNetworkReceiveErrorsCritical(Rate) + expr: rate(node_network_receive_errs_total{job="node-exporter"}[30s]) / rate(node_network_receive_packets_total{job="node-exporter"}[30s]) > 5 + for: 30s + labels: + severity: critical + annotations: + summary: "Host Network Receive Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance interface has encountered {{ $value }} receive errors on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostNetworkTransmitErrorsWarn(Rate) + expr: rate(node_network_transmit_errs_total{job="node-exporter"}[30s]) / rate(node_network_transmit_packets_total{job="node-exporter"}[30s]) > 3 + for: 30s + labels: + severity: warn + annotations: + summary: "Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance has encountered {{ $value }} transmit errors on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostNetworkTransmitErrorsCritical(Rate) + expr: rate(node_network_transmit_errs_total{job="node-exporter"}[30s]) / rate(node_network_transmit_packets_total{job="node-exporter"}[30s]) > 5 + for: 30s + labels: + severity: critical + annotations: + summary: "Host Network Transmit Errors on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Instance has encountered {{ $value }} transmit errors on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostNetworkInterfaceSaturatedWarn(Rate) + expr: ((rate(node_network_receive_bytes_total{job="node-exporter"}[1m]) + rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])) / (node_network_speed_bytes{job="node-exporter"})) * 100 > 80 + for: 1m + labels: + severity: warn + annotations: + 
summary: "Host Network Interface Saturated on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "The network interface rate is getting overloaded {{ $value }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostNetworkInterfaceSaturatedCritical(Rate) + expr: ((rate(node_network_receive_bytes_total{job="node-exporter"}[1m]) + rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])) / (node_network_speed_bytes{job="node-exporter"})) * 100 > 90 + for: 1m + labels: + severity: critical + annotations: + summary: "Host Network Interface Saturated on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "The network interface rate is getting overloaded {{ $value }} on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. VALUE = {{ $value }}" + + - alert: HostSwapInRateWarn + expr: rate(node_vmstat_pswpin{job="node-exporter"}[1m]) > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "Host PageSwapIn rate is too high on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapIn rate exceeds 5 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current rate is {{ $value }}." + + - alert: HostSwapInRateCritical + expr: rate(node_vmstat_pswpin{job="node-exporter"}[1m]) > 10 + for: 1m + labels: + severity: critical + annotations: + summary: "PageSwapIn rate is too high on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapIn rate exceeds 10 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current rate is {{ $value }}." + + - alert: HostSwapOutRateWarn + expr: rate(node_vmstat_pswpout{job="node-exporter"}[1m]) > 5 + for: 1m + labels: + severity: warn + annotations: + summary: "PageSwapOut rate is too high on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapOut rate exceeds 5 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current rate is {{ $value }}." + + - alert: HostSwapOutRatecritical + expr: rate(node_vmstat_pswpout{job="node-exporter"}[1m]) > 10 + for: 1m + labels: + severity: critical + annotations: + summary: "PageSwapOut rate is too high on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "PageSwapOut rate exceeds 10 on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current rate is {{ $value }}." + + - alert: HostDiskReadIOPSWarn(Host) + expr: sum by(instance) (rate(node_disk_reads_completed_total{job="node-exporter"}[1m])) > 300 + for: 1m + labels: + severity: warn + annotations: + summary: "High disk read IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk read IOPS rate (> 300) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskReadIOPSCritical(Host) + expr: sum by(instance) (rate(node_disk_reads_completed_total{job="node-exporter"}[1m])) > 500 + for: 1m + labels: + severity: critical + annotations: + summary: "High disk read IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk read IOPS rate (> 500) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." 
+ + - alert: HostDiskReadIOPSWarn(Device) + expr: sum by(instance, device) (rate(node_disk_reads_completed_total{job="node-exporter"}[1m])) > 100 + for: 1m + labels: + severity: warn + annotations: + summary: "High disk read IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk read IOPS rate (> 100) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskReadIOPSCritical(Device) + expr: sum by(instance, device) (rate(node_disk_reads_completed_total{job="node-exporter"}[1m])) > 250 + for: 1m + labels: + severity: critical + annotations: + summary: "High disk read IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk read IOPS rate (> 250) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskWriteIOPSWarn(Host) + expr: sum by(instance) (rate(node_disk_writes_completed_total{job="node-exporter"}[1m])) > 300 + for: 1m + labels: + severity: warn + annotations: + summary: "High disk write IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk write IOPS rate (> 300) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskWriteIOPSCritical(Host) + expr: sum by(instance) (rate(node_disk_writes_completed_total{job="node-exporter"}[1m])) > 500 + for: 1m + labels: + severity: critical + annotations: + summary: "High disk write IOPS detected on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk write IOPS rate (> 500) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskWriteIOPSWarn(Device) + expr: sum by(instance, device) (rate(node_disk_writes_completed_total{job="node-exporter"}[1m])) > 100 + for: 1m + labels: + severity: warn + annotations: + summary: "High disk write IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk write IOPS rate (> 100) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." + + - alert: HostDiskWriteIOPSCritical(Device) + expr: sum by(instance, device) (rate(node_disk_writes_completed_total{job="node-exporter"}[1m])) > 250 + for: 1m + labels: + severity: critical + annotations: + summary: "High disk write IOPS detected on host ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Sustained high disk write IOPS rate (> 250) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }}. Current value is {{ $value }}." 
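+ # The alerts below compare the short-window (30s) rate with the 1m rate; a deviation of more than 20% (warn) or 30% (critical) in either direction flags an unusual swing in network or disk throughput.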
+ + - alert: HostRateUnusualNetworkThroughputInWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual network throughput in rate ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably receiving data (> 20/ < -20%) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputInCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual network throughput in rate ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably receiving data (> 30/ < -30 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputInWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual network throughput in rate ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably receiving data (> 20/ < -20 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputInCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_receive_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual network throughput in rate ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably receiving data (> 30/ < -30 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputOutWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by 
(instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual network throughput out rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably sending data (> 20/ < -20 %) on {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputOutCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual network throughput out rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably sending data (> 30/ < -30 %) on {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputOutWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 1m + labels: + severity: warn + annotations: + summary: "Host unusual network throughput out rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably sending data (> 20/ < -20 %) on {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostRateUnusualNetworkThroughputOutCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_network_transmit_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 1m + labels: + severity: critical + annotations: + summary: "Host unusual network throughput out rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Host network interfaces are probably sending data (> 30/ < -30 %) on {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }}" + + - alert: HostUnusualDiskReadRateWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 30s + labels: + severity: 
warn + annotations: + summary: "Host unusual disk read rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably reading data (> 20/ < -20 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskReadRateCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 30s + labels: + severity: critical + annotations: + summary: "Host unusual disk read rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably reading less data (> 30/ < -30 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskReadRateWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 30s + labels: + severity: warn + annotations: + summary: "Host unusual disk read rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably reading data (> 20/ < -20 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskReadRateCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_read_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 30s + labels: + severity: critical + annotations: + summary: "Host unusual disk read rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably reading data (> 30/ < -30 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskWriteRateWarn(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 30s + labels: + severity: warn + annotations: + summary: "Host unusual disk write rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably writing data (> 20/ < -20 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskWriteRateCritical(Host) + expr: 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 
100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 30s + labels: + severity: critical + annotations: + summary: "Host unusual disk write rate on ({{ $labels.instance }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably writing data (> 30/ < -30 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskWriteRateWarn(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) > 20 or 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) < -20 + for: 30s + labels: + severity: warn + annotations: + summary: "Host unusual disk write rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably writing data (> 20/ < -20 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " + + - alert: HostUnusualDiskWriteRateCritical(Device) + expr: 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) > 30 or 100 - (((sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[30s]))) * 100) / (sum by (instance, device)(rate(node_disk_written_bytes_total{job="node-exporter"}[1m])))) < -30 + for: 30s + labels: + severity: critical + annotations: + summary: "Host unusual disk write rate on ({{ $labels.instance }}:{{ $labels.device }}) of cluster {{ $labels.cluster_name }}" + description: "Disk rate is probably writing data (> 30/ < -30 %) on host {{ $labels.instance }} of cluster {{ $labels.cluster_name }} VALUE = {{ $value }} " diff --git a/config/monitoring/prometheus/config/prometheus.yml b/config/monitoring/prometheus/config/prometheus.yml new file mode 100644 index 000000000..c5b45d07a --- /dev/null +++ b/config/monitoring/prometheus/config/prometheus.yml @@ -0,0 +1,88 @@ +global: + scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. + evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. 
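+# Alertmanager pods are discovered through Kubernetes pod service discovery, filtered on the app.kubernetes.io/component=alertmanager label.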
+alerting: + alertmanagers: + - kubernetes_sd_configs: + - role: pod + selectors: + - role: pod + label: app.kubernetes.io/component=alertmanager +rule_files: + - "/etc/prometheus/alert-rules.d/aerospike_rules.yml" + - "/etc/prometheus/alert-rules.d/node_exporter_alerts.yml" +scrape_configs: + - job_name: "aerospike-kubernetes-operator" + honor_timestamps: true + scrape_interval: 15s + scrape_timeout: 10s + metrics_path: /metrics + scheme: https + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + insecure_skip_verify: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_control_plane, __meta_kubernetes_service_labelpresent_control_plane] + separator: ; + regex: (controller-manager);true + replacement: $1 + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + separator: ; + regex: https + replacement: $1 + action: keep + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - default + - aerospike +# - job_name: "kubernetes-cadvisor" +# scheme: https +# metrics_path: /metrics/cadvisor +# kubernetes_sd_configs: +# - role: node +# tls_config: +# ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt +# authorization: +# credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token +# relabel_configs: +# - action: labelmap +# regex: __meta_kubernetes_node_label_(.+) + - job_name: 'event_exporter' # https://github.com/caicloud/event_exporter/blob/master/deploy/README.md + static_configs: + - targets: ['event-exporter:9102'] + - job_name: 'node-exporter' # https://devopscube.com/node-exporter-kubernetes/ + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [ __meta_kubernetes_endpoints_name ] + regex: 'node-exporter' + action: keep + - job_name: 'aerospike' + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - default + - aerospike + relabel_configs: + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: keep + regex: aerospike-cluster + replacement: $1 + separator: ; + source_labels: + - __meta_kubernetes_pod_label_app + - action: keep + regex: exporter + replacement: $1 + separator: ; + source_labels: + - __meta_kubernetes_pod_container_port_name \ No newline at end of file diff --git a/config/monitoring/prometheus/kustomization.yaml b/config/monitoring/prometheus/kustomization.yaml new file mode 100644 index 000000000..b75eeaee4 --- /dev/null +++ b/config/monitoring/prometheus/kustomization.yaml @@ -0,0 +1,28 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +labels: + - includeSelectors: true + pairs: + app.kubernetes.io/name: aerospike-prometheus + app.kubernetes.io/component: prometheus + +resources: + - statefulset.yaml + - pvc.yaml + - service.yaml + - serviceaccount.yaml + - clusterrole.yaml + - clusterrolebinding.yaml + +configMapGenerator: + - name: prometheus-config + files: + - config/prometheus.yml + - name: alert-rules-config + files: + - config/alert-rules/aerospike_rules.yml + - config/alert-rules/node_exporter_alerts.yml + +generatorOptions: + disableNameSuffixHash: true \ No newline at end of file diff --git a/config/monitoring/prometheus/pvc.yaml b/config/monitoring/prometheus/pvc.yaml new file mode 100644 index 000000000..d722303c3 --- /dev/null +++ b/config/monitoring/prometheus/pvc.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: prometheus-data +spec: + 
accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi diff --git a/config/monitoring/prometheus/service.yaml b/config/monitoring/prometheus/service.yaml new file mode 100644 index 000000000..e25d1ac2b --- /dev/null +++ b/config/monitoring/prometheus/service.yaml @@ -0,0 +1,11 @@ +kind: Service +apiVersion: v1 +metadata: + name: prometheus +spec: + ports: + - name: http + port: 9090 + protocol: TCP + targetPort: 9090 + sessionAffinity: ClientIP \ No newline at end of file diff --git a/config/monitoring/prometheus/serviceaccount.yaml b/config/monitoring/prometheus/serviceaccount.yaml new file mode 100644 index 000000000..f671fc5ab --- /dev/null +++ b/config/monitoring/prometheus/serviceaccount.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus \ No newline at end of file diff --git a/config/monitoring/prometheus/statefulset.yaml b/config/monitoring/prometheus/statefulset.yaml new file mode 100644 index 000000000..094f9508d --- /dev/null +++ b/config/monitoring/prometheus/statefulset.yaml @@ -0,0 +1,61 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: prometheus +spec: + replicas: 1 + podManagementPolicy: Parallel + updateStrategy: + type: RollingUpdate + template: + spec: + serviceAccountName: aerospike-monitoring-stack-prometheus + containers: + - name: prometheus-server + image: "prom/prometheus:latest" + imagePullPolicy: "IfNotPresent" + args: + - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.path=/data + - --web.listen-address=:9090 + ports: + - containerPort: 9090 + readinessProbe: + httpGet: + path: /-/ready + port: 9090 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 10 + successThreshold: 1 + failureThreshold: 3 + livenessProbe: + httpGet: + path: /-/healthy + port: 9090 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 10 + successThreshold: 1 + failureThreshold: 3 + volumeMounts: + - name: config-volume + mountPath: /etc/prometheus + - name: prometheus-data + mountPath: /data + - mountPath: /etc/prometheus/alert-rules.d + name: alert-manager-rules + terminationGracePeriodSeconds: 120 + securityContext: + fsGroup: 65534 + volumes: + - name: config-volume + configMap: + name: aerospike-monitoring-stack-prometheus-config + - name: prometheus-data + persistentVolumeClaim: + claimName: aerospike-monitoring-stack-prometheus-data + - name: alert-manager-rules + configMap: + defaultMode: 420 + name: aerospike-monitoring-stack-alert-rules-config \ No newline at end of file diff --git a/helm-charts/aerospike-kubernetes-operator/values.yaml b/helm-charts/aerospike-kubernetes-operator/values.yaml index faea89650..6cd9c0b73 100644 --- a/helm-charts/aerospike-kubernetes-operator/values.yaml +++ b/helm-charts/aerospike-kubernetes-operator/values.yaml @@ -28,7 +28,7 @@ certs: webhookServerCertSecretName: "webhook-server-cert" ## Operator configurations -watchNamespaces: "default" +watchNamespaces: "default,aerospike" # Registry used to pull aerospike-init image aerospikeKubernetesInitRegistry: "docker.io"
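The new alert rule files can be sanity-checked locally before building the kustomize overlays. A minimal sketch, assuming promtool (bundled with Prometheus releases) is on the PATH and the commands are run from the repository root:

promtool check rules config/monitoring/prometheus/config/alert-rules/aerospike_rules.yml
promtool check rules config/monitoring/prometheus/config/alert-rules/node_exporter_alerts.yml

Note that promtool builds without UTF-8 metric/label name support may reject the parenthesised alert names (for example HostMemoryFillingUpWarn(Rate)), so the check can fail on older Prometheus releases even when the YAML itself is well-formed.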