From 6ca6a81cd94e7fe2dfd478c5cb47a02b2eb0745d Mon Sep 17 00:00:00 2001 From: Abhisek Dwivedi Date: Mon, 20 Nov 2023 12:38:30 +0530 Subject: [PATCH 01/17] Allow Operator operations with failed pods --- api/v1/aerospikecluster_types.go | 32 ++++++++++ api/v1/zz_generated.deepcopy.go | 10 ++++ .../asdb.aerospike.com_aerospikeclusters.yaml | 20 +++++++ ...rnetes-operator.clusterserviceversion.yaml | 11 ++++ controllers/aero_info_calls.go | 34 +++++------ controllers/pod.go | 52 +++++++++++----- controllers/rack.go | 60 +++++++++---------- controllers/reconciler.go | 19 ++++-- controllers/statefulset.go | 13 ++-- controllers/strong_consistency.go | 23 +++---- ..._aerospikeclusters.asdb.aerospike.com.yaml | 20 +++++++ 11 files changed, 205 insertions(+), 89 deletions(-) diff --git a/api/v1/aerospikecluster_types.go b/api/v1/aerospikecluster_types.go index 089bed614..0846f4ead 100644 --- a/api/v1/aerospikecluster_types.go +++ b/api/v1/aerospikecluster_types.go @@ -72,7 +72,13 @@ type AerospikeClusterSpec struct { //nolint:govet // for readability // +operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Seeds Finder Services" SeedsFinderServices SeedsFinderServices `json:"seedsFinderServices,omitempty"` // RosterNodeBlockList is a list of blocked nodeIDs from roster in a strong-consistency setup + // +operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Roster NodeB lockList" RosterNodeBlockList []string `json:"rosterNodeBlockList,omitempty"` + // IgnorePodList is the list of pods which are ignored by the operator while checking the cluster stability and + // are not considered part of cluster. This is only useful when there are some failed pods and operator is required + // to do some operation on the cluster. If pods in running state are defined in this list, they are not ignored + // +operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Ignore Pod List" + IgnorePodList []string `json:"ignorePodList,omitempty"` } type SeedsFinderServices struct { @@ -615,6 +621,10 @@ type AerospikeClusterStatusSpec struct { //nolint:govet // for readability SeedsFinderServices SeedsFinderServices `json:"seedsFinderServices,omitempty"` // RosterNodeBlockList is a list of blocked nodeIDs from roster in a strong-consistency setup RosterNodeBlockList []string `json:"rosterNodeBlockList,omitempty"` + // IgnorePodList is the list of pods which are ignored by the operator while checking the cluster stability and + // are not considered part of cluster. This is only useful when there are some failed pods and operator is required + // to do some operation on the cluster. 
If pods in running state are defined in this list, they are not ignored + IgnorePodList []string `json:"ignorePodList,omitempty"` } // AerospikeClusterStatus defines the observed state of AerospikeCluster @@ -937,6 +947,17 @@ func CopySpecToStatus(spec *AerospikeClusterSpec) (*AerospikeClusterStatusSpec, status.RosterNodeBlockList = rosterNodeBlockList } + // IgnorePodList + if len(spec.IgnorePodList) != 0 { + var ignorePodList []string + + lib.DeepCopy( + &ignorePodList, &spec.IgnorePodList, + ) + + status.IgnorePodList = ignorePodList + } + return &status, nil } @@ -1027,5 +1048,16 @@ func CopyStatusToSpec(status *AerospikeClusterStatusSpec) (*AerospikeClusterSpec spec.RosterNodeBlockList = rosterNodeBlockList } + // IgnorePodList + if len(status.IgnorePodList) != 0 { + var ignorePodList []string + + lib.DeepCopy( + &ignorePodList, &status.IgnorePodList, + ) + + spec.IgnorePodList = ignorePodList + } + return &spec, nil } diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index a68fa7d32..b1ba3adde 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -183,6 +183,11 @@ func (in *AerospikeClusterSpec) DeepCopyInto(out *AerospikeClusterSpec) { *out = make([]string, len(*in)) copy(*out, *in) } + if in.IgnorePodList != nil { + in, out := &in.IgnorePodList, &out.IgnorePodList + *out = make([]string, len(*in)) + copy(*out, *in) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AerospikeClusterSpec. @@ -255,6 +260,11 @@ func (in *AerospikeClusterStatusSpec) DeepCopyInto(out *AerospikeClusterStatusSp *out = make([]string, len(*in)) copy(*out, *in) } + if in.IgnorePodList != nil { + in, out := &in.IgnorePodList, &out.IgnorePodList + *out = make([]string, len(*in)) + copy(*out, *in) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AerospikeClusterStatusSpec. diff --git a/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml b/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml index a60371e1c..06b00dca3 100644 --- a/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml +++ b/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml @@ -277,6 +277,16 @@ spec: - customInterface type: string type: object + ignorePodList: + description: IgnorePodList is the list of pods which are ignored by + the operator while checking the cluster stability and are not considered + part of cluster. This is only useful when there are some failed + pods and operator is required to do some operation on the cluster. + If pods in running state are defined in this list, they are not + ignored + items: + type: string + type: array image: description: Aerospike server image type: string @@ -8886,6 +8896,16 @@ spec: - customInterface type: string type: object + ignorePodList: + description: IgnorePodList is the list of pods which are ignored by + the operator while checking the cluster stability and are not considered + part of cluster. This is only useful when there are some failed + pods and operator is required to do some operation on the cluster. 
+ If pods in running state are defined in this list, they are not + ignored + items: + type: string + type: array image: description: Aerospike server image type: string diff --git a/config/manifests/bases/aerospike-kubernetes-operator.clusterserviceversion.yaml b/config/manifests/bases/aerospike-kubernetes-operator.clusterserviceversion.yaml index 53aa7c3f5..795d2ec4e 100644 --- a/config/manifests/bases/aerospike-kubernetes-operator.clusterserviceversion.yaml +++ b/config/manifests/bases/aerospike-kubernetes-operator.clusterserviceversion.yaml @@ -47,6 +47,13 @@ spec: the Aerospike cluster. displayName: Aerospike Network Policy path: aerospikeNetworkPolicy + - description: IgnorePodList is the list of pods which are ignored by the operator + while checking the cluster stability and are not considered part of cluster. + This is only useful when there are some failed pods and operator is required + to do some operation on the cluster. If pods in running state are defined + in this list, they are not ignored + displayName: Ignore Pod List + path: ignorePodList - description: Aerospike server image displayName: Server Image path: image @@ -60,6 +67,10 @@ spec: cluster. Pods will be deployed in given racks based on given configuration displayName: Rack Config path: rackConfig + - description: RosterNodeBlockList is a list of blocked nodeIDs from roster + in a strong-consistency setup + displayName: Roster NodeB lockList + path: rosterNodeBlockList - description: SeedsFinderServices creates additional Kubernetes service that allow clients to discover Aerospike cluster nodes. displayName: Seeds Finder Services diff --git a/controllers/aero_info_calls.go b/controllers/aero_info_calls.go index 0978ee2b3..0c8ca967c 100644 --- a/controllers/aero_info_calls.go +++ b/controllers/aero_info_calls.go @@ -18,6 +18,7 @@ import ( "time" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/sets" as "github.com/aerospike/aerospike-client-go/v6" asdbv1 "github.com/aerospike/aerospike-kubernetes-operator/api/v1" @@ -34,19 +35,19 @@ import ( // The ignorablePods list should be a list of failed or pending pods that are going to be // deleted eventually and are safe to ignore in stability checks. func (r *SingleClusterReconciler) waitForMultipleNodesSafeStopReady( - pods []*corev1.Pod, ignorablePods []corev1.Pod, + pods []*corev1.Pod, ignorablePodNames sets.Set[string], ) reconcileResult { if len(pods) == 0 { return reconcileSuccess() } // Remove a node only if cluster is stable - if err := r.waitForAllSTSToBeReady(); err != nil { + if err := r.waitForAllSTSToBeReady(ignorablePodNames); err != nil { return reconcileError(fmt.Errorf("failed to wait for cluster to be ready: %v", err)) } // This doesn't make actual connection, only objects having connection info are created - allHostConns, err := r.newAllHostConnWithOption(ignorablePods) + allHostConns, err := r.newAllHostConnWithOption(ignorablePodNames) if err != nil { return reconcileError(fmt.Errorf("failed to get hostConn for aerospike cluster nodes: %v", err)) } @@ -64,12 +65,12 @@ func (r *SingleClusterReconciler) waitForMultipleNodesSafeStopReady( } // Setup roster after migration. 
- if err = r.getAndSetRoster(policy, r.aeroCluster.Spec.RosterNodeBlockList, ignorablePods); err != nil { + if err = r.getAndSetRoster(policy, r.aeroCluster.Spec.RosterNodeBlockList, ignorablePodNames); err != nil { r.Log.Error(err, "Failed to set roster for cluster") return reconcileRequeueAfter(1) } - if err := r.quiescePods(policy, allHostConns, pods, ignorablePods); err != nil { + if err := r.quiescePods(policy, allHostConns, pods, ignorablePodNames); err != nil { return reconcileError(err) } @@ -77,7 +78,7 @@ func (r *SingleClusterReconciler) waitForMultipleNodesSafeStopReady( } func (r *SingleClusterReconciler) quiescePods( - policy *as.ClientPolicy, allHostConns []*deployment.HostConn, pods []*corev1.Pod, ignorablePods []corev1.Pod, + policy *as.ClientPolicy, allHostConns []*deployment.HostConn, pods []*corev1.Pod, ignorablePodNames sets.Set[string], ) error { podList := make([]corev1.Pod, 0, len(pods)) @@ -85,7 +86,7 @@ func (r *SingleClusterReconciler) quiescePods( podList = append(podList, *pods[idx]) } - selectedHostConns, err := r.newPodsHostConnWithOption(podList, ignorablePods) + selectedHostConns, err := r.newPodsHostConnWithOption(podList, ignorablePodNames) if err != nil { return err } @@ -173,15 +174,9 @@ func (r *SingleClusterReconciler) alumniReset(pod *corev1.Pod) error { return asConn.AlumniReset(r.getClientPolicy()) } -func (r *SingleClusterReconciler) newAllHostConn() ( - []*deployment.HostConn, error, -) { - return r.newAllHostConnWithOption(nil) -} - // newAllHostConnWithOption returns connections to all pods in the cluster skipping pods that are not running and // present in ignorablePods. -func (r *SingleClusterReconciler) newAllHostConnWithOption(ignorablePods []corev1.Pod) ( +func (r *SingleClusterReconciler) newAllHostConnWithOption(ignorablePodNames sets.Set[string]) ( []*deployment.HostConn, error, ) { podList, err := r.getClusterPodList() @@ -193,12 +188,12 @@ func (r *SingleClusterReconciler) newAllHostConnWithOption(ignorablePods []corev return nil, fmt.Errorf("pod list empty") } - return r.newPodsHostConnWithOption(podList.Items, ignorablePods) + return r.newPodsHostConnWithOption(podList.Items, ignorablePodNames) } // newPodsHostConnWithOption returns connections to all pods given skipping pods that are not running and // present in ignorablePods. -func (r *SingleClusterReconciler) newPodsHostConnWithOption(pods, ignorablePods []corev1.Pod) ( +func (r *SingleClusterReconciler) newPodsHostConnWithOption(pods []corev1.Pod, ignorablePodNames sets.Set[string]) ( []*deployment.HostConn, error, ) { hostConns := make([]*deployment.HostConn, 0, len(pods)) @@ -211,8 +206,7 @@ func (r *SingleClusterReconciler) newPodsHostConnWithOption(pods, ignorablePods // Checking if all the container in the pod are ready or not if !utils.IsPodRunningAndReady(pod) { - ignorablePod := utils.GetPod(pod.Name, ignorablePods) - if ignorablePod != nil { + if ignorablePodNames.Has(pod.Name) { // This pod is not running and ignorable. 
r.Log.Info( "Ignoring info call on non-running pod ", "pod", pod.Name, @@ -259,7 +253,7 @@ func hostID(hostName string, hostPort int) string { func (r *SingleClusterReconciler) setMigrateFillDelay( policy *as.ClientPolicy, - asConfig *asdbv1.AerospikeConfigSpec, setToZero bool, ignorablePods []corev1.Pod, + asConfig *asdbv1.AerospikeConfigSpec, setToZero bool, ignorablePodNames sets.Set[string], ) reconcileResult { migrateFillDelay, err := asdbv1.GetMigrateFillDelay(asConfig) if err != nil { @@ -286,7 +280,7 @@ func (r *SingleClusterReconciler) setMigrateFillDelay( } // This doesn't make actual connection, only objects having connection info are created - allHostConns, err := r.newAllHostConnWithOption(ignorablePods) + allHostConns, err := r.newAllHostConnWithOption(ignorablePodNames) if err != nil { return reconcileError( fmt.Errorf( diff --git a/controllers/pod.go b/controllers/pod.go index 8574b3c1c..b86aab7a6 100644 --- a/controllers/pod.go +++ b/controllers/pod.go @@ -146,10 +146,10 @@ func (r *SingleClusterReconciler) getRollingRestartTypePod( } func (r *SingleClusterReconciler) rollingRestartPods( - rackState *RackState, podsToRestart []*corev1.Pod, ignorablePods []corev1.Pod, + rackState *RackState, podsToRestart []*corev1.Pod, ignorablePodNames sets.Set[string], restartTypeMap map[string]RestartType, ) reconcileResult { - failedPods, activePods := getFailedAndActivePods(podsToRestart) + failedPods, activePods := getFailedAndActivePods(podsToRestart, ignorablePodNames) // If already dead node (failed pod) then no need to check node safety, migration if len(failedPods) != 0 { @@ -163,7 +163,7 @@ func (r *SingleClusterReconciler) rollingRestartPods( if len(activePods) != 0 { r.Log.Info("Restart active pods", "pods", getPodNames(activePods)) - if res := r.waitForMultipleNodesSafeStopReady(activePods, ignorablePods); !res.isSuccess { + if res := r.waitForMultipleNodesSafeStopReady(activePods, ignorablePodNames); !res.isSuccess { return res } @@ -345,9 +345,14 @@ func (r *SingleClusterReconciler) ensurePodsRunningAndReady(podsToCheck []*corev return reconcileRequeueAfter(10) } -func getFailedAndActivePods(pods []*corev1.Pod) (failedPods, activePods []*corev1.Pod) { +func getFailedAndActivePods(pods []*corev1.Pod, ignorablePodNames sets.Set[string], +) (failedPods, activePods []*corev1.Pod) { for idx := range pods { pod := pods[idx] + if ignorablePodNames.Has(pod.Name) { + continue + } + if err := utils.CheckPodFailed(pod); err != nil { failedPods = append(failedPods, pod) continue @@ -360,9 +365,9 @@ func getFailedAndActivePods(pods []*corev1.Pod) (failedPods, activePods []*corev } func (r *SingleClusterReconciler) safelyDeletePodsAndEnsureImageUpdated( - rackState *RackState, podsToUpdate []*corev1.Pod, ignorablePods []corev1.Pod, + rackState *RackState, podsToUpdate []*corev1.Pod, ignorablePodNames sets.Set[string], ) reconcileResult { - failedPods, activePods := getFailedAndActivePods(podsToUpdate) + failedPods, activePods := getFailedAndActivePods(podsToUpdate, ignorablePodNames) // If already dead node (failed pod) then no need to check node safety, migration if len(failedPods) != 0 { @@ -376,7 +381,7 @@ func (r *SingleClusterReconciler) safelyDeletePodsAndEnsureImageUpdated( if len(activePods) != 0 { r.Log.Info("Restart active pods with updated container image", "pods", getPodNames(activePods)) - if res := r.waitForMultipleNodesSafeStopReady(activePods, ignorablePods); !res.isSuccess { + if res := r.waitForMultipleNodesSafeStopReady(activePods, ignorablePodNames); !res.isSuccess { 
return res } @@ -415,7 +420,7 @@ func (r *SingleClusterReconciler) deletePodAndEnsureImageUpdated( func (r *SingleClusterReconciler) ensurePodsImageUpdated(podsToCheck []*corev1.Pod) reconcileResult { podNames := getPodNames(podsToCheck) - updatedPods := map[string]bool{} + updatedPods := sets.Set[string]{} const ( maxRetries = 6 @@ -428,7 +433,7 @@ func (r *SingleClusterReconciler) ensurePodsImageUpdated(podsToCheck []*corev1.P ) for _, pod := range podsToCheck { - if updatedPods[pod.Name] { + if updatedPods.Has(pod.Name) { continue } @@ -451,7 +456,7 @@ func (r *SingleClusterReconciler) ensurePodsImageUpdated(podsToCheck []*corev1.P break } - updatedPods[pod.Name] = true + updatedPods.Insert(pod.Name) r.Log.Info("Pod is upgraded/downgraded", "podName", pod.Name) } @@ -638,12 +643,13 @@ func (r *SingleClusterReconciler) cleanupDanglingPodsRack(sts *appsv1.StatefulSe return nil } -// getIgnorablePods returns pods from racksToDelete that are currently not running and can be ignored in stability -// checks. +// getIgnorablePods returns pods: +// 1. From racksToDelete that are currently not running and can be ignored in stability checks. +// 2. User given pods in ignorePodList that are currently not running and can be ignored in stability checks. func (r *SingleClusterReconciler) getIgnorablePods(racksToDelete []asdbv1.Rack) ( - []corev1.Pod, error, + sets.Set[string], error, ) { - var ignorablePods []corev1.Pod + ignorablePodNames := sets.Set[string]{} for rackIdx := range racksToDelete { rackPods, err := r.getRackPodList(racksToDelete[rackIdx].ID) @@ -654,12 +660,26 @@ func (r *SingleClusterReconciler) getIgnorablePods(racksToDelete []asdbv1.Rack) for podIdx := range rackPods.Items { pod := rackPods.Items[podIdx] if !utils.IsPodRunningAndReady(&pod) { - ignorablePods = append(ignorablePods, pod) + ignorablePodNames.Insert(pod.Name) } } } - return ignorablePods, nil + podList, err := r.getClusterPodList() + if err != nil { + return nil, err + } + + userIgnorePodSet := sets.NewString(r.aeroCluster.Spec.IgnorePodList...) 
+ + for podIdx := range podList.Items { + pod := &podList.Items[podIdx] + if userIgnorePodSet.Has(pod.Name) && !utils.IsPodRunningAndReady(pod) { + ignorablePodNames.Insert(pod.Name) + } + } + + return ignorablePodNames, nil } func (r *SingleClusterReconciler) getPodsPVCList( diff --git a/controllers/rack.go b/controllers/rack.go index 2c13fcd1b..0f4aded2b 100644 --- a/controllers/rack.go +++ b/controllers/rack.go @@ -12,6 +12,7 @@ import ( "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/apimachinery/pkg/util/sets" "sigs.k8s.io/controller-runtime/pkg/client" asdbv1 "github.com/aerospike/aerospike-kubernetes-operator/api/v1" @@ -44,19 +45,14 @@ func (r *SingleClusterReconciler) reconcileRacks() reconcileResult { rackIDsToDelete = append(rackIDsToDelete, racksToDelete[idx].ID) } - ignorablePods, err := r.getIgnorablePods(racksToDelete) + ignorablePodNames, err := r.getIgnorablePods(racksToDelete) if err != nil { return reconcileError(err) } - ignorablePodNames := make([]string, 0, len(ignorablePods)) - for idx := range ignorablePods { - ignorablePodNames = append(ignorablePodNames, ignorablePods[idx].Name) - } - r.Log.Info( "Rack changes", "racksToDelete", rackIDsToDelete, "ignorablePods", - ignorablePodNames, + ignorablePodNames.UnsortedList(), ) // Handle failed racks @@ -85,12 +81,12 @@ func (r *SingleClusterReconciler) reconcileRacks() reconcileResult { ) } - failedPods, _ := getFailedAndActivePods(podList) + failedPods, _ := getFailedAndActivePods(podList, ignorablePodNames) if len(failedPods) != 0 { r.Log.Info("Reconcile the failed pods in the Rack", "rackID", state.Rack.ID, "failedPods", failedPods) if res = r.reconcileRack( - found, state, ignorablePods, failedPods, + found, state, ignorablePodNames, failedPods, ); !res.isSuccess { return res } @@ -111,11 +107,11 @@ func (r *SingleClusterReconciler) reconcileRacks() reconcileResult { ) } - failedPods, _ = getFailedAndActivePods(podList) + failedPods, _ = getFailedAndActivePods(podList, ignorablePodNames) if len(failedPods) != 0 { r.Log.Info("Restart the failed pods in the Rack", "rackID", state.Rack.ID, "failedPods", failedPods) - if _, res = r.rollingRestartRack(found, state, ignorablePods, nil, + if _, res = r.rollingRestartRack(found, state, ignorablePodNames, nil, failedPods); !res.isSuccess { return res } @@ -149,7 +145,7 @@ func (r *SingleClusterReconciler) reconcileRacks() reconcileResult { } else { // Reconcile other statefulset if res = r.reconcileRack( - found, state, ignorablePods, nil, + found, state, ignorablePodNames, nil, ); !res.isSuccess { return res } @@ -161,14 +157,14 @@ func (r *SingleClusterReconciler) reconcileRacks() reconcileResult { state := scaledDownRackList[idx].rackState sts := scaledDownRackList[idx].rackSTS - if res = r.reconcileRack(sts, state, ignorablePods, nil); !res.isSuccess { + if res = r.reconcileRack(sts, state, ignorablePodNames, nil); !res.isSuccess { return res } } if len(r.aeroCluster.Status.RackConfig.Racks) != 0 { // Remove removed racks - if res = r.deleteRacks(racksToDelete, ignorablePods); !res.isSuccess { + if res = r.deleteRacks(racksToDelete, ignorablePodNames); !res.isSuccess { if res.err != nil { r.Log.Error( err, "Failed to remove statefulset for removed racks", @@ -204,7 +200,7 @@ func (r *SingleClusterReconciler) reconcileRacks() reconcileResult { } // Wait for pods to be ready. 
- if err := r.waitForSTSToBeReady(found); err != nil { + if err := r.waitForSTSToBeReady(found, ignorablePodNames); err != nil { // If the wait times out try again. // The wait is required in cases where scale up waits for a pod to // terminate times out and event is re-queued. @@ -291,7 +287,7 @@ func (r *SingleClusterReconciler) getRacksToDelete(rackStateList []RackState) ( } func (r *SingleClusterReconciler) deleteRacks( - racksToDelete []asdbv1.Rack, ignorablePods []corev1.Pod, + racksToDelete []asdbv1.Rack, ignorablePodNames sets.Set[string], ) reconcileResult { for idx := range racksToDelete { rack := &racksToDelete[idx] @@ -311,7 +307,7 @@ func (r *SingleClusterReconciler) deleteRacks( // TODO: Add option for quick delete of rack. DefaultRackID should always be removed gracefully rackState := &RackState{Size: 0, Rack: rack} - found, res := r.scaleDownRack(found, rackState, ignorablePods) + found, res := r.scaleDownRack(found, rackState, ignorablePodNames) if !res.isSuccess { return res } @@ -349,7 +345,7 @@ func (r *SingleClusterReconciler) deleteRacks( } func (r *SingleClusterReconciler) upgradeOrRollingRestartRack(found *appsv1.StatefulSet, rackState *RackState, - ignorablePods []corev1.Pod, failedPods []*corev1.Pod) (*appsv1.StatefulSet, reconcileResult) { + ignorablePodNames sets.Set[string], failedPods []*corev1.Pod) (*appsv1.StatefulSet, reconcileResult) { var res reconcileResult // Always update configMap. We won't be able to find if a rack's config, and it's pod config is in sync or not // Checking rack.spec, rack.status will not work. @@ -377,7 +373,7 @@ func (r *SingleClusterReconciler) upgradeOrRollingRestartRack(found *appsv1.Stat } if upgradeNeeded { - found, res = r.upgradeRack(found, rackState, ignorablePods, failedPods) + found, res = r.upgradeRack(found, rackState, ignorablePodNames, failedPods) if !res.isSuccess { if res.err != nil { r.Log.Error( @@ -402,7 +398,7 @@ func (r *SingleClusterReconciler) upgradeOrRollingRestartRack(found *appsv1.Stat } if needRollingRestartRack { - found, res = r.rollingRestartRack(found, rackState, ignorablePods, restartTypeMap, failedPods) + found, res = r.rollingRestartRack(found, rackState, ignorablePodNames, restartTypeMap, failedPods) if !res.isSuccess { if res.err != nil { r.Log.Error( @@ -427,7 +423,7 @@ func (r *SingleClusterReconciler) upgradeOrRollingRestartRack(found *appsv1.Stat } func (r *SingleClusterReconciler) reconcileRack( - found *appsv1.StatefulSet, rackState *RackState, ignorablePods []corev1.Pod, failedPods []*corev1.Pod, + found *appsv1.StatefulSet, rackState *RackState, ignorablePodNames sets.Set[string], failedPods []*corev1.Pod, ) reconcileResult { r.Log.Info( "Reconcile existing Aerospike cluster statefulset", "stsName", @@ -446,7 +442,7 @@ func (r *SingleClusterReconciler) reconcileRack( // Scale down if currentSize > desiredSize { - found, res = r.scaleDownRack(found, rackState, ignorablePods) + found, res = r.scaleDownRack(found, rackState, ignorablePodNames) if !res.isSuccess { if res.err != nil { r.Log.Error( @@ -491,7 +487,7 @@ func (r *SingleClusterReconciler) reconcileRack( return reconcileError(err) } - found, res = r.upgradeOrRollingRestartRack(found, rackState, ignorablePods, failedPods) + found, res = r.upgradeOrRollingRestartRack(found, rackState, ignorablePodNames, failedPods) if !res.isSuccess { return res } @@ -621,7 +617,7 @@ func (r *SingleClusterReconciler) scaleUpRack(found *appsv1.StatefulSet, rackSta } func (r *SingleClusterReconciler) upgradeRack(statefulSet *appsv1.StatefulSet, 
rackState *RackState, - ignorablePods []corev1.Pod, failedPods []*corev1.Pod) (*appsv1.StatefulSet, reconcileResult) { + ignorablePodNames sets.Set[string], failedPods []*corev1.Pod) (*appsv1.StatefulSet, reconcileResult) { var ( err error podList []*corev1.Pod @@ -706,7 +702,7 @@ func (r *SingleClusterReconciler) upgradeRack(statefulSet *appsv1.StatefulSet, r "[rack-%d] Updating Containers on Pods %v", rackState.Rack.ID, podNames, ) - res := r.safelyDeletePodsAndEnsureImageUpdated(rackState, podsBatch, ignorablePods) + res := r.safelyDeletePodsAndEnsureImageUpdated(rackState, podsBatch, ignorablePodNames) if !res.isSuccess { return statefulSet, res } @@ -737,7 +733,7 @@ func (r *SingleClusterReconciler) upgradeRack(statefulSet *appsv1.StatefulSet, r } func (r *SingleClusterReconciler) scaleDownRack( - found *appsv1.StatefulSet, rackState *RackState, ignorablePods []corev1.Pod, + found *appsv1.StatefulSet, rackState *RackState, ignorablePodNames sets.Set[string], ) (*appsv1.StatefulSet, reconcileResult) { desiredSize := int32(rackState.Size) @@ -782,7 +778,7 @@ func (r *SingleClusterReconciler) scaleDownRack( // Ignore safe stop check if pod is not running. // Ignore migrate-fill-delay if pod is not running. Deleting this pod will not lead to any migration. if isPodRunningAndReady { - if res := r.waitForMultipleNodesSafeStopReady([]*corev1.Pod{pod}, ignorablePods); !res.isSuccess { + if res := r.waitForMultipleNodesSafeStopReady([]*corev1.Pod{pod}, ignorablePodNames); !res.isSuccess { // The pod is running and is unsafe to terminate. return found, res } @@ -793,7 +789,7 @@ func (r *SingleClusterReconciler) scaleDownRack( // setting migrate-fill-delay will fail if there are any failed pod if res := r.setMigrateFillDelay( policy, &rackState.Rack.AerospikeConfig, true, - append(ignorablePods, *pod), + ignorablePodNames.Insert(pod.Name), ); !res.isSuccess { return found, res } @@ -818,7 +814,7 @@ func (r *SingleClusterReconciler) scaleDownRack( // These checks will fail if there is any other pod in failed state. if isPodRunningAndReady { // Wait for pods to get terminated - if err = r.waitForSTSToBeReady(found); err != nil { + if err = r.waitForSTSToBeReady(found, ignorablePodNames); err != nil { r.Log.Error(err, "Failed to wait for statefulset to be ready") return found, reconcileRequeueAfter(1) } @@ -828,7 +824,7 @@ func (r *SingleClusterReconciler) scaleDownRack( // This can be left to the user but if we would do it here on our own then we can reuse // objects like pvc and service. These objects would have been removed if scaleup is left for the user. // In case of rolling restart, no pod cleanup happens, therefore rolling config back is left to the user. 
- if err = r.validateSCClusterState(policy, ignorablePods); err != nil { + if err = r.validateSCClusterState(policy, ignorablePodNames); err != nil { // reset cluster size newSize := *found.Spec.Replicas + 1 found.Spec.Replicas = &newSize @@ -890,7 +886,7 @@ func (r *SingleClusterReconciler) scaleDownRack( } func (r *SingleClusterReconciler) rollingRestartRack(found *appsv1.StatefulSet, rackState *RackState, - ignorablePods []corev1.Pod, restartTypeMap map[string]RestartType, + ignorablePodNames sets.Set[string], restartTypeMap map[string]RestartType, failedPods []*corev1.Pod) (*appsv1.StatefulSet, reconcileResult) { r.Log.Info("Rolling restart AerospikeCluster statefulset pods") @@ -990,7 +986,7 @@ func (r *SingleClusterReconciler) rollingRestartRack(found *appsv1.StatefulSet, return nil, reconcileError(err) } - if res := r.rollingRestartPods(rackState, podsBatch, ignorablePods, restartTypeMap); !res.isSuccess { + if res := r.rollingRestartPods(rackState, podsBatch, ignorablePodNames, restartTypeMap); !res.isSuccess { return found, res } diff --git a/controllers/reconciler.go b/controllers/reconciler.go index 050c0123b..796147f1a 100644 --- a/controllers/reconciler.go +++ b/controllers/reconciler.go @@ -124,9 +124,16 @@ func (r *SingleClusterReconciler) Reconcile() (ctrl.Result, error) { return reconcile.Result{}, err } + ignorablePodNames, err := r.getIgnorablePods(nil) + if err != nil { + r.Log.Error(err, "Failed to determine pods to be ignored") + + return reconcile.Result{}, err + } + // Check if there is any node with quiesce status. We need to undo that // It may have been left from previous steps - allHostConns, err := r.newAllHostConn() + allHostConns, err := r.newAllHostConnWithOption(ignorablePodNames) if err != nil { e := fmt.Errorf( "failed to get hostConn for aerospike cluster nodes: %v", err, @@ -146,7 +153,7 @@ func (r *SingleClusterReconciler) Reconcile() (ctrl.Result, error) { } // Setup access control. 
- if err := r.validateAndReconcileAccessControl(); err != nil { + if err := r.validateAndReconcileAccessControl(ignorablePodNames); err != nil { r.Log.Error(err, "Failed to Reconcile access control") r.Recorder.Eventf( r.aeroCluster, corev1.EventTypeWarning, "ACLUpdateFailed", @@ -177,7 +184,7 @@ func (r *SingleClusterReconciler) Reconcile() (ctrl.Result, error) { // Redundant safe check to revert migrate-fill-delay if previous revert operation missed/skipped somehow if res := r.setMigrateFillDelay( policy, &r.aeroCluster.Spec.RackConfig.Racks[0].AerospikeConfig, - false, nil, + false, ignorablePodNames, ); !res.isSuccess { r.Log.Error(res.err, "Failed to revert migrate-fill-delay") return reconcile.Result{}, res.err @@ -191,7 +198,7 @@ func (r *SingleClusterReconciler) Reconcile() (ctrl.Result, error) { } // Setup roster - if err := r.getAndSetRoster(policy, r.aeroCluster.Spec.RosterNodeBlockList, nil); err != nil { + if err := r.getAndSetRoster(policy, r.aeroCluster.Spec.RosterNodeBlockList, ignorablePodNames); err != nil { r.Log.Error(err, "Failed to set roster for cluster") return reconcile.Result{}, err } @@ -212,7 +219,7 @@ func (r *SingleClusterReconciler) Reconcile() (ctrl.Result, error) { return reconcile.Result{}, nil } -func (r *SingleClusterReconciler) validateAndReconcileAccessControl() error { +func (r *SingleClusterReconciler) validateAndReconcileAccessControl(ignorablePodNames sets.Set[string]) error { version, err := asdbv1.GetImageVersion(r.aeroCluster.Spec.Image) if err != nil { return err @@ -231,7 +238,7 @@ func (r *SingleClusterReconciler) validateAndReconcileAccessControl() error { } // Create client - conns, err := r.newAllHostConn() + conns, err := r.newAllHostConnWithOption(ignorablePodNames) if err != nil { return fmt.Errorf("failed to get host info: %v", err) } diff --git a/controllers/statefulset.go b/controllers/statefulset.go index b4247c9b8..1d36c7596 100644 --- a/controllers/statefulset.go +++ b/controllers/statefulset.go @@ -192,7 +192,7 @@ func (r *SingleClusterReconciler) createSTS( "StatefulSet.Name", st.Name, ) - if err := r.waitForSTSToBeReady(st); err != nil { + if err := r.waitForSTSToBeReady(st, nil); err != nil { return st, fmt.Errorf( "failed to wait for statefulset to be ready: %v", err, ) @@ -209,7 +209,9 @@ func (r *SingleClusterReconciler) deleteSTS(st *appsv1.StatefulSet) error { return r.Client.Delete(context.TODO(), st) } -func (r *SingleClusterReconciler) waitForSTSToBeReady(st *appsv1.StatefulSet) error { +func (r *SingleClusterReconciler) waitForSTSToBeReady( + st *appsv1.StatefulSet, ignorablePodNames sets.Set[string], +) error { const ( podStatusMaxRetry = 18 podStatusRetryInterval = time.Second * 10 @@ -223,6 +225,9 @@ func (r *SingleClusterReconciler) waitForSTSToBeReady(st *appsv1.StatefulSet) er var podIndex int32 for podIndex = 0; podIndex < *st.Spec.Replicas; podIndex++ { podName := getSTSPodName(st.Name, podIndex) + if ignorablePodNames.Has(podName) { + continue + } var isReady bool @@ -1003,7 +1008,7 @@ func updateSTSContainers( return stsContainers[:idx] } -func (r *SingleClusterReconciler) waitForAllSTSToBeReady() error { +func (r *SingleClusterReconciler) waitForAllSTSToBeReady(ignorablePodNames sets.Set[string]) error { r.Log.Info("Waiting for cluster to be ready") allRackIDs := sets.NewInt() @@ -1032,7 +1037,7 @@ func (r *SingleClusterReconciler) waitForAllSTSToBeReady() error { continue } - if err := r.waitForSTSToBeReady(st); err != nil { + if err := r.waitForSTSToBeReady(st, ignorablePodNames); err != nil { return err 
} } diff --git a/controllers/strong_consistency.go b/controllers/strong_consistency.go index 3ca48582d..7ca99ac66 100644 --- a/controllers/strong_consistency.go +++ b/controllers/strong_consistency.go @@ -1,8 +1,8 @@ package controllers import ( - sets "github.com/deckarep/golang-set/v2" - corev1 "k8s.io/api/core/v1" + gosets "github.com/deckarep/golang-set/v2" + "k8s.io/apimachinery/pkg/util/sets" as "github.com/aerospike/aerospike-client-go/v6" asdbv1 "github.com/aerospike/aerospike-kubernetes-operator/api/v1" @@ -11,9 +11,9 @@ import ( func (r *SingleClusterReconciler) getAndSetRoster( policy *as.ClientPolicy, rosterNodeBlockList []string, - ignorablePods []corev1.Pod, + ignorablePodNames sets.Set[string], ) error { - allHostConns, err := r.newAllHostConnWithOption(ignorablePods) + allHostConns, err := r.newAllHostConnWithOption(ignorablePodNames) if err != nil { return err } @@ -26,8 +26,9 @@ func (r *SingleClusterReconciler) getAndSetRoster( return deployment.GetAndSetRoster(r.Log, allHostConns, policy, rosterNodeBlockList, ignorableNamespaces) } -func (r *SingleClusterReconciler) validateSCClusterState(policy *as.ClientPolicy, ignorablePods []corev1.Pod) error { - allHostConns, err := r.newAllHostConnWithOption(ignorablePods) +func (r *SingleClusterReconciler) validateSCClusterState(policy *as.ClientPolicy, ignorablePodNames sets.Set[string], +) error { + allHostConns, err := r.newAllHostConnWithOption(ignorablePodNames) if err != nil { return err } @@ -42,8 +43,8 @@ func (r *SingleClusterReconciler) validateSCClusterState(policy *as.ClientPolicy func (r *SingleClusterReconciler) addedSCNamespaces(nodesNamespaces map[string][]string) []string { var ( - specSCNamespaces = sets.NewSet[string]() - newAddedSCNamespaces = sets.NewSet[string]() + specSCNamespaces = gosets.NewSet[string]() + newAddedSCNamespaces = gosets.NewSet[string]() ) // Look inside only 1st rack. SC namespaces should be same across all the racks @@ -58,7 +59,7 @@ func (r *SingleClusterReconciler) addedSCNamespaces(nodesNamespaces map[string][ // Check if SC namespaces are present in all node's namespaces, if not then it's a new SC namespace for _, namespaces := range nodesNamespaces { - nodeNamespaces := sets.NewSet[string](namespaces...) + nodeNamespaces := gosets.NewSet[string](namespaces...) newAddedSCNamespaces.Append(specSCNamespaces.Difference(nodeNamespaces).ToSlice()...) } @@ -66,7 +67,7 @@ func (r *SingleClusterReconciler) addedSCNamespaces(nodesNamespaces map[string][ } func (r *SingleClusterReconciler) getIgnorableNamespaces(allHostConns []*deployment.HostConn) ( - sets.Set[string], error) { + gosets.Set[string], error) { nodesNamespaces, err := deployment.GetClusterNamespaces(r.Log, r.getClientPolicy(), allHostConns) if err != nil { return nil, err @@ -75,7 +76,7 @@ func (r *SingleClusterReconciler) getIgnorableNamespaces(allHostConns []*deploym removedNamespaces := r.removedNamespaces(nodesNamespaces) addSCNamespaces := r.addedSCNamespaces(nodesNamespaces) - ignorableNamespaces := sets.NewSet[string](removedNamespaces...) + ignorableNamespaces := gosets.NewSet[string](removedNamespaces...) ignorableNamespaces.Append(addSCNamespaces...) 
return ignorableNamespaces, nil diff --git a/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml b/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml index a60371e1c..06b00dca3 100644 --- a/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml +++ b/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml @@ -277,6 +277,16 @@ spec: - customInterface type: string type: object + ignorePodList: + description: IgnorePodList is the list of pods which are ignored by + the operator while checking the cluster stability and are not considered + part of cluster. This is only useful when there are some failed + pods and operator is required to do some operation on the cluster. + If pods in running state are defined in this list, they are not + ignored + items: + type: string + type: array image: description: Aerospike server image type: string @@ -8886,6 +8896,16 @@ spec: - customInterface type: string type: object + ignorePodList: + description: IgnorePodList is the list of pods which are ignored by + the operator while checking the cluster stability and are not considered + part of cluster. This is only useful when there are some failed + pods and operator is required to do some operation on the cluster. + If pods in running state are defined in this list, they are not + ignored + items: + type: string + type: array image: description: Aerospike server image type: string From 614bdc376dd9969f0b8b602f756abf5a416ea8c1 Mon Sep 17 00:00:00 2001 From: Abhisek Dwivedi Date: Mon, 20 Nov 2023 18:09:01 +0530 Subject: [PATCH 02/17] Added test-cases --- api/v1/aerospikecluster_types.go | 4 +- .../asdb.aerospike.com_aerospikeclusters.yaml | 4 +- controllers/pod.go | 9 +- controllers/rack.go | 12 ++- ..._aerospikeclusters.asdb.aerospike.com.yaml | 4 +- test/cluster_test.go | 91 +++++++++++++++++++ 6 files changed, 111 insertions(+), 13 deletions(-) diff --git a/api/v1/aerospikecluster_types.go b/api/v1/aerospikecluster_types.go index 0846f4ead..f4f50ad16 100644 --- a/api/v1/aerospikecluster_types.go +++ b/api/v1/aerospikecluster_types.go @@ -76,7 +76,7 @@ type AerospikeClusterSpec struct { //nolint:govet // for readability RosterNodeBlockList []string `json:"rosterNodeBlockList,omitempty"` // IgnorePodList is the list of pods which are ignored by the operator while checking the cluster stability and // are not considered part of cluster. This is only useful when there are some failed pods and operator is required - // to do some operation on the cluster. If pods in running state are defined in this list, they are not ignored + // to do some operation on the cluster. If pods in running state are defined in this list, they are not ignored. // +operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Ignore Pod List" IgnorePodList []string `json:"ignorePodList,omitempty"` } @@ -623,7 +623,7 @@ type AerospikeClusterStatusSpec struct { //nolint:govet // for readability RosterNodeBlockList []string `json:"rosterNodeBlockList,omitempty"` // IgnorePodList is the list of pods which are ignored by the operator while checking the cluster stability and // are not considered part of cluster. This is only useful when there are some failed pods and operator is required - // to do some operation on the cluster. 
If pods in running state are defined in this list, they are not ignored + // to do some operation on the cluster. If pods in running state are defined in this list, they are not ignored. IgnorePodList []string `json:"ignorePodList,omitempty"` } diff --git a/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml b/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml index 06b00dca3..30b71f75a 100644 --- a/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml +++ b/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml @@ -283,7 +283,7 @@ spec: part of cluster. This is only useful when there are some failed pods and operator is required to do some operation on the cluster. If pods in running state are defined in this list, they are not - ignored + ignored. items: type: string type: array @@ -8902,7 +8902,7 @@ spec: part of cluster. This is only useful when there are some failed pods and operator is required to do some operation on the cluster. If pods in running state are defined in this list, they are not - ignored + ignored. items: type: string type: array diff --git a/controllers/pod.go b/controllers/pod.go index b86aab7a6..a43c604b1 100644 --- a/controllers/pod.go +++ b/controllers/pod.go @@ -645,7 +645,7 @@ func (r *SingleClusterReconciler) cleanupDanglingPodsRack(sts *appsv1.StatefulSe // getIgnorablePods returns pods: // 1. From racksToDelete that are currently not running and can be ignored in stability checks. -// 2. User given pods in ignorePodList that are currently not running and can be ignored in stability checks. +// 2. User given pods in ignorePodList that are currently not running and can be ignored from stability checks. func (r *SingleClusterReconciler) getIgnorablePods(racksToDelete []asdbv1.Rack) ( sets.Set[string], error, ) { @@ -726,9 +726,14 @@ func (r *SingleClusterReconciler) getClusterPodList() ( return podList, nil } -func (r *SingleClusterReconciler) isAnyPodInImageFailedState(podList []corev1.Pod) bool { +func (r *SingleClusterReconciler) isAnyPodInImageFailedState(podList []corev1.Pod, ignorablePodNames sets.Set[string], +) bool { for idx := range podList { pod := &podList[idx] + if ignorablePodNames.Has(pod.Name) { + continue + } + // TODO: Should we use checkPodFailed or CheckPodImageFailed? // scaleDown, rollingRestart should work even if node is crashed // If node was crashed due to wrong config then only rollingRestart can bring it back. 
diff --git a/controllers/rack.go b/controllers/rack.go index 0f4aded2b..096cdccb0 100644 --- a/controllers/rack.go +++ b/controllers/rack.go @@ -495,7 +495,7 @@ func (r *SingleClusterReconciler) reconcileRack( // Scale up after upgrading, so that new pods come up with new image currentSize = *found.Spec.Replicas if currentSize < desiredSize { - found, res = r.scaleUpRack(found, rackState) + found, res = r.scaleUpRack(found, rackState, ignorablePodNames) if !res.isSuccess { r.Log.Error( res.err, "Failed to scaleUp StatefulSet pods", "stsName", @@ -531,7 +531,9 @@ func (r *SingleClusterReconciler) reconcileRack( return reconcileSuccess() } -func (r *SingleClusterReconciler) scaleUpRack(found *appsv1.StatefulSet, rackState *RackState) ( +func (r *SingleClusterReconciler) scaleUpRack( + found *appsv1.StatefulSet, rackState *RackState, ignorablePodNames sets.Set[string], +) ( *appsv1.StatefulSet, reconcileResult, ) { desiredSize := int32(rackState.Size) @@ -552,7 +554,7 @@ func (r *SingleClusterReconciler) scaleUpRack(found *appsv1.StatefulSet, rackSta return found, reconcileError(fmt.Errorf("failed to list pods: %v", err)) } - if r.isAnyPodInImageFailedState(podList.Items) { + if r.isAnyPodInImageFailedState(podList.Items, ignorablePodNames) { return found, reconcileError(fmt.Errorf("cannot scale up AerospikeCluster. A pod is already in failed state")) } @@ -758,7 +760,7 @@ func (r *SingleClusterReconciler) scaleDownRack( return found, reconcileError(fmt.Errorf("failed to list pods: %v", err)) } - if r.isAnyPodInImageFailedState(oldPodList.Items) { + if r.isAnyPodInImageFailedState(oldPodList.Items, ignorablePodNames) { return found, reconcileError(fmt.Errorf("cannot scale down AerospikeCluster. A pod is already in failed state")) } @@ -920,7 +922,7 @@ func (r *SingleClusterReconciler) rollingRestartRack(found *appsv1.StatefulSet, pods = append(pods, *podList[idx]) } - if len(failedPods) != 0 && r.isAnyPodInImageFailedState(pods) { + if len(failedPods) != 0 && r.isAnyPodInImageFailedState(pods, ignorablePodNames) { return found, reconcileError( fmt.Errorf( "cannot Rolling restart AerospikeCluster. " + diff --git a/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml b/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml index 06b00dca3..30b71f75a 100644 --- a/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml +++ b/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml @@ -283,7 +283,7 @@ spec: part of cluster. This is only useful when there are some failed pods and operator is required to do some operation on the cluster. If pods in running state are defined in this list, they are not - ignored + ignored. items: type: string type: array @@ -8902,7 +8902,7 @@ spec: part of cluster. This is only useful when there are some failed pods and operator is required to do some operation on the cluster. If pods in running state are defined in this list, they are not - ignored + ignored. 
items: type: string type: array diff --git a/test/cluster_test.go b/test/cluster_test.go index d5af68e05..9836a166b 100644 --- a/test/cluster_test.go +++ b/test/cluster_test.go @@ -45,6 +45,11 @@ var _ = Describe( // DeployClusterWithSyslog(ctx) // }, // ) + Context( + "DeployClusterWithSyslog", func() { + clusterWithIgnorePodList(ctx) + }, + ) Context( "CommonNegativeClusterValidationTest", func() { NegativeClusterValidationTest(ctx) @@ -124,6 +129,92 @@ func ScaleDownWithMigrateFillDelay(ctx goctx.Context) { ) } +func clusterWithIgnorePodList(ctx goctx.Context) { + Context( + "UpdateClusterWithIgnorePodList", func() { + clusterNamespacedName := getNamespacedName( + "ignore-pod-cluster", namespace, + ) + + var ( + aeroCluster *asdbv1.AerospikeCluster + err error + ) + + BeforeEach( + func() { + aeroCluster = createDummyAerospikeCluster(clusterNamespacedName, 4) + racks := getDummyRackConf(1, 2) + aeroCluster.Spec.RackConfig = asdbv1.RackConfig{Racks: racks} + err = deployCluster(k8sClient, ctx, aeroCluster) + Expect(err).ToNot(HaveOccurred()) + }, + ) + + AfterEach( + func() { + err = deleteCluster(k8sClient, ctx, aeroCluster) + Expect(err).ToNot(HaveOccurred()) + }, + ) + + It( + "Should allow cluster operations with failed pods", func() { + By("Fail 2-0 aerospike pod") + pod := &v1.Pod{} + ignorePodName := clusterNamespacedName.Name + "-2-0" + + err = k8sClient.Get(ctx, types.NamespacedName{Name: ignorePodName, + Namespace: clusterNamespacedName.Namespace}, pod) + Expect(err).ToNot(HaveOccurred()) + + // This will lead to pod 2-0 pod in failed state + pod.Spec.Containers[0].Image = "wrong-image" + err = k8sClient.Update(ctx, pod) + Expect(err).ToNot(HaveOccurred()) + + By("Set IgnorePodList and scale down 1 pod") + aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) + Expect(err).ToNot(HaveOccurred()) + aeroCluster.Spec.IgnorePodList = []string{ignorePodName} + aeroCluster.Spec.Size-- + err = updateCluster(k8sClient, ctx, aeroCluster) + Expect(err).ToNot(HaveOccurred()) + + By("Rolling restart cluster") + aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) + Expect(err).ToNot(HaveOccurred()) + aeroCluster.Spec.AerospikeConfig.Value["service"].(map[string]interface{})["proto-fd-max"] = int64(18000) + err = updateCluster(k8sClient, ctx, aeroCluster) + Expect(err).ToNot(HaveOccurred()) + + By("Upgrade version") + aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) + Expect(err).ToNot(HaveOccurred()) + aeroCluster.Spec.Image = baseImage + ":6.4.0.4" + err = updateCluster(k8sClient, ctx, aeroCluster) + Expect(err).ToNot(HaveOccurred()) + + By("Scale up") + aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) + Expect(err).ToNot(HaveOccurred()) + aeroCluster.Spec.IgnorePodList = []string{ignorePodName} + aeroCluster.Spec.Size++ + err = updateCluster(k8sClient, ctx, aeroCluster) + Expect(err).ToNot(HaveOccurred()) + + By("Verify pod 2-0 is still in failed state") + err = k8sClient.Get(ctx, types.NamespacedName{Name: ignorePodName, + Namespace: clusterNamespacedName.Namespace}, pod) + Expect(err).ToNot(HaveOccurred()) + Expect(*pod.Status.ContainerStatuses[0].Started).To(BeFalse()) + Expect(pod.Status.ContainerStatuses[0].Ready).To(BeFalse()) + }, + ) + }, + ) +} + // Test cluster deployment with all image post 4.9.0 func DeployClusterForAllImagesPost490(ctx goctx.Context) { // post 4.9.0, need feature-key file From 35cef30c775d5a7081231f56fbf54abba601c883 Mon Sep 17 00:00:00 2001 From: Abhisek Dwivedi Date: Thu, 23 Nov 2023 
13:43:05 +0530 Subject: [PATCH 03/17] Resolved review comments --- api/v1/aerospikecluster_types.go | 2 +- ...rnetes-operator.clusterserviceversion.yaml | 4 ++-- controllers/aero_info_calls.go | 9 +++++---- test/cluster_test.go | 20 ++++++++++++++++--- 4 files changed, 25 insertions(+), 10 deletions(-) diff --git a/api/v1/aerospikecluster_types.go b/api/v1/aerospikecluster_types.go index f4f50ad16..e7b01dcd3 100644 --- a/api/v1/aerospikecluster_types.go +++ b/api/v1/aerospikecluster_types.go @@ -72,7 +72,7 @@ type AerospikeClusterSpec struct { //nolint:govet // for readability // +operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Seeds Finder Services" SeedsFinderServices SeedsFinderServices `json:"seedsFinderServices,omitempty"` // RosterNodeBlockList is a list of blocked nodeIDs from roster in a strong-consistency setup - // +operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Roster NodeB lockList" + // +operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Roster Node BlockList" RosterNodeBlockList []string `json:"rosterNodeBlockList,omitempty"` // IgnorePodList is the list of pods which are ignored by the operator while checking the cluster stability and // are not considered part of cluster. This is only useful when there are some failed pods and operator is required diff --git a/config/manifests/bases/aerospike-kubernetes-operator.clusterserviceversion.yaml b/config/manifests/bases/aerospike-kubernetes-operator.clusterserviceversion.yaml index 795d2ec4e..2b8c3c42e 100644 --- a/config/manifests/bases/aerospike-kubernetes-operator.clusterserviceversion.yaml +++ b/config/manifests/bases/aerospike-kubernetes-operator.clusterserviceversion.yaml @@ -51,7 +51,7 @@ spec: while checking the cluster stability and are not considered part of cluster. This is only useful when there are some failed pods and operator is required to do some operation on the cluster. If pods in running state are defined - in this list, they are not ignored + in this list, they are not ignored. displayName: Ignore Pod List path: ignorePodList - description: Aerospike server image @@ -69,7 +69,7 @@ spec: path: rackConfig - description: RosterNodeBlockList is a list of blocked nodeIDs from roster in a strong-consistency setup - displayName: Roster NodeB lockList + displayName: Roster Node BlockList path: rosterNodeBlockList - description: SeedsFinderServices creates additional Kubernetes service that allow clients to discover Aerospike cluster nodes. diff --git a/controllers/aero_info_calls.go b/controllers/aero_info_calls.go index 0c8ca967c..093da495b 100644 --- a/controllers/aero_info_calls.go +++ b/controllers/aero_info_calls.go @@ -30,10 +30,11 @@ import ( // Aerospike helper // ------------------------------------------------------------------------------------ -// waitForMultipleNodesSafeStopReady waits util the input pods is safe to stop, -// skipping pods that are not running and present in ignorablePods for stability check. -// The ignorablePods list should be a list of failed or pending pods that are going to be -// deleted eventually and are safe to ignore in stability checks. +// waitForMultipleNodesSafeStopReady waits until the input pods are safe to stop, +// skipping pods that are not running and present in ignorablePodNames for stability check. +// The ignorablePodNames is the list of failed or pending pods that are either:: +// 1. going to be deleted eventually and are safe to ignore in stability checks +// 2. 
given in ignorePodList by the user and are safe to ignore in stability checks func (r *SingleClusterReconciler) waitForMultipleNodesSafeStopReady( pods []*corev1.Pod, ignorablePodNames sets.Set[string], ) reconcileResult { diff --git a/test/cluster_test.go b/test/cluster_test.go index 9836a166b..5da007628 100644 --- a/test/cluster_test.go +++ b/test/cluster_test.go @@ -46,7 +46,7 @@ var _ = Describe( // }, // ) Context( - "DeployClusterWithSyslog", func() { + "DeployClusterWithIgnorePodList", func() { clusterWithIgnorePodList(ctx) }, ) @@ -191,14 +191,14 @@ func clusterWithIgnorePodList(ctx goctx.Context) { By("Upgrade version") aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) Expect(err).ToNot(HaveOccurred()) - aeroCluster.Spec.Image = baseImage + ":6.4.0.4" + newImage := baseImage + ":6.4.0.4" + aeroCluster.Spec.Image = newImage err = updateCluster(k8sClient, ctx, aeroCluster) Expect(err).ToNot(HaveOccurred()) By("Scale up") aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) Expect(err).ToNot(HaveOccurred()) - aeroCluster.Spec.IgnorePodList = []string{ignorePodName} aeroCluster.Spec.Size++ err = updateCluster(k8sClient, ctx, aeroCluster) Expect(err).ToNot(HaveOccurred()) @@ -209,6 +209,20 @@ func clusterWithIgnorePodList(ctx goctx.Context) { Expect(err).ToNot(HaveOccurred()) Expect(*pod.Status.ContainerStatuses[0].Started).To(BeFalse()) Expect(pod.Status.ContainerStatuses[0].Ready).To(BeFalse()) + + By("Remove pod from IgnorePodList and verify pod 2-0 is in running state") + aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) + Expect(err).ToNot(HaveOccurred()) + aeroCluster.Spec.IgnorePodList = []string{} + err = updateCluster(k8sClient, ctx, aeroCluster) + Expect(err).ToNot(HaveOccurred()) + + err = k8sClient.Get(ctx, types.NamespacedName{Name: ignorePodName, + Namespace: clusterNamespacedName.Namespace}, pod) + Expect(err).ToNot(HaveOccurred()) + Expect(*pod.Status.ContainerStatuses[0].Started).To(BeTrue()) + Expect(pod.Status.ContainerStatuses[0].Ready).To(BeTrue()) + Expect(pod.Spec.Containers[0].Image).To(Equal(newImage)) }, ) }, From 4fedc437558768dff5d81e76873e42e48e64e803 Mon Sep 17 00:00:00 2001 From: Abhisek Dwivedi Date: Mon, 27 Nov 2023 18:37:08 +0530 Subject: [PATCH 04/17] Fix minor bug --- controllers/pod.go | 8 ++++++-- controllers/rack.go | 23 ++++++++++++++--------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/controllers/pod.go b/controllers/pod.go index a43c604b1..0cd28b466 100644 --- a/controllers/pod.go +++ b/controllers/pod.go @@ -51,7 +51,7 @@ func mergeRestartType(current, incoming RestartType) RestartType { // Fetching RestartType of all pods, based on the operation being performed. func (r *SingleClusterReconciler) getRollingRestartTypeMap( - rackState *RackState, pods []*corev1.Pod, + rackState *RackState, pods []*corev1.Pod, ignorablePodNames sets.Set[string], ) (map[string]RestartType, error) { var addedNSDevices []string @@ -65,9 +65,13 @@ func (r *SingleClusterReconciler) getRollingRestartTypeMap( requiredConfHash := confMap.Data[aerospikeConfHashFileName] for idx := range pods { + if ignorablePodNames.Has(pods[idx].Name) { + continue + } + podStatus := r.aeroCluster.Status.Pods[pods[idx].Name] if addedNSDevices == nil && podStatus.AerospikeConfigHash != requiredConfHash { - // Fetching all block devices that has been added in namespaces. + // Fetching all block devices that have been added in namespaces. 
addedNSDevices, err = r.getNSAddedDevices(rackState) if err != nil { return nil, err diff --git a/controllers/rack.go b/controllers/rack.go index 096cdccb0..9493eddf1 100644 --- a/controllers/rack.go +++ b/controllers/rack.go @@ -350,7 +350,7 @@ func (r *SingleClusterReconciler) upgradeOrRollingRestartRack(found *appsv1.Stat // Always update configMap. We won't be able to find if a rack's config, and it's pod config is in sync or not // Checking rack.spec, rack.status will not work. // We may change config, let some pods restart with new config and then change config back to original value. - // Now rack.spec, rack.status will be same but few pods will have changed config. + // Now rack.spec, rack.status will be same, but few pods will have changed config. // So a check based on spec and status will skip configMap update. // Hence, a rolling restart of pod will never bring pod to desired config if err := r.updateSTSConfigMap( @@ -367,7 +367,7 @@ func (r *SingleClusterReconciler) upgradeOrRollingRestartRack(found *appsv1.Stat } // Upgrade - upgradeNeeded, err := r.isRackUpgradeNeeded(rackState.Rack.ID) + upgradeNeeded, err := r.isRackUpgradeNeeded(rackState.Rack.ID, ignorablePodNames) if err != nil { return found, reconcileError(err) } @@ -392,7 +392,7 @@ func (r *SingleClusterReconciler) upgradeOrRollingRestartRack(found *appsv1.Stat return found, res } } else { - var needRollingRestartRack, restartTypeMap, nErr = r.needRollingRestartRack(rackState) + var needRollingRestartRack, restartTypeMap, nErr = r.needRollingRestartRack(rackState, ignorablePodNames) if nErr != nil { return found, reconcileError(nErr) } @@ -513,7 +513,7 @@ func (r *SingleClusterReconciler) reconcileRack( } } - // All regular operation are complete. Take time and cleanup dangling nodes that have not been cleaned up + // All regular operations are complete. Take time and cleanup dangling nodes that have not been cleaned up // previously due to errors. 
if err := r.cleanupDanglingPodsRack(found, rackState); err != nil { return reconcileError(err) @@ -654,7 +654,7 @@ func (r *SingleClusterReconciler) upgradeRack(statefulSet *appsv1.StatefulSet, r ) } - // Find pods which needs to be updated + // Find pods which need to be updated podsToUpgrade := make([]*corev1.Pod, 0, len(podList)) for idx := range podList { @@ -942,7 +942,7 @@ func (r *SingleClusterReconciler) rollingRestartRack(found *appsv1.StatefulSet, "Statefulset spec updated - doing rolling restart", ) - // Find pods which needs restart + // Find pods which need restart podsToRestart := make([]*corev1.Pod, 0, len(podList)) for idx := range podList { @@ -1013,7 +1013,7 @@ func (r *SingleClusterReconciler) rollingRestartRack(found *appsv1.StatefulSet, return found, reconcileSuccess() } -func (r *SingleClusterReconciler) needRollingRestartRack(rackState *RackState) ( +func (r *SingleClusterReconciler) needRollingRestartRack(rackState *RackState, ignorablePodNames sets.Set[string]) ( needRestart bool, restartTypeMap map[string]RestartType, err error, ) { podList, err := r.getOrderedRackPodList(rackState.Rack.ID) @@ -1021,7 +1021,7 @@ func (r *SingleClusterReconciler) needRollingRestartRack(rackState *RackState) ( return false, nil, fmt.Errorf("failed to list pods: %v", err) } - restartTypeMap, err = r.getRollingRestartTypeMap(rackState, podList) + restartTypeMap, err = r.getRollingRestartTypeMap(rackState, podList, ignorablePodNames) if err != nil { return false, nil, err } @@ -1035,7 +1035,7 @@ func (r *SingleClusterReconciler) needRollingRestartRack(rackState *RackState) ( return false, nil, nil } -func (r *SingleClusterReconciler) isRackUpgradeNeeded(rackID int) ( +func (r *SingleClusterReconciler) isRackUpgradeNeeded(rackID int, ignorablePodNames sets.Set[string]) ( bool, error, ) { podList, err := r.getRackPodList(rackID) @@ -1045,6 +1045,11 @@ func (r *SingleClusterReconciler) isRackUpgradeNeeded(rackID int) ( for idx := range podList.Items { pod := &podList.Items[idx] + + if ignorablePodNames.Has(pod.Name) { + continue + } + if !r.isPodOnDesiredImage(pod, true) { r.Log.Info("Pod needs upgrade/downgrade", "podName", pod.Name) return true, nil From f03adebea75f3073f5e887b36a70c142b39890a9 Mon Sep 17 00:00:00 2001 From: Abhisek Dwivedi Date: Mon, 27 Nov 2023 21:42:50 +0530 Subject: [PATCH 05/17] Added test-case --- api/v1/aerospikecluster_types.go | 14 +-- .../asdb.aerospike.com_aerospikeclusters.yaml | 24 ++--- controllers/rack.go | 10 ++ ..._aerospikeclusters.asdb.aerospike.com.yaml | 24 ++--- test/cluster_test.go | 92 ++++++++++++++----- 5 files changed, 109 insertions(+), 55 deletions(-) diff --git a/api/v1/aerospikecluster_types.go b/api/v1/aerospikecluster_types.go index e7b01dcd3..bcbb7cd37 100644 --- a/api/v1/aerospikecluster_types.go +++ b/api/v1/aerospikecluster_types.go @@ -74,9 +74,10 @@ type AerospikeClusterSpec struct { //nolint:govet // for readability // RosterNodeBlockList is a list of blocked nodeIDs from roster in a strong-consistency setup // +operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Roster Node BlockList" RosterNodeBlockList []string `json:"rosterNodeBlockList,omitempty"` - // IgnorePodList is the list of pods which are ignored by the operator while checking the cluster stability and - // are not considered part of cluster. This is only useful when there are some failed pods and operator is required - // to do some operation on the cluster. If pods in running state are defined in this list, they are not ignored. 
+ // IgnorePodList is a list of pods that the operator will ignore while assessing cluster stability. + // Pods specified in this list are not considered part of the cluster. This is particularly useful when + // there are failed pods and the operator needs to perform certain operations on the cluster. Note that + // running pods included in this list will not be ignored. // +operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Ignore Pod List" IgnorePodList []string `json:"ignorePodList,omitempty"` } @@ -621,9 +622,10 @@ type AerospikeClusterStatusSpec struct { //nolint:govet // for readability SeedsFinderServices SeedsFinderServices `json:"seedsFinderServices,omitempty"` // RosterNodeBlockList is a list of blocked nodeIDs from roster in a strong-consistency setup RosterNodeBlockList []string `json:"rosterNodeBlockList,omitempty"` - // IgnorePodList is the list of pods which are ignored by the operator while checking the cluster stability and - // are not considered part of cluster. This is only useful when there are some failed pods and operator is required - // to do some operation on the cluster. If pods in running state are defined in this list, they are not ignored. + // IgnorePodList is a list of pods that the operator will ignore while assessing cluster stability. + // Pods specified in this list are not considered part of the cluster. This is particularly useful when + // there are failed pods and the operator needs to perform certain operations on the cluster. Note that + // running pods included in this list will not be ignored. IgnorePodList []string `json:"ignorePodList,omitempty"` } diff --git a/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml b/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml index 30b71f75a..6d64a64b9 100644 --- a/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml +++ b/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml @@ -278,12 +278,12 @@ spec: type: string type: object ignorePodList: - description: IgnorePodList is the list of pods which are ignored by - the operator while checking the cluster stability and are not considered - part of cluster. This is only useful when there are some failed - pods and operator is required to do some operation on the cluster. - If pods in running state are defined in this list, they are not - ignored. + description: IgnorePodList is a list of pods that the operator will + ignore while assessing cluster stability. Pods specified in this + list are not considered part of the cluster. This is particularly + useful when there are failed pods and the operator needs to perform + certain operations on the cluster. Note that running pods included + in this list will not be ignored. items: type: string type: array @@ -8897,12 +8897,12 @@ spec: type: string type: object ignorePodList: - description: IgnorePodList is the list of pods which are ignored by - the operator while checking the cluster stability and are not considered - part of cluster. This is only useful when there are some failed - pods and operator is required to do some operation on the cluster. - If pods in running state are defined in this list, they are not - ignored. + description: IgnorePodList is a list of pods that the operator will + ignore while assessing cluster stability. Pods specified in this + list are not considered part of the cluster. This is particularly + useful when there are failed pods and the operator needs to perform + certain operations on the cluster. 
Note that running pods included + in this list will not be ignored. items: type: string type: array diff --git a/controllers/rack.go b/controllers/rack.go index 9493eddf1..263a4e380 100644 --- a/controllers/rack.go +++ b/controllers/rack.go @@ -661,6 +661,11 @@ func (r *SingleClusterReconciler) upgradeRack(statefulSet *appsv1.StatefulSet, r pod := podList[idx] r.Log.Info("Check if pod needs upgrade or not", "podName", pod.Name) + if ignorablePodNames.Has(pod.Name) { + r.Log.Info("Pod found in ignore pod list, skipping", "podName", pod.Name) + continue + } + if r.isPodUpgraded(pod) { r.Log.Info("Pod doesn't need upgrade", "podName", pod.Name) continue @@ -948,6 +953,11 @@ func (r *SingleClusterReconciler) rollingRestartRack(found *appsv1.StatefulSet, for idx := range podList { pod := podList[idx] + if ignorablePodNames.Has(pod.Name) { + r.Log.Info("Pod found in ignore pod list, skipping", "podName", pod.Name) + continue + } + restartType := restartTypeMap[pod.Name] if restartType == noRestart { r.Log.Info("This Pod doesn't need rolling restart, Skip this", "pod", pod.Name) diff --git a/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml b/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml index 30b71f75a..6d64a64b9 100644 --- a/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml +++ b/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml @@ -278,12 +278,12 @@ spec: type: string type: object ignorePodList: - description: IgnorePodList is the list of pods which are ignored by - the operator while checking the cluster stability and are not considered - part of cluster. This is only useful when there are some failed - pods and operator is required to do some operation on the cluster. - If pods in running state are defined in this list, they are not - ignored. + description: IgnorePodList is a list of pods that the operator will + ignore while assessing cluster stability. Pods specified in this + list are not considered part of the cluster. This is particularly + useful when there are failed pods and the operator needs to perform + certain operations on the cluster. Note that running pods included + in this list will not be ignored. items: type: string type: array @@ -8897,12 +8897,12 @@ spec: type: string type: object ignorePodList: - description: IgnorePodList is the list of pods which are ignored by - the operator while checking the cluster stability and are not considered - part of cluster. This is only useful when there are some failed - pods and operator is required to do some operation on the cluster. - If pods in running state are defined in this list, they are not - ignored. + description: IgnorePodList is a list of pods that the operator will + ignore while assessing cluster stability. Pods specified in this + list are not considered part of the cluster. This is particularly + useful when there are failed pods and the operator needs to perform + certain operations on the cluster. Note that running pods included + in this list will not be ignored. 
Note that running pods included
+            in this list will not be ignored.
items: type: string type: array diff --git a/test/cluster_test.go b/test/cluster_test.go index 5da007628..ea2d0b3cc 100644 --- a/test/cluster_test.go +++ b/test/cluster_test.go @@ -139,36 +139,15 @@ func clusterWithIgnorePodList(ctx goctx.Context) { var ( aeroCluster *asdbv1.AerospikeCluster err error - ) - - BeforeEach( - func() { - aeroCluster = createDummyAerospikeCluster(clusterNamespacedName, 4) - racks := getDummyRackConf(1, 2) - aeroCluster.Spec.RackConfig = asdbv1.RackConfig{Racks: racks} - err = deployCluster(k8sClient, ctx, aeroCluster) - Expect(err).ToNot(HaveOccurred()) - }, - ) - - AfterEach( - func() { - err = deleteCluster(k8sClient, ctx, aeroCluster) - Expect(err).ToNot(HaveOccurred()) - }, - ) - It( - "Should allow cluster operations with failed pods", func() { - By("Fail 2-0 aerospike pod") + testClusterLifecycle = func(ignorePodName string) { + By(fmt.Sprintf("Fail %s aerospike pod", ignorePodName)) pod := &v1.Pod{} - ignorePodName := clusterNamespacedName.Name + "-2-0" err = k8sClient.Get(ctx, types.NamespacedName{Name: ignorePodName, Namespace: clusterNamespacedName.Namespace}, pod) Expect(err).ToNot(HaveOccurred()) - // This will lead to pod 2-0 pod in failed state pod.Spec.Containers[0].Image = "wrong-image" err = k8sClient.Update(ctx, pod) Expect(err).ToNot(HaveOccurred()) @@ -203,14 +182,15 @@ func clusterWithIgnorePodList(ctx goctx.Context) { err = updateCluster(k8sClient, ctx, aeroCluster) Expect(err).ToNot(HaveOccurred()) - By("Verify pod 2-0 is still in failed state") + By(fmt.Sprintf("Verify pod %s is still in failed state", ignorePodName)) err = k8sClient.Get(ctx, types.NamespacedName{Name: ignorePodName, Namespace: clusterNamespacedName.Namespace}, pod) Expect(err).ToNot(HaveOccurred()) Expect(*pod.Status.ContainerStatuses[0].Started).To(BeFalse()) Expect(pod.Status.ContainerStatuses[0].Ready).To(BeFalse()) - By("Remove pod from IgnorePodList and verify pod 2-0 is in running state") + By(fmt.Sprintf( + "Remove pod from IgnorePodList and verify pod %s is in running state", ignorePodName)) aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) Expect(err).ToNot(HaveOccurred()) aeroCluster.Spec.IgnorePodList = []string{} @@ -223,6 +203,68 @@ func clusterWithIgnorePodList(ctx goctx.Context) { Expect(*pod.Status.ContainerStatuses[0].Started).To(BeTrue()) Expect(pod.Status.ContainerStatuses[0].Ready).To(BeTrue()) Expect(pod.Spec.Containers[0].Image).To(Equal(newImage)) + } + ) + + BeforeEach( + func() { + aeroCluster = createDummyAerospikeCluster(clusterNamespacedName, 4) + racks := getDummyRackConf(1, 2) + aeroCluster.Spec.RackConfig = asdbv1.RackConfig{Racks: racks} + err = deployCluster(k8sClient, ctx, aeroCluster) + Expect(err).ToNot(HaveOccurred()) + }, + ) + + AfterEach( + func() { + err = deleteCluster(k8sClient, ctx, aeroCluster) + Expect(err).ToNot(HaveOccurred()) + }, + ) + + It( + "Should allow cluster operations with random failed pod", func() { + // test with failed pod in between statefulset replicas + testClusterLifecycle(clusterNamespacedName.Name + "-2-0") + }, + ) + + It( + "Should allow cluster operations with sequential(last replica) failed pod", func() { + // test with last replica of statefulset as failed pod + testClusterLifecycle(clusterNamespacedName.Name + "-1-1") + }, + ) + + It( + "Should allow rack deletion with failed pods in different rack", func() { + By("Fail 1-1 aerospike pod") + ignorePodName := clusterNamespacedName.Name + "-1-1" + pod := &v1.Pod{} + + err = k8sClient.Get(ctx, types.NamespacedName{Name: 
ignorePodName, + Namespace: clusterNamespacedName.Namespace}, pod) + Expect(err).ToNot(HaveOccurred()) + + pod.Spec.Containers[0].Image = "wrong-image" + err = k8sClient.Update(ctx, pod) + Expect(err).ToNot(HaveOccurred()) + + By("Delete rack with id 2") + aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) + Expect(err).ToNot(HaveOccurred()) + aeroCluster.Spec.IgnorePodList = []string{ignorePodName} + aeroCluster.Spec.RackConfig = asdbv1.RackConfig{Racks: getDummyRackConf(1)} + err = updateCluster(k8sClient, ctx, aeroCluster) + Expect(err).ToNot(HaveOccurred()) + + By(fmt.Sprintf("Verify pod %s is still in failed state", ignorePodName)) + err = k8sClient.Get(ctx, types.NamespacedName{Name: ignorePodName, + Namespace: clusterNamespacedName.Namespace}, pod) + Expect(err).ToNot(HaveOccurred()) + Expect(*pod.Status.ContainerStatuses[0].Started).To(BeFalse()) + Expect(pod.Status.ContainerStatuses[0].Ready).To(BeFalse()) }, ) }, From a148ed83ca0cf17caa4b1f70e5a9c261acc33faf Mon Sep 17 00:00:00 2001 From: Abhisek Dwivedi Date: Tue, 28 Nov 2023 18:29:11 +0530 Subject: [PATCH 06/17] test --- api/v1/aerospikecluster_types.go | 2 ++ .../asdb.aerospike.com_aerospikeclusters.yaml | 4 ++++ ...rnetes-operator.clusterserviceversion.yaml | 10 ++++---- controllers/pod.go | 24 +++++++++++-------- controllers/rack.go | 2 +- controllers/reconciler.go | 2 +- 6 files changed, 27 insertions(+), 17 deletions(-) diff --git a/api/v1/aerospikecluster_types.go b/api/v1/aerospikecluster_types.go index bcbb7cd37..156ed52e5 100644 --- a/api/v1/aerospikecluster_types.go +++ b/api/v1/aerospikecluster_types.go @@ -283,6 +283,8 @@ type RackConfig struct { //nolint:govet // for readability // RollingUpdateBatchSize is the percentage/number of rack pods that will be restarted simultaneously // +optional RollingUpdateBatchSize *intstr.IntOrString `json:"rollingUpdateBatchSize,omitempty"` + + MaxUnavailable int `json:"maxUnavailable,omitempty"` } // Rack specifies single rack config diff --git a/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml b/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml index 6d64a64b9..e343dd23b 100644 --- a/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml +++ b/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml @@ -4603,6 +4603,8 @@ spec: Aerospike cluster. Pods will be deployed in given racks based on given configuration properties: + maxUnavailable: + type: integer namespaces: description: List of Aerospike namespaces for which rack feature will be enabled @@ -13350,6 +13352,8 @@ spec: given configuration nullable: true properties: + maxUnavailable: + type: integer namespaces: description: List of Aerospike namespaces for which rack feature will be enabled diff --git a/config/manifests/bases/aerospike-kubernetes-operator.clusterserviceversion.yaml b/config/manifests/bases/aerospike-kubernetes-operator.clusterserviceversion.yaml index 2b8c3c42e..1a03a8841 100644 --- a/config/manifests/bases/aerospike-kubernetes-operator.clusterserviceversion.yaml +++ b/config/manifests/bases/aerospike-kubernetes-operator.clusterserviceversion.yaml @@ -47,11 +47,11 @@ spec: the Aerospike cluster. displayName: Aerospike Network Policy path: aerospikeNetworkPolicy - - description: IgnorePodList is the list of pods which are ignored by the operator - while checking the cluster stability and are not considered part of cluster. - This is only useful when there are some failed pods and operator is required - to do some operation on the cluster. 
If pods in running state are defined - in this list, they are not ignored. + - description: IgnorePodList is a list of pods that the operator will ignore + while assessing cluster stability. Pods specified in this list are not considered + part of the cluster. This is particularly useful when there are failed pods + and the operator needs to perform certain operations on the cluster. Note + that running pods included in this list will not be ignored. displayName: Ignore Pod List path: ignorePodList - description: Aerospike server image diff --git a/controllers/pod.go b/controllers/pod.go index 0cd28b466..1efd76345 100644 --- a/controllers/pod.go +++ b/controllers/pod.go @@ -650,7 +650,7 @@ func (r *SingleClusterReconciler) cleanupDanglingPodsRack(sts *appsv1.StatefulSe // getIgnorablePods returns pods: // 1. From racksToDelete that are currently not running and can be ignored in stability checks. // 2. User given pods in ignorePodList that are currently not running and can be ignored from stability checks. -func (r *SingleClusterReconciler) getIgnorablePods(racksToDelete []asdbv1.Rack) ( +func (r *SingleClusterReconciler) getIgnorablePods(racksToDelete []asdbv1.Rack, configureRacks []RackState) ( sets.Set[string], error, ) { ignorablePodNames := sets.Set[string]{} @@ -669,17 +669,21 @@ func (r *SingleClusterReconciler) getIgnorablePods(racksToDelete []asdbv1.Rack) } } - podList, err := r.getClusterPodList() - if err != nil { - return nil, err - } + for idx := range configureRacks { + rack := &configureRacks[idx] + failedAllowed := r.aeroCluster.Spec.RackConfig.MaxUnavailable - userIgnorePodSet := sets.NewString(r.aeroCluster.Spec.IgnorePodList...) + podList, err := r.getRackPodList(rack.Rack.ID) + if err != nil { + return nil, err + } - for podIdx := range podList.Items { - pod := &podList.Items[podIdx] - if userIgnorePodSet.Has(pod.Name) && !utils.IsPodRunningAndReady(pod) { - ignorablePodNames.Insert(pod.Name) + for podIdx := range podList.Items { + pod := &podList.Items[podIdx] + if !utils.IsPodRunningAndReady(pod) && failedAllowed > 0 { + ignorablePodNames.Insert(pod.Name) + failedAllowed-- + } } } diff --git a/controllers/rack.go b/controllers/rack.go index 263a4e380..aa6f0670e 100644 --- a/controllers/rack.go +++ b/controllers/rack.go @@ -45,7 +45,7 @@ func (r *SingleClusterReconciler) reconcileRacks() reconcileResult { rackIDsToDelete = append(rackIDsToDelete, racksToDelete[idx].ID) } - ignorablePodNames, err := r.getIgnorablePods(racksToDelete) + ignorablePodNames, err := r.getIgnorablePods(racksToDelete, rackStateList) if err != nil { return reconcileError(err) } diff --git a/controllers/reconciler.go b/controllers/reconciler.go index 796147f1a..7d97e0437 100644 --- a/controllers/reconciler.go +++ b/controllers/reconciler.go @@ -124,7 +124,7 @@ func (r *SingleClusterReconciler) Reconcile() (ctrl.Result, error) { return reconcile.Result{}, err } - ignorablePodNames, err := r.getIgnorablePods(nil) + ignorablePodNames, err := r.getIgnorablePods(nil, getConfiguredRackStateList(r.aeroCluster)) if err != nil { r.Log.Error(err, "Failed to determine pods to be ignored") From d0eab469cd2533699d39a6e01ab633983d375d33 Mon Sep 17 00:00:00 2001 From: Abhisek Dwivedi Date: Wed, 29 Nov 2023 12:48:47 +0530 Subject: [PATCH 07/17] test --- api/v1/aerospikecluster_types.go | 3 +- .../asdb.aerospike.com_aerospikeclusters.yaml | 4 +-- controllers/pod.go | 31 ++++++++++++++++--- go.mod | 2 +- go.sum | 2 ++ pkg/utils/pod.go | 4 +-- test/cluster_test.go | 2 +- 7 files changed, 36 insertions(+), 12 
deletions(-) diff --git a/api/v1/aerospikecluster_types.go b/api/v1/aerospikecluster_types.go index 156ed52e5..0664fd3f5 100644 --- a/api/v1/aerospikecluster_types.go +++ b/api/v1/aerospikecluster_types.go @@ -284,7 +284,8 @@ type RackConfig struct { //nolint:govet // for readability // +optional RollingUpdateBatchSize *intstr.IntOrString `json:"rollingUpdateBatchSize,omitempty"` - MaxUnavailable int `json:"maxUnavailable,omitempty"` + // +optional + MaxIgnorableFailedPods int `json:"maxIgnorableFailedPods,omitempty"` } // Rack specifies single rack config diff --git a/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml b/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml index e343dd23b..552f56a7a 100644 --- a/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml +++ b/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml @@ -4603,7 +4603,7 @@ spec: Aerospike cluster. Pods will be deployed in given racks based on given configuration properties: - maxUnavailable: + maxIgnorableFailedPods: type: integer namespaces: description: List of Aerospike namespaces for which rack feature @@ -13352,7 +13352,7 @@ spec: given configuration nullable: true properties: - maxUnavailable: + maxIgnorableFailedPods: type: integer namespaces: description: List of Aerospike namespaces for which rack feature diff --git a/controllers/pod.go b/controllers/pod.go index 1efd76345..474e42e65 100644 --- a/controllers/pod.go +++ b/controllers/pod.go @@ -671,25 +671,46 @@ func (r *SingleClusterReconciler) getIgnorablePods(racksToDelete []asdbv1.Rack, for idx := range configureRacks { rack := &configureRacks[idx] - failedAllowed := r.aeroCluster.Spec.RackConfig.MaxUnavailable + failedAllowed := r.aeroCluster.Spec.RackConfig.MaxIgnorableFailedPods podList, err := r.getRackPodList(rack.Rack.ID) if err != nil { return nil, err } + var ( + failedPod []string + pendingPod []string + ) + for podIdx := range podList.Items { pod := &podList.Items[podIdx] - if !utils.IsPodRunningAndReady(pod) && failedAllowed > 0 { - ignorablePodNames.Insert(pod.Name) - failedAllowed-- + + if !utils.IsPodRunningAndReady(pod) { + if utils.IsPodReasonUnschedulable(pod) { + pendingPod = append(pendingPod, pod.Name) + continue + } + + failedPod = append(failedPod, pod.Name) } } + + // prepend pendingPod to failedPod + failedPod = append(pendingPod, failedPod...) 
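+		// Pending (unschedulable) pods are placed ahead of failed pods so that they are ignored
+		// first when fewer ignorable pods are allowed than there are problem pods.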
+ + for podIdx := range failedPod { + if failedAllowed <= 0 { + break + } + + ignorablePodNames.Insert(failedPod[podIdx]) + failedAllowed-- + } } return ignorablePodNames, nil } - func (r *SingleClusterReconciler) getPodsPVCList( podNames []string, rackID int, ) ([]corev1.PersistentVolumeClaim, error) { diff --git a/go.mod b/go.mod index c6ee1681c..87eff97ac 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.19 require ( github.com/aerospike/aerospike-client-go/v6 v6.14.0 - github.com/aerospike/aerospike-management-lib v0.0.0-20231107182540-fef71e1f5946 + github.com/aerospike/aerospike-management-lib v0.0.0-20231129055344-b6aff63f1dbb github.com/asaskevich/govalidator v0.0.0-20210307081110-f21760c49a8d github.com/evanphx/json-patch v4.12.0+incompatible github.com/go-logr/logr v1.2.4 diff --git a/go.sum b/go.sum index 7fb23f6a6..25c3d901b 100644 --- a/go.sum +++ b/go.sum @@ -600,6 +600,8 @@ github.com/aerospike/aerospike-client-go/v6 v6.14.0 h1:Z3FcGWJda1sagzdc6Akz4EJ13 github.com/aerospike/aerospike-client-go/v6 v6.14.0/go.mod h1:/0Wm81GhMqem+9flWcpazPKoRfjFeG6WrQdXGiMNi0A= github.com/aerospike/aerospike-management-lib v0.0.0-20231107182540-fef71e1f5946 h1:wwCzPj4qk4EfdISK6tzNoEwSLg9vbeqBloNmhfB8mNo= github.com/aerospike/aerospike-management-lib v0.0.0-20231107182540-fef71e1f5946/go.mod h1:LPOsGG8okRSH4hN9Y8VXFzsfIpBDj2WKEsI/f6wxwaw= +github.com/aerospike/aerospike-management-lib v0.0.0-20231129055344-b6aff63f1dbb h1:ykX3ElBNT/VOUhw/+5+jiFnWw3LSbPfl6eRrhQzBBFk= +github.com/aerospike/aerospike-management-lib v0.0.0-20231129055344-b6aff63f1dbb/go.mod h1:LPOsGG8okRSH4hN9Y8VXFzsfIpBDj2WKEsI/f6wxwaw= github.com/ajstarks/deck v0.0.0-20200831202436-30c9fc6549a9/go.mod h1:JynElWSGnm/4RlzPXRlREEwqTHAN3T56Bv2ITsFT3gY= github.com/ajstarks/deck/generate v0.0.0-20210309230005-c3f852c02e19/go.mod h1:T13YZdzov6OU0A1+RfKZiZN9ca6VeKdBdyDV+BY97Tk= github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= diff --git a/pkg/utils/pod.go b/pkg/utils/pod.go index 75399f02b..b7c687dd8 100644 --- a/pkg/utils/pod.go +++ b/pkg/utils/pod.go @@ -32,7 +32,7 @@ func CheckPodFailed(pod *corev1.Pod) error { return fmt.Errorf("pod %s has failed status", pod.Name) } - if pod.Status.Phase == corev1.PodPending && isPodReasonUnschedulable(pod) { + if pod.Status.Phase == corev1.PodPending && IsPodReasonUnschedulable(pod) { return fmt.Errorf("pod %s is in unschedulable state", pod.Name) } @@ -210,7 +210,7 @@ func isPodError(reason string) bool { return strings.HasSuffix(reason, "Error") } -func isPodReasonUnschedulable(pod *corev1.Pod) bool { +func IsPodReasonUnschedulable(pod *corev1.Pod) bool { for _, condition := range pod.Status.Conditions { if condition.Type == corev1.PodScheduled && condition.Reason == corev1.PodReasonUnschedulable { return true diff --git a/test/cluster_test.go b/test/cluster_test.go index ea2d0b3cc..6075fb963 100644 --- a/test/cluster_test.go +++ b/test/cluster_test.go @@ -170,7 +170,7 @@ func clusterWithIgnorePodList(ctx goctx.Context) { By("Upgrade version") aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) Expect(err).ToNot(HaveOccurred()) - newImage := baseImage + ":6.4.0.4" + newImage := baseImage + ":7.0.0.0_2" aeroCluster.Spec.Image = newImage err = updateCluster(k8sClient, ctx, aeroCluster) Expect(err).ToNot(HaveOccurred()) From f33cbb1da8d69fac3ef5cdbb7647e5ef49b8cee3 Mon Sep 17 00:00:00 2001 From: Abhisek Dwivedi Date: Wed, 29 Nov 2023 13:40:53 +0530 Subject: [PATCH 08/17] Changed MaxIgnorableFailedPods field type to 
IntOrString --- api/v1/aerospikecluster_types.go | 7 ++- api/v1/aerospikecluster_validating_webhook.go | 44 ++++++++++++------- api/v1/zz_generated.deepcopy.go | 5 +++ .../asdb.aerospike.com_aerospikeclusters.yaml | 22 +++++++++- controllers/pod.go | 9 +++- ..._aerospikeclusters.asdb.aerospike.com.yaml | 22 ++++++++++ 6 files changed, 89 insertions(+), 20 deletions(-) diff --git a/api/v1/aerospikecluster_types.go b/api/v1/aerospikecluster_types.go index 0664fd3f5..1a04b46aa 100644 --- a/api/v1/aerospikecluster_types.go +++ b/api/v1/aerospikecluster_types.go @@ -283,9 +283,12 @@ type RackConfig struct { //nolint:govet // for readability // RollingUpdateBatchSize is the percentage/number of rack pods that will be restarted simultaneously // +optional RollingUpdateBatchSize *intstr.IntOrString `json:"rollingUpdateBatchSize,omitempty"` - + // MaxIgnorableFailedPods is the maximum percentage/number of rack failed pods that will be ignored while assessing + // cluster stability. Failed pods identified using this value are not considered part of the cluster. + // This is particularly useful when there are failed pods and the operator needs to perform certain operations on + // the cluster. // +optional - MaxIgnorableFailedPods int `json:"maxIgnorableFailedPods,omitempty"` + MaxIgnorableFailedPods *intstr.IntOrString `json:"maxIgnorableFailedPods,omitempty"` } // Rack specifies single rack config diff --git a/api/v1/aerospikecluster_validating_webhook.go b/api/v1/aerospikecluster_validating_webhook.go index e98c12f8a..71c14f2d6 100644 --- a/api/v1/aerospikecluster_validating_webhook.go +++ b/api/v1/aerospikecluster_validating_webhook.go @@ -605,24 +605,11 @@ func (c *AerospikeCluster) validateRackConfig(_ logr.Logger) error { // Validate batch upgrade/restart param if c.Spec.RackConfig.RollingUpdateBatchSize != nil { - // Just validate if RollingUpdateBatchSize is valid number or string. - randomNumber := 100 - - count, err := intstr.GetScaledValueFromIntOrPercent( - c.Spec.RackConfig.RollingUpdateBatchSize, randomNumber, false, - ) - if err != nil { + if err := validateIntOrStringField(c.Spec.RackConfig.RollingUpdateBatchSize, + "spec.rackConfig.rollingUpdateBatchSize"); err != nil { return err } - // Only negative is not allowed. Any big number can be given. - if count < 0 { - return fmt.Errorf( - "can not use negative rackConfig.RollingUpdateBatchSize %s", - c.Spec.RackConfig.RollingUpdateBatchSize.String(), - ) - } - if len(c.Spec.RackConfig.Racks) < 2 { return fmt.Errorf("can not use rackConfig.RollingUpdateBatchSize when number of racks is less than two") } @@ -651,6 +638,13 @@ func (c *AerospikeCluster) validateRackConfig(_ logr.Logger) error { } } + // Validate MaxIgnorableFailedPods param + if c.Spec.RackConfig.MaxIgnorableFailedPods != nil { + if err := validateIntOrStringField(c.Spec.RackConfig.MaxIgnorableFailedPods, + "spec.rackConfig.maxIgnorableFailedPods"); err != nil { + return err + } + } // TODO: should not use batch if racks are less than replication-factor return nil } @@ -2176,3 +2170,23 @@ func (c *AerospikeCluster) validateNetworkPolicy(namespace string) error { return nil } + +func validateIntOrStringField(value *intstr.IntOrString, fieldPath string) error { + randomNumber := 100 + // Just validate if value is valid number or string. + count, err := intstr.GetScaledValueFromIntOrPercent(value, randomNumber, false) + if err != nil { + return err + } + + // Only negative is not allowed. Any big number can be given. 
+ if count < 0 { + return fmt.Errorf("can not use negative %s: %s", fieldPath, value.String()) + } + + if value.Type == intstr.String && count > 100 { + return fmt.Errorf("%s: %s must not be greater than 100 percent", fieldPath, value.String()) + } + + return nil +} diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index b1ba3adde..bbbf11e1f 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -858,6 +858,11 @@ func (in *RackConfig) DeepCopyInto(out *RackConfig) { *out = new(intstr.IntOrString) **out = **in } + if in.MaxIgnorableFailedPods != nil { + in, out := &in.MaxIgnorableFailedPods, &out.MaxIgnorableFailedPods + *out = new(intstr.IntOrString) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RackConfig. diff --git a/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml b/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml index 552f56a7a..78842d978 100644 --- a/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml +++ b/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml @@ -4604,7 +4604,16 @@ spec: given configuration properties: maxIgnorableFailedPods: - type: integer + anyOf: + - type: integer + - type: string + description: MaxIgnorableFailedPods is the maximum percentage/number + of rack failed pods that will be ignored while assessing cluster + stability. Failed pods identified using this value are not considered + part of the cluster. This is particularly useful when there + are failed pods and the operator needs to perform certain operations + on the cluster. + x-kubernetes-int-or-string: true namespaces: description: List of Aerospike namespaces for which rack feature will be enabled @@ -13353,7 +13362,16 @@ spec: nullable: true properties: maxIgnorableFailedPods: - type: integer + anyOf: + - type: integer + - type: string + description: MaxIgnorableFailedPods is the maximum percentage/number + of rack failed pods that will be ignored while assessing cluster + stability. Failed pods identified using this value are not considered + part of the cluster. This is particularly useful when there + are failed pods and the operator needs to perform certain operations + on the cluster. 
+ x-kubernetes-int-or-string: true namespaces: description: List of Aerospike namespaces for which rack feature will be enabled diff --git a/controllers/pod.go b/controllers/pod.go index 474e42e65..46f616ceb 100644 --- a/controllers/pod.go +++ b/controllers/pod.go @@ -13,6 +13,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/apimachinery/pkg/util/sets" "sigs.k8s.io/controller-runtime/pkg/client" @@ -671,7 +672,13 @@ func (r *SingleClusterReconciler) getIgnorablePods(racksToDelete []asdbv1.Rack, for idx := range configureRacks { rack := &configureRacks[idx] - failedAllowed := r.aeroCluster.Spec.RackConfig.MaxIgnorableFailedPods + + failedAllowed, err := intstr.GetScaledValueFromIntOrPercent( + r.aeroCluster.Spec.RackConfig.MaxIgnorableFailedPods, rack.Size, false, + ) + if err != nil { + return nil, err + } podList, err := r.getRackPodList(rack.Rack.ID) if err != nil { diff --git a/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml b/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml index 6d64a64b9..78842d978 100644 --- a/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml +++ b/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml @@ -4603,6 +4603,17 @@ spec: Aerospike cluster. Pods will be deployed in given racks based on given configuration properties: + maxIgnorableFailedPods: + anyOf: + - type: integer + - type: string + description: MaxIgnorableFailedPods is the maximum percentage/number + of rack failed pods that will be ignored while assessing cluster + stability. Failed pods identified using this value are not considered + part of the cluster. This is particularly useful when there + are failed pods and the operator needs to perform certain operations + on the cluster. + x-kubernetes-int-or-string: true namespaces: description: List of Aerospike namespaces for which rack feature will be enabled @@ -13350,6 +13361,17 @@ spec: given configuration nullable: true properties: + maxIgnorableFailedPods: + anyOf: + - type: integer + - type: string + description: MaxIgnorableFailedPods is the maximum percentage/number + of rack failed pods that will be ignored while assessing cluster + stability. Failed pods identified using this value are not considered + part of the cluster. This is particularly useful when there + are failed pods and the operator needs to perform certain operations + on the cluster. 
+ x-kubernetes-int-or-string: true namespaces: description: List of Aerospike namespaces for which rack feature will be enabled From 7c274415ca10505fe03dd00a55e566d24bf3fefd Mon Sep 17 00:00:00 2001 From: Abhisek Dwivedi Date: Wed, 29 Nov 2023 13:45:33 +0530 Subject: [PATCH 09/17] Handled nil value --- controllers/pod.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/controllers/pod.go b/controllers/pod.go index 46f616ceb..4a64fb304 100644 --- a/controllers/pod.go +++ b/controllers/pod.go @@ -673,12 +673,9 @@ func (r *SingleClusterReconciler) getIgnorablePods(racksToDelete []asdbv1.Rack, for idx := range configureRacks { rack := &configureRacks[idx] - failedAllowed, err := intstr.GetScaledValueFromIntOrPercent( + failedAllowed, _ := intstr.GetScaledValueFromIntOrPercent( r.aeroCluster.Spec.RackConfig.MaxIgnorableFailedPods, rack.Size, false, ) - if err != nil { - return nil, err - } podList, err := r.getRackPodList(rack.Rack.ID) if err != nil { From 5142aa3f9f677a2f219289e145c38dae996d229c Mon Sep 17 00:00:00 2001 From: Abhisek Dwivedi Date: Thu, 30 Nov 2023 03:14:02 +0530 Subject: [PATCH 10/17] Handled device removal for ignored pods --- api/v1/aerospikecluster_types.go | 38 +++---------------- api/v1/zz_generated.deepcopy.go | 5 --- .../asdb.aerospike.com_aerospikeclusters.yaml | 36 ++++++++---------- controllers/rack.go | 24 ++++++++++++ controllers/reconciler.go | 37 +++++++++++++++++- ..._aerospikeclusters.asdb.aerospike.com.yaml | 36 ++++++++---------- 6 files changed, 97 insertions(+), 79 deletions(-) diff --git a/api/v1/aerospikecluster_types.go b/api/v1/aerospikecluster_types.go index 1a04b46aa..17e19f2bd 100644 --- a/api/v1/aerospikecluster_types.go +++ b/api/v1/aerospikecluster_types.go @@ -74,12 +74,6 @@ type AerospikeClusterSpec struct { //nolint:govet // for readability // RosterNodeBlockList is a list of blocked nodeIDs from roster in a strong-consistency setup // +operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Roster Node BlockList" RosterNodeBlockList []string `json:"rosterNodeBlockList,omitempty"` - // IgnorePodList is a list of pods that the operator will ignore while assessing cluster stability. - // Pods specified in this list are not considered part of the cluster. This is particularly useful when - // there are failed pods and the operator needs to perform certain operations on the cluster. Note that - // running pods included in this list will not be ignored. - // +operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Ignore Pod List" - IgnorePodList []string `json:"ignorePodList,omitempty"` } type SeedsFinderServices struct { @@ -283,10 +277,12 @@ type RackConfig struct { //nolint:govet // for readability // RollingUpdateBatchSize is the percentage/number of rack pods that will be restarted simultaneously // +optional RollingUpdateBatchSize *intstr.IntOrString `json:"rollingUpdateBatchSize,omitempty"` - // MaxIgnorableFailedPods is the maximum percentage/number of rack failed pods that will be ignored while assessing - // cluster stability. Failed pods identified using this value are not considered part of the cluster. - // This is particularly useful when there are failed pods and the operator needs to perform certain operations on - // the cluster. + // MaxIgnorableFailedPods is the maximum percentage/number of rack pods that are in pending state due to scheduling + // issues. They are ignored while assessing cluster stability. 
Failed/pending pods identified using this value are + // not considered part of the cluster. + // This is particularly useful when there are failed/pending pods that cannot be recovered by updating the CR and + // the operator needs to perform certain operations on the cluster like Aerospike config change. + // Reset this value to 0 after the deployment is done, to avoid unintended consequences. // +optional MaxIgnorableFailedPods *intstr.IntOrString `json:"maxIgnorableFailedPods,omitempty"` } @@ -955,17 +951,6 @@ func CopySpecToStatus(spec *AerospikeClusterSpec) (*AerospikeClusterStatusSpec, status.RosterNodeBlockList = rosterNodeBlockList } - // IgnorePodList - if len(spec.IgnorePodList) != 0 { - var ignorePodList []string - - lib.DeepCopy( - &ignorePodList, &spec.IgnorePodList, - ) - - status.IgnorePodList = ignorePodList - } - return &status, nil } @@ -1056,16 +1041,5 @@ func CopyStatusToSpec(status *AerospikeClusterStatusSpec) (*AerospikeClusterSpec spec.RosterNodeBlockList = rosterNodeBlockList } - // IgnorePodList - if len(status.IgnorePodList) != 0 { - var ignorePodList []string - - lib.DeepCopy( - &ignorePodList, &status.IgnorePodList, - ) - - spec.IgnorePodList = ignorePodList - } - return &spec, nil } diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index bbbf11e1f..892f93d8b 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -183,11 +183,6 @@ func (in *AerospikeClusterSpec) DeepCopyInto(out *AerospikeClusterSpec) { *out = make([]string, len(*in)) copy(*out, *in) } - if in.IgnorePodList != nil { - in, out := &in.IgnorePodList, &out.IgnorePodList - *out = make([]string, len(*in)) - copy(*out, *in) - } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AerospikeClusterSpec. diff --git a/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml b/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml index 78842d978..dda91cbf1 100644 --- a/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml +++ b/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml @@ -277,16 +277,6 @@ spec: - customInterface type: string type: object - ignorePodList: - description: IgnorePodList is a list of pods that the operator will - ignore while assessing cluster stability. Pods specified in this - list are not considered part of the cluster. This is particularly - useful when there are failed pods and the operator needs to perform - certain operations on the cluster. Note that running pods included - in this list will not be ignored. - items: - type: string - type: array image: description: Aerospike server image type: string @@ -4608,11 +4598,14 @@ spec: - type: integer - type: string description: MaxIgnorableFailedPods is the maximum percentage/number - of rack failed pods that will be ignored while assessing cluster - stability. Failed pods identified using this value are not considered - part of the cluster. This is particularly useful when there - are failed pods and the operator needs to perform certain operations - on the cluster. + of rack pods that are in pending state due to scheduling issues. + They are ignored while assessing cluster stability. Failed/pending + pods identified using this value are not considered part of + the cluster. This is particularly useful when there are failed/pending + pods that cannot be recovered by updating the CR and the operator + needs to perform certain operations on the cluster like Aerospike + config change. 
Reset this value to 0 after the deployment is + done, to avoid unintended consequences. x-kubernetes-int-or-string: true namespaces: description: List of Aerospike namespaces for which rack feature @@ -13366,11 +13359,14 @@ spec: - type: integer - type: string description: MaxIgnorableFailedPods is the maximum percentage/number - of rack failed pods that will be ignored while assessing cluster - stability. Failed pods identified using this value are not considered - part of the cluster. This is particularly useful when there - are failed pods and the operator needs to perform certain operations - on the cluster. + of rack pods that are in pending state due to scheduling issues. + They are ignored while assessing cluster stability. Failed/pending + pods identified using this value are not considered part of + the cluster. This is particularly useful when there are failed/pending + pods that cannot be recovered by updating the CR and the operator + needs to perform certain operations on the cluster like Aerospike + config change. Reset this value to 0 after the deployment is + done, to avoid unintended consequences. x-kubernetes-int-or-string: true namespaces: description: List of Aerospike namespaces for which rack feature diff --git a/controllers/rack.go b/controllers/rack.go index aa6f0670e..d2b6d5703 100644 --- a/controllers/rack.go +++ b/controllers/rack.go @@ -419,6 +419,30 @@ func (r *SingleClusterReconciler) upgradeOrRollingRestartRack(found *appsv1.Stat } } + podList, err := r.getOrderedRackPodList(rackState.Rack.ID) + if err != nil { + return found, reconcileError(fmt.Errorf("failed to list pods: %v", err)) + } + + // Filter ignoredPods to update their dirtyVolumes in the status. + // IgnoredPods are skipped from upgrade/rolling restart, and as a result in case of device removal, dirtyVolumes + // are not updated in their pod status. This makes devices un-reusable as they cannot be cleaned up during init phase. + // So, explicitly add dirtyVolumes for ignoredPods, so that they can be cleaned in the init phase. + var ignoredPod []*corev1.Pod + + for idx := range podList { + pod := podList[idx] + if ignorablePodNames.Has(pod.Name) { + ignoredPod = append(ignoredPod, pod) + } + } + + if len(ignoredPod) > 0 { + if err := r.handleNSOrDeviceRemoval(rackState, ignoredPod); err != nil { + return found, reconcileError(err) + } + } + return found, reconcileSuccess() } diff --git a/controllers/reconciler.go b/controllers/reconciler.go index 7d97e0437..bc8fbf3fd 100644 --- a/controllers/reconciler.go +++ b/controllers/reconciler.go @@ -179,8 +179,8 @@ func (r *SingleClusterReconciler) Reconcile() (ctrl.Result, error) { // Use policy from spec after setting up access control policy := r.getClientPolicy() - // revert migrate-fill-delay to original value if it was set to 0 during scale down - // Passing first rack from the list as all the racks will have same migrate-fill-delay + // Revert migrate-fill-delay to original value if it was set to 0 during scale down. 
+ // Passing the first rack from the list as all the racks will have the same migrate-fill-delay // Redundant safe check to revert migrate-fill-delay if previous revert operation missed/skipped somehow if res := r.setMigrateFillDelay( policy, &r.aeroCluster.Spec.RackConfig.Racks[0].AerospikeConfig, @@ -216,6 +216,39 @@ func (r *SingleClusterReconciler) Reconcile() (ctrl.Result, error) { return reconcile.Result{}, err } + podList, gErr := r.getClusterPodList() + if gErr != nil { + r.Log.Error(gErr, "Failed to get cluster pod list") + return reconcile.Result{}, gErr + } + + r.Log.Info("Try to recover failed/pending pods if any") + + var anyPodFailed bool + // Try to recover failed/pending pods by deleting them + for idx := range podList.Items { + if cErr := utils.CheckPodFailed(&podList.Items[idx]); cErr != nil { + anyPodFailed = true + + if err := r.createOrUpdatePodServiceIfNeeded([]string{podList.Items[idx].Name}); err != nil { + return reconcile.Result{}, err + } + + if err := r.Client.Delete(context.TODO(), &podList.Items[idx]); err != nil { + r.Log.Error(err, "Failed to delete pod", "pod", podList.Items[idx].Name) + return reconcile.Result{}, err + } + + r.Log.Info("Deleted pod", "pod", podList.Items[idx].Name) + } + } + + if anyPodFailed { + return reconcile.Result{Requeue: true}, nil + } + + r.Log.Info("Reconcile completed successfully") + return reconcile.Result{}, nil } diff --git a/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml b/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml index 78842d978..dda91cbf1 100644 --- a/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml +++ b/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml @@ -277,16 +277,6 @@ spec: - customInterface type: string type: object - ignorePodList: - description: IgnorePodList is a list of pods that the operator will - ignore while assessing cluster stability. Pods specified in this - list are not considered part of the cluster. This is particularly - useful when there are failed pods and the operator needs to perform - certain operations on the cluster. Note that running pods included - in this list will not be ignored. - items: - type: string - type: array image: description: Aerospike server image type: string @@ -4608,11 +4598,14 @@ spec: - type: integer - type: string description: MaxIgnorableFailedPods is the maximum percentage/number - of rack failed pods that will be ignored while assessing cluster - stability. Failed pods identified using this value are not considered - part of the cluster. This is particularly useful when there - are failed pods and the operator needs to perform certain operations - on the cluster. + of rack pods that are in pending state due to scheduling issues. + They are ignored while assessing cluster stability. Failed/pending + pods identified using this value are not considered part of + the cluster. This is particularly useful when there are failed/pending + pods that cannot be recovered by updating the CR and the operator + needs to perform certain operations on the cluster like Aerospike + config change. Reset this value to 0 after the deployment is + done, to avoid unintended consequences. 
x-kubernetes-int-or-string: true namespaces: description: List of Aerospike namespaces for which rack feature @@ -13366,11 +13359,14 @@ spec: - type: integer - type: string description: MaxIgnorableFailedPods is the maximum percentage/number - of rack failed pods that will be ignored while assessing cluster - stability. Failed pods identified using this value are not considered - part of the cluster. This is particularly useful when there - are failed pods and the operator needs to perform certain operations - on the cluster. + of rack pods that are in pending state due to scheduling issues. + They are ignored while assessing cluster stability. Failed/pending + pods identified using this value are not considered part of + the cluster. This is particularly useful when there are failed/pending + pods that cannot be recovered by updating the CR and the operator + needs to perform certain operations on the cluster like Aerospike + config change. Reset this value to 0 after the deployment is + done, to avoid unintended consequences. x-kubernetes-int-or-string: true namespaces: description: List of Aerospike namespaces for which rack feature From 64a0c57fc448ad1c828783c75eeeafeeb28f6f8b Mon Sep 17 00:00:00 2001 From: Abhisek Dwivedi Date: Thu, 30 Nov 2023 09:58:14 +0530 Subject: [PATCH 11/17] Fixed go-lint --- api/v1/aerospikecluster_types.go | 5 ----- api/v1/zz_generated.deepcopy.go | 5 ----- .../bases/asdb.aerospike.com_aerospikeclusters.yaml | 10 ---------- ...efinition_aerospikeclusters.asdb.aerospike.com.yaml | 10 ---------- test/cluster_test.go | 9 ++++++--- 5 files changed, 6 insertions(+), 33 deletions(-) diff --git a/api/v1/aerospikecluster_types.go b/api/v1/aerospikecluster_types.go index 17e19f2bd..60ef8f55b 100644 --- a/api/v1/aerospikecluster_types.go +++ b/api/v1/aerospikecluster_types.go @@ -624,11 +624,6 @@ type AerospikeClusterStatusSpec struct { //nolint:govet // for readability SeedsFinderServices SeedsFinderServices `json:"seedsFinderServices,omitempty"` // RosterNodeBlockList is a list of blocked nodeIDs from roster in a strong-consistency setup RosterNodeBlockList []string `json:"rosterNodeBlockList,omitempty"` - // IgnorePodList is a list of pods that the operator will ignore while assessing cluster stability. - // Pods specified in this list are not considered part of the cluster. This is particularly useful when - // there are failed pods and the operator needs to perform certain operations on the cluster. Note that - // running pods included in this list will not be ignored. - IgnorePodList []string `json:"ignorePodList,omitempty"` } // AerospikeClusterStatus defines the observed state of AerospikeCluster diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 892f93d8b..76d5c6591 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -255,11 +255,6 @@ func (in *AerospikeClusterStatusSpec) DeepCopyInto(out *AerospikeClusterStatusSp *out = make([]string, len(*in)) copy(*out, *in) } - if in.IgnorePodList != nil { - in, out := &in.IgnorePodList, &out.IgnorePodList - *out = make([]string, len(*in)) - copy(*out, *in) - } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AerospikeClusterStatusSpec. 
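For illustration only (an assumed example, not part of the generated CRD changes in this patch): with the rack-level field validated above, a user could let the operator ignore at most one failed or pending pod per rack, or a percentage of the rack size, roughly like this:

    spec:
      rackConfig:
        # integer count, or a percentage string such as "20%" scaled against the rack size
        maxIgnorableFailedPods: 1
        racks:
          - id: 1
          - id: 2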
diff --git a/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml b/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml index dda91cbf1..ca804bf58 100644 --- a/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml +++ b/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml @@ -8900,16 +8900,6 @@ spec: - customInterface type: string type: object - ignorePodList: - description: IgnorePodList is a list of pods that the operator will - ignore while assessing cluster stability. Pods specified in this - list are not considered part of the cluster. This is particularly - useful when there are failed pods and the operator needs to perform - certain operations on the cluster. Note that running pods included - in this list will not be ignored. - items: - type: string - type: array image: description: Aerospike server image type: string diff --git a/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml b/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml index dda91cbf1..ca804bf58 100644 --- a/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml +++ b/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml @@ -8900,16 +8900,6 @@ spec: - customInterface type: string type: object - ignorePodList: - description: IgnorePodList is a list of pods that the operator will - ignore while assessing cluster stability. Pods specified in this - list are not considered part of the cluster. This is particularly - useful when there are failed pods and the operator needs to perform - certain operations on the cluster. Note that running pods included - in this list will not be ignored. 
- items: - type: string - type: array image: description: Aerospike server image type: string diff --git a/test/cluster_test.go b/test/cluster_test.go index 6075fb963..57ee9e823 100644 --- a/test/cluster_test.go +++ b/test/cluster_test.go @@ -9,6 +9,7 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" asdbv1 "github.com/aerospike/aerospike-kubernetes-operator/api/v1" ) @@ -155,7 +156,8 @@ func clusterWithIgnorePodList(ctx goctx.Context) { By("Set IgnorePodList and scale down 1 pod") aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) Expect(err).ToNot(HaveOccurred()) - aeroCluster.Spec.IgnorePodList = []string{ignorePodName} + val := intstr.FromInt(1) + aeroCluster.Spec.RackConfig.MaxIgnorableFailedPods = &val aeroCluster.Spec.Size-- err = updateCluster(k8sClient, ctx, aeroCluster) Expect(err).ToNot(HaveOccurred()) @@ -193,7 +195,7 @@ func clusterWithIgnorePodList(ctx goctx.Context) { "Remove pod from IgnorePodList and verify pod %s is in running state", ignorePodName)) aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) Expect(err).ToNot(HaveOccurred()) - aeroCluster.Spec.IgnorePodList = []string{} + aeroCluster.Spec.RackConfig.MaxIgnorableFailedPods = nil err = updateCluster(k8sClient, ctx, aeroCluster) Expect(err).ToNot(HaveOccurred()) @@ -254,7 +256,8 @@ func clusterWithIgnorePodList(ctx goctx.Context) { By("Delete rack with id 2") aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) Expect(err).ToNot(HaveOccurred()) - aeroCluster.Spec.IgnorePodList = []string{ignorePodName} + val := intstr.FromInt(1) + aeroCluster.Spec.RackConfig.MaxIgnorableFailedPods = &val aeroCluster.Spec.RackConfig = asdbv1.RackConfig{Racks: getDummyRackConf(1)} err = updateCluster(k8sClient, ctx, aeroCluster) Expect(err).ToNot(HaveOccurred()) From 5850431e0a2fd8e115961ee967cd1c5d802aaaa9 Mon Sep 17 00:00:00 2001 From: Abhisek Dwivedi Date: Thu, 30 Nov 2023 21:28:15 +0530 Subject: [PATCH 12/17] Added/modified test-cases --- api/v1/aerospikecluster_types.go | 17 +- api/v1/aerospikecluster_validating_webhook.go | 8 +- api/v1/zz_generated.deepcopy.go | 4 +- .../asdb.aerospike.com_aerospikeclusters.yaml | 48 ++-- ...rnetes-operator.clusterserviceversion.yaml | 7 - controllers/pod.go | 4 +- controllers/rack.go | 37 ++-- controllers/reconciler.go | 48 ++-- ..._aerospikeclusters.asdb.aerospike.com.yaml | 48 ++-- test/aero_info.go | 34 +++ test/cluster_helper.go | 22 ++ test/cluster_test.go | 205 +++++++++++++----- 12 files changed, 320 insertions(+), 162 deletions(-) diff --git a/api/v1/aerospikecluster_types.go b/api/v1/aerospikecluster_types.go index 60ef8f55b..9346610f5 100644 --- a/api/v1/aerospikecluster_types.go +++ b/api/v1/aerospikecluster_types.go @@ -277,14 +277,17 @@ type RackConfig struct { //nolint:govet // for readability // RollingUpdateBatchSize is the percentage/number of rack pods that will be restarted simultaneously // +optional RollingUpdateBatchSize *intstr.IntOrString `json:"rollingUpdateBatchSize,omitempty"` - // MaxIgnorableFailedPods is the maximum percentage/number of rack pods that are in pending state due to scheduling - // issues. They are ignored while assessing cluster stability. Failed/pending pods identified using this value are - // not considered part of the cluster. 
- // This is particularly useful when there are failed/pending pods that cannot be recovered by updating the CR and - // the operator needs to perform certain operations on the cluster like Aerospike config change. - // Reset this value to 0 after the deployment is done, to avoid unintended consequences. + // MaxIgnorablePods is the maximum number/percentage of pending/failed pods in a rack that are ignored while + // assessing cluster stability. Pods identified using this value are not considered part of the cluster. + // Additionally, in SC mode clusters, these pods are removed from the roster. + // This is particularly useful when some pods are stuck in pending/failed state due to any scheduling issues and + // cannot be fixed by simply updating the CR. + // It enables the operator to perform specific operations on the cluster, like changing Aerospike configurations, + // without being hindered by these problematic pods. + // Remember to set MaxIgnorablePods back to 0 once the required operation is done. + // This makes sure that later on, all pods are properly counted when evaluating the cluster stability. // +optional - MaxIgnorableFailedPods *intstr.IntOrString `json:"maxIgnorableFailedPods,omitempty"` + MaxIgnorablePods *intstr.IntOrString `json:"maxIgnorablePods,omitempty"` } // Rack specifies single rack config diff --git a/api/v1/aerospikecluster_validating_webhook.go b/api/v1/aerospikecluster_validating_webhook.go index 71c14f2d6..96d8e9454 100644 --- a/api/v1/aerospikecluster_validating_webhook.go +++ b/api/v1/aerospikecluster_validating_webhook.go @@ -638,10 +638,10 @@ func (c *AerospikeCluster) validateRackConfig(_ logr.Logger) error { } } - // Validate MaxIgnorableFailedPods param - if c.Spec.RackConfig.MaxIgnorableFailedPods != nil { - if err := validateIntOrStringField(c.Spec.RackConfig.MaxIgnorableFailedPods, - "spec.rackConfig.maxIgnorableFailedPods"); err != nil { + // Validate MaxIgnorablePods param + if c.Spec.RackConfig.MaxIgnorablePods != nil { + if err := validateIntOrStringField(c.Spec.RackConfig.MaxIgnorablePods, + "spec.rackConfig.maxIgnorablePods"); err != nil { return err } } diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 76d5c6591..6696bf022 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -848,8 +848,8 @@ func (in *RackConfig) DeepCopyInto(out *RackConfig) { *out = new(intstr.IntOrString) **out = **in } - if in.MaxIgnorableFailedPods != nil { - in, out := &in.MaxIgnorableFailedPods, &out.MaxIgnorableFailedPods + if in.MaxIgnorablePods != nil { + in, out := &in.MaxIgnorablePods, &out.MaxIgnorablePods *out = new(intstr.IntOrString) **out = **in } diff --git a/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml b/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml index ca804bf58..266989bf2 100644 --- a/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml +++ b/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml @@ -4593,19 +4593,23 @@ spec: Aerospike cluster. Pods will be deployed in given racks based on given configuration properties: - maxIgnorableFailedPods: + maxIgnorablePods: anyOf: - type: integer - type: string - description: MaxIgnorableFailedPods is the maximum percentage/number - of rack pods that are in pending state due to scheduling issues. - They are ignored while assessing cluster stability. Failed/pending - pods identified using this value are not considered part of - the cluster. 
This is particularly useful when there are failed/pending - pods that cannot be recovered by updating the CR and the operator - needs to perform certain operations on the cluster like Aerospike - config change. Reset this value to 0 after the deployment is - done, to avoid unintended consequences. + description: MaxIgnorablePods is the maximum number/percentage + of pending/failed pods in a rack that are ignored while assessing + cluster stability. Pods identified using this value are not + considered part of the cluster. Additionally, in SC mode clusters, + these pods are removed from the roster. This is particularly + useful when some pods are stuck in pending/failed state due + to any scheduling issues and cannot be fixed by simply updating + the CR. It enables the operator to perform specific operations + on the cluster, like changing Aerospike configurations, without + being hindered by these problematic pods. Remember to set MaxIgnorablePods + back to 0 once the required operation is done. This makes sure + that later on, all pods are properly counted when evaluating + the cluster stability. x-kubernetes-int-or-string: true namespaces: description: List of Aerospike namespaces for which rack feature @@ -13344,19 +13348,23 @@ spec: given configuration nullable: true properties: - maxIgnorableFailedPods: + maxIgnorablePods: anyOf: - type: integer - type: string - description: MaxIgnorableFailedPods is the maximum percentage/number - of rack pods that are in pending state due to scheduling issues. - They are ignored while assessing cluster stability. Failed/pending - pods identified using this value are not considered part of - the cluster. This is particularly useful when there are failed/pending - pods that cannot be recovered by updating the CR and the operator - needs to perform certain operations on the cluster like Aerospike - config change. Reset this value to 0 after the deployment is - done, to avoid unintended consequences. + description: MaxIgnorablePods is the maximum number/percentage + of pending/failed pods in a rack that are ignored while assessing + cluster stability. Pods identified using this value are not + considered part of the cluster. Additionally, in SC mode clusters, + these pods are removed from the roster. This is particularly + useful when some pods are stuck in pending/failed state due + to any scheduling issues and cannot be fixed by simply updating + the CR. It enables the operator to perform specific operations + on the cluster, like changing Aerospike configurations, without + being hindered by these problematic pods. Remember to set MaxIgnorablePods + back to 0 once the required operation is done. This makes sure + that later on, all pods are properly counted when evaluating + the cluster stability. x-kubernetes-int-or-string: true namespaces: description: List of Aerospike namespaces for which rack feature diff --git a/config/manifests/bases/aerospike-kubernetes-operator.clusterserviceversion.yaml b/config/manifests/bases/aerospike-kubernetes-operator.clusterserviceversion.yaml index 1a03a8841..ed26895a5 100644 --- a/config/manifests/bases/aerospike-kubernetes-operator.clusterserviceversion.yaml +++ b/config/manifests/bases/aerospike-kubernetes-operator.clusterserviceversion.yaml @@ -47,13 +47,6 @@ spec: the Aerospike cluster. displayName: Aerospike Network Policy path: aerospikeNetworkPolicy - - description: IgnorePodList is a list of pods that the operator will ignore - while assessing cluster stability. 
Pods specified in this list are not considered - part of the cluster. This is particularly useful when there are failed pods - and the operator needs to perform certain operations on the cluster. Note - that running pods included in this list will not be ignored. - displayName: Ignore Pod List - path: ignorePodList - description: Aerospike server image displayName: Server Image path: image diff --git a/controllers/pod.go b/controllers/pod.go index 4a64fb304..3fb2f840e 100644 --- a/controllers/pod.go +++ b/controllers/pod.go @@ -650,7 +650,7 @@ func (r *SingleClusterReconciler) cleanupDanglingPodsRack(sts *appsv1.StatefulSe // getIgnorablePods returns pods: // 1. From racksToDelete that are currently not running and can be ignored in stability checks. -// 2. User given pods in ignorePodList that are currently not running and can be ignored from stability checks. +// 2. Failed/pending pods identified using maxIgnorablePods field and can be ignored from stability checks. func (r *SingleClusterReconciler) getIgnorablePods(racksToDelete []asdbv1.Rack, configureRacks []RackState) ( sets.Set[string], error, ) { @@ -674,7 +674,7 @@ func (r *SingleClusterReconciler) getIgnorablePods(racksToDelete []asdbv1.Rack, rack := &configureRacks[idx] failedAllowed, _ := intstr.GetScaledValueFromIntOrPercent( - r.aeroCluster.Spec.RackConfig.MaxIgnorableFailedPods, rack.Size, false, + r.aeroCluster.Spec.RackConfig.MaxIgnorablePods, rack.Size, false, ) podList, err := r.getRackPodList(rack.Rack.ID) diff --git a/controllers/rack.go b/controllers/rack.go index d2b6d5703..e03029ec0 100644 --- a/controllers/rack.go +++ b/controllers/rack.go @@ -419,27 +419,28 @@ func (r *SingleClusterReconciler) upgradeOrRollingRestartRack(found *appsv1.Stat } } - podList, err := r.getOrderedRackPodList(rackState.Rack.ID) - if err != nil { - return found, reconcileError(fmt.Errorf("failed to list pods: %v", err)) - } - - // Filter ignoredPods to update their dirtyVolumes in the status. - // IgnoredPods are skipped from upgrade/rolling restart, and as a result in case of device removal, dirtyVolumes - // are not updated in their pod status. This makes devices un-reusable as they cannot be cleaned up during init phase. - // So, explicitly add dirtyVolumes for ignoredPods, so that they can be cleaned in the init phase. - var ignoredPod []*corev1.Pod + if r.aeroCluster.Spec.RackConfig.MaxIgnorablePods != nil { + podList, err := r.getOrderedRackPodList(rackState.Rack.ID) + if err != nil { + return found, reconcileError(fmt.Errorf("failed to list pods: %v", err)) + } + // Filter ignoredPods to update their dirtyVolumes in the status. + // IgnoredPods are skipped from upgrade/rolling restart, and as a result in case of device removal, dirtyVolumes + // are not updated in their pod status. This makes devices un-reusable as they cannot be cleaned up during init phase. + // So, explicitly add dirtyVolumes for ignoredPods, so that they can be cleaned in the init phase. 
+ var ignoredPod []*corev1.Pod - for idx := range podList { - pod := podList[idx] - if ignorablePodNames.Has(pod.Name) { - ignoredPod = append(ignoredPod, pod) + for idx := range podList { + pod := podList[idx] + if ignorablePodNames.Has(pod.Name) { + ignoredPod = append(ignoredPod, pod) + } } - } - if len(ignoredPod) > 0 { - if err := r.handleNSOrDeviceRemoval(rackState, ignoredPod); err != nil { - return found, reconcileError(err) + if len(ignoredPod) > 0 { + if err := r.handleNSOrDeviceRemoval(rackState, ignoredPod); err != nil { + return found, reconcileError(err) + } } } diff --git a/controllers/reconciler.go b/controllers/reconciler.go index bc8fbf3fd..4b1df97a9 100644 --- a/controllers/reconciler.go +++ b/controllers/reconciler.go @@ -216,35 +216,39 @@ func (r *SingleClusterReconciler) Reconcile() (ctrl.Result, error) { return reconcile.Result{}, err } - podList, gErr := r.getClusterPodList() - if gErr != nil { - r.Log.Error(gErr, "Failed to get cluster pod list") - return reconcile.Result{}, gErr - } + // Try to recover pods only when MaxIgnorablePods is set + if r.aeroCluster.Spec.RackConfig.MaxIgnorablePods != nil { + podList, gErr := r.getClusterPodList() + if gErr != nil { + r.Log.Error(gErr, "Failed to get cluster pod list") + return reconcile.Result{}, gErr + } - r.Log.Info("Try to recover failed/pending pods if any") + r.Log.Info("Try to recover failed/pending pods if any") - var anyPodFailed bool - // Try to recover failed/pending pods by deleting them - for idx := range podList.Items { - if cErr := utils.CheckPodFailed(&podList.Items[idx]); cErr != nil { - anyPodFailed = true + var anyPodFailed bool + // Try to recover failed/pending pods by deleting them + for idx := range podList.Items { + if cErr := utils.CheckPodFailed(&podList.Items[idx]); cErr != nil { + anyPodFailed = true - if err := r.createOrUpdatePodServiceIfNeeded([]string{podList.Items[idx].Name}); err != nil { - return reconcile.Result{}, err - } + if err := r.createOrUpdatePodServiceIfNeeded([]string{podList.Items[idx].Name}); err != nil { + return reconcile.Result{}, err + } - if err := r.Client.Delete(context.TODO(), &podList.Items[idx]); err != nil { - r.Log.Error(err, "Failed to delete pod", "pod", podList.Items[idx].Name) - return reconcile.Result{}, err - } + if err := r.Client.Delete(context.TODO(), &podList.Items[idx]); err != nil { + r.Log.Error(err, "Failed to delete pod", "pod", podList.Items[idx].Name) + return reconcile.Result{}, err + } - r.Log.Info("Deleted pod", "pod", podList.Items[idx].Name) + r.Log.Info("Deleted pod", "pod", podList.Items[idx].Name) + } } - } - if anyPodFailed { - return reconcile.Result{Requeue: true}, nil + if anyPodFailed { + r.Log.Info("Found failed/pending pod(s), requeuing") + return reconcile.Result{Requeue: true}, nil + } } r.Log.Info("Reconcile completed successfully") diff --git a/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml b/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml index ca804bf58..266989bf2 100644 --- a/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml +++ b/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml @@ -4593,19 +4593,23 @@ spec: Aerospike cluster. 
Pods will be deployed in given racks based on given configuration properties: - maxIgnorableFailedPods: + maxIgnorablePods: anyOf: - type: integer - type: string - description: MaxIgnorableFailedPods is the maximum percentage/number - of rack pods that are in pending state due to scheduling issues. - They are ignored while assessing cluster stability. Failed/pending - pods identified using this value are not considered part of - the cluster. This is particularly useful when there are failed/pending - pods that cannot be recovered by updating the CR and the operator - needs to perform certain operations on the cluster like Aerospike - config change. Reset this value to 0 after the deployment is - done, to avoid unintended consequences. + description: MaxIgnorablePods is the maximum number/percentage + of pending/failed pods in a rack that are ignored while assessing + cluster stability. Pods identified using this value are not + considered part of the cluster. Additionally, in SC mode clusters, + these pods are removed from the roster. This is particularly + useful when some pods are stuck in pending/failed state due + to any scheduling issues and cannot be fixed by simply updating + the CR. It enables the operator to perform specific operations + on the cluster, like changing Aerospike configurations, without + being hindered by these problematic pods. Remember to set MaxIgnorablePods + back to 0 once the required operation is done. This makes sure + that later on, all pods are properly counted when evaluating + the cluster stability. x-kubernetes-int-or-string: true namespaces: description: List of Aerospike namespaces for which rack feature @@ -13344,19 +13348,23 @@ spec: given configuration nullable: true properties: - maxIgnorableFailedPods: + maxIgnorablePods: anyOf: - type: integer - type: string - description: MaxIgnorableFailedPods is the maximum percentage/number - of rack pods that are in pending state due to scheduling issues. - They are ignored while assessing cluster stability. Failed/pending - pods identified using this value are not considered part of - the cluster. This is particularly useful when there are failed/pending - pods that cannot be recovered by updating the CR and the operator - needs to perform certain operations on the cluster like Aerospike - config change. Reset this value to 0 after the deployment is - done, to avoid unintended consequences. + description: MaxIgnorablePods is the maximum number/percentage + of pending/failed pods in a rack that are ignored while assessing + cluster stability. Pods identified using this value are not + considered part of the cluster. Additionally, in SC mode clusters, + these pods are removed from the roster. This is particularly + useful when some pods are stuck in pending/failed state due + to any scheduling issues and cannot be fixed by simply updating + the CR. It enables the operator to perform specific operations + on the cluster, like changing Aerospike configurations, without + being hindered by these problematic pods. Remember to set MaxIgnorablePods + back to 0 once the required operation is done. This makes sure + that later on, all pods are properly counted when evaluating + the cluster stability. 
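                      As an illustrative sketch only (not generated CRD content): given the
                      spec.rackConfig.maxIgnorablePods path validated in the webhook above and its
                      IntOrString type, a user's AerospikeCluster CR could opt into this behaviour
                      roughly as below. The cluster name and rack ids are placeholders, and required
                      fields such as image, aerospikeConfig and storage are omitted for brevity.

                        apiVersion: asdb.aerospike.com/v1
                        kind: AerospikeCluster
                        metadata:
                          name: ignore-pod-cluster        # placeholder; same name as used in the tests
                        spec:
                          size: 4
                          rackConfig:
                            racks:
                              - id: 1
                              - id: 2
                            maxIgnorablePods: 1           # absolute count; a percentage string such as "20%"
                                                          # is assumed to work via IntOrString scaling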
x-kubernetes-int-or-string: true namespaces: description: List of Aerospike namespaces for which rack feature diff --git a/test/aero_info.go b/test/aero_info.go index 4156a99a6..e7092e019 100644 --- a/test/aero_info.go +++ b/test/aero_info.go @@ -288,6 +288,40 @@ func getNodeList(ctx goctx.Context, k8sClient client.Client) ( return nodeList, nil } +func cordonNodes(ctx goctx.Context, k8sClient client.Client, nodes []corev1.Node) error { + for idx := range nodes { + // fetch the latest node object to avoid object conflict + if err := k8sClient.Get(ctx, types.NamespacedName{Name: nodes[idx].Name}, &nodes[idx]); err != nil { + return err + } + + nodes[idx].Spec.Unschedulable = true + + if err := k8sClient.Update(ctx, &nodes[idx]); err != nil { + return err + } + } + + return nil +} + +func uncordonNodes(ctx goctx.Context, k8sClient client.Client, nodes []corev1.Node) error { + for idx := range nodes { + // fetch the latest node object to avoid object conflict + if err := k8sClient.Get(ctx, types.NamespacedName{Name: nodes[idx].Name}, &nodes[idx]); err != nil { + return err + } + + nodes[idx].Spec.Unschedulable = false + + if err := k8sClient.Update(ctx, &nodes[idx]); err != nil { + return err + } + } + + return nil +} + func getZones(ctx goctx.Context, k8sClient client.Client) ([]string, error) { unqZones := map[string]int{} diff --git a/test/cluster_helper.go b/test/cluster_helper.go index 599b5428a..b1263a8c7 100644 --- a/test/cluster_helper.go +++ b/test/cluster_helper.go @@ -4,6 +4,7 @@ import ( goctx "context" "errors" "fmt" + "reflect" "strconv" "time" @@ -541,6 +542,27 @@ func validateMigrateFillDelay( return err } +func validateDirtyVolumes( + ctx goctx.Context, k8sClient client.Client, + clusterNamespacedName types.NamespacedName, expectedVolumes []string, +) error { + aeroCluster, err := getCluster(k8sClient, ctx, clusterNamespacedName) + if err != nil { + return err + } + + for podName := range aeroCluster.Status.Pods { + if !reflect.DeepEqual(aeroCluster.Status.Pods[podName].DirtyVolumes, expectedVolumes) { + return fmt.Errorf( + "dirtyVolumes mismatch, expected: %v, found %v", expectedVolumes, + aeroCluster.Status.Pods[podName].DirtyVolumes, + ) + } + } + + return nil +} + func upgradeClusterTest( k8sClient client.Client, ctx goctx.Context, clusterNamespacedName types.NamespacedName, image string, diff --git a/test/cluster_test.go b/test/cluster_test.go index 57ee9e823..174b3523a 100644 --- a/test/cluster_test.go +++ b/test/cluster_test.go @@ -3,6 +3,7 @@ package test import ( goctx "context" "fmt" + "time" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" @@ -12,6 +13,7 @@ import ( "k8s.io/apimachinery/pkg/util/intstr" asdbv1 "github.com/aerospike/aerospike-kubernetes-operator/api/v1" + "github.com/aerospike/aerospike-kubernetes-operator/pkg/utils" ) var _ = Describe( @@ -47,8 +49,8 @@ var _ = Describe( // }, // ) Context( - "DeployClusterWithIgnorePodList", func() { - clusterWithIgnorePodList(ctx) + "DeployClusterWithMaxIgnorablePod", func() { + clusterWithMaxIgnorablePod(ctx) }, ) Context( @@ -130,9 +132,9 @@ func ScaleDownWithMigrateFillDelay(ctx goctx.Context) { ) } -func clusterWithIgnorePodList(ctx goctx.Context) { +func clusterWithMaxIgnorablePod(ctx goctx.Context) { Context( - "UpdateClusterWithIgnorePodList", func() { + "UpdateClusterWithMaxIgnorablePodAndPendingPod", func() { clusterNamespacedName := getNamespacedName( "ignore-pod-cluster", namespace, ) @@ -140,31 +142,80 @@ func clusterWithIgnorePodList(ctx goctx.Context) { var ( aeroCluster *asdbv1.AerospikeCluster err error + nodeList = &v1.NodeList{} + podList = &v1.PodList{} + nodeToDrain int + ) - testClusterLifecycle = func(ignorePodName string) { - By(fmt.Sprintf("Fail %s aerospike pod", ignorePodName)) - pod := &v1.Pod{} + BeforeEach( + func() { + nodeList, err = getNodeList(ctx, k8sClient) + Expect(err).ToNot(HaveOccurred()) + nodeToDrain = len(nodeList.Items) / 2 + size := len(nodeList.Items) - nodeToDrain - err = k8sClient.Get(ctx, types.NamespacedName{Name: ignorePodName, - Namespace: clusterNamespacedName.Namespace}, pod) + err = cordonNodes(ctx, k8sClient, nodeList.Items[:nodeToDrain]) Expect(err).ToNot(HaveOccurred()) - pod.Spec.Containers[0].Image = "wrong-image" - err = k8sClient.Update(ctx, pod) + aeroCluster = createDummyAerospikeCluster(clusterNamespacedName, int32(size)) + nsList := aeroCluster.Spec.AerospikeConfig.Value["namespaces"].([]interface{}) + nsList = append(nsList, getNonSCNamespaceConfig("bar", "/test/dev/xvdf1")) + aeroCluster.Spec.AerospikeConfig.Value["namespaces"] = nsList + + aeroCluster.Spec.Storage.Volumes = append(aeroCluster.Spec.Storage.Volumes, + asdbv1.VolumeSpec{ + Name: "bar", + Source: asdbv1.VolumeSource{ + PersistentVolume: &asdbv1.PersistentVolumeSpec{ + Size: resource.MustParse("1Gi"), + StorageClass: storageClass, + VolumeMode: v1.PersistentVolumeBlock, + }, + }, + Aerospike: &asdbv1.AerospikeServerVolumeAttachment{ + Path: "/test/dev/xvdf1", + }, + }, + ) + racks := getDummyRackConf(1, 2) + aeroCluster.Spec.RackConfig = asdbv1.RackConfig{ + Namespaces: []string{scNamespace}, Racks: racks} + aeroCluster.Spec.PodSpec.MultiPodPerHost = false + err = deployCluster(k8sClient, ctx, aeroCluster) Expect(err).ToNot(HaveOccurred()) - By("Set IgnorePodList and scale down 1 pod") - aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) + // make the node unschedulable and delete the pod to make it pending + By(fmt.Sprintf("Drain the node %s", nodeList.Items[nodeToDrain].Name)) + err = cordonNodes(ctx, k8sClient, []v1.Node{nodeList.Items[nodeToDrain]}) Expect(err).ToNot(HaveOccurred()) - val := intstr.FromInt(1) - aeroCluster.Spec.RackConfig.MaxIgnorableFailedPods = &val - aeroCluster.Spec.Size-- - err = updateCluster(k8sClient, ctx, aeroCluster) + + podList, err = getPodList(aeroCluster, k8sClient) + Expect(err).ToNot(HaveOccurred()) + for idx := range podList.Items { + if podList.Items[idx].Spec.NodeName == nodeList.Items[nodeToDrain].Name { + Expect(k8sClient.Delete(ctx, &podList.Items[idx])).NotTo(HaveOccurred()) + } + } + }, + ) + + AfterEach( + func() { + // Uncordon all nodes + err = 
uncordonNodes(ctx, k8sClient, nodeList.Items) + Expect(err).ToNot(HaveOccurred()) + err = deleteCluster(k8sClient, ctx, aeroCluster) Expect(err).ToNot(HaveOccurred()) + }, + ) - By("Rolling restart cluster") + It( + "Should allow cluster operations with pending pod", func() { + By("Set MaxIgnorablePod and Rolling restart cluster") aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) Expect(err).ToNot(HaveOccurred()) + val := intstr.FromInt(1) + aeroCluster.Spec.RackConfig.MaxIgnorablePods = &val aeroCluster.Spec.AerospikeConfig.Value["service"].(map[string]interface{})["proto-fd-max"] = int64(18000) err = updateCluster(k8sClient, ctx, aeroCluster) Expect(err).ToNot(HaveOccurred()) @@ -177,75 +228,102 @@ func clusterWithIgnorePodList(ctx goctx.Context) { err = updateCluster(k8sClient, ctx, aeroCluster) Expect(err).ToNot(HaveOccurred()) - By("Scale up") + By("Verify pending pod") + podList, err = getPodList(aeroCluster, k8sClient) + + var counter int + + for idx := range podList.Items { + if podList.Items[idx].Status.Phase == v1.PodPending { + counter++ + } + } + // There should be only one pending pod + Expect(counter).To(Equal(1)) + + By("Scale down 1 pod") aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) Expect(err).ToNot(HaveOccurred()) - aeroCluster.Spec.Size++ + aeroCluster.Spec.Size-- err = updateCluster(k8sClient, ctx, aeroCluster) Expect(err).ToNot(HaveOccurred()) - By(fmt.Sprintf("Verify pod %s is still in failed state", ignorePodName)) - err = k8sClient.Get(ctx, types.NamespacedName{Name: ignorePodName, - Namespace: clusterNamespacedName.Namespace}, pod) + By("Verify if all pods are running") + podList, err = getPodList(aeroCluster, k8sClient) Expect(err).ToNot(HaveOccurred()) - Expect(*pod.Status.ContainerStatuses[0].Started).To(BeFalse()) - Expect(pod.Status.ContainerStatuses[0].Ready).To(BeFalse()) - By(fmt.Sprintf( - "Remove pod from IgnorePodList and verify pod %s is in running state", ignorePodName)) + for idx := range podList.Items { + Expect(utils.IsPodRunningAndReady(&podList.Items[idx])).To(BeTrue()) + } + }, + ) + + It( + "Should allow namespace addition and removal with pending pod", func() { + By("Set MaxIgnorablePod and Rolling restart by removing namespace") aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) Expect(err).ToNot(HaveOccurred()) - aeroCluster.Spec.RackConfig.MaxIgnorableFailedPods = nil + val := intstr.FromInt(1) + aeroCluster.Spec.RackConfig.MaxIgnorablePods = &val + nsList := aeroCluster.Spec.AerospikeConfig.Value["namespaces"].([]interface{}) + nsList = nsList[:len(nsList)-1] + aeroCluster.Spec.AerospikeConfig.Value["namespaces"] = nsList err = updateCluster(k8sClient, ctx, aeroCluster) Expect(err).ToNot(HaveOccurred()) - err = k8sClient.Get(ctx, types.NamespacedName{Name: ignorePodName, - Namespace: clusterNamespacedName.Namespace}, pod) + err = validateDirtyVolumes(ctx, k8sClient, clusterNamespacedName, []string{"bar"}) + Expect(err).ToNot(HaveOccurred()) + + By("RollingRestart by re-using previously removed namespace storage") + aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) Expect(err).ToNot(HaveOccurred()) - Expect(*pod.Status.ContainerStatuses[0].Started).To(BeTrue()) - Expect(pod.Status.ContainerStatuses[0].Ready).To(BeTrue()) - Expect(pod.Spec.Containers[0].Image).To(Equal(newImage)) - } + nsList = aeroCluster.Spec.AerospikeConfig.Value["namespaces"].([]interface{}) + nsList = append(nsList, getNonSCNamespaceConfig("bar", "/test/dev/xvdf1")) + 
aeroCluster.Spec.AerospikeConfig.Value["namespaces"] = nsList + + err = updateCluster(k8sClient, ctx, aeroCluster) + Expect(err).ToNot(HaveOccurred()) + }, + ) + }, + ) + + Context( + "UpdateClusterWithMaxIgnorablePodAndFailedPod", func() { + clusterNamespacedName := getNamespacedName( + "ignore-pod-cluster", namespace, + ) + + var ( + aeroCluster *asdbv1.AerospikeCluster ) BeforeEach( func() { aeroCluster = createDummyAerospikeCluster(clusterNamespacedName, 4) + aeroCluster.Spec.AerospikeConfig = getSCAndNonSCAerospikeConfig() racks := getDummyRackConf(1, 2) - aeroCluster.Spec.RackConfig = asdbv1.RackConfig{Racks: racks} - err = deployCluster(k8sClient, ctx, aeroCluster) + aeroCluster.Spec.RackConfig = asdbv1.RackConfig{ + Namespaces: []string{scNamespace}, Racks: racks} + err := deployCluster(k8sClient, ctx, aeroCluster) Expect(err).ToNot(HaveOccurred()) }, ) AfterEach( func() { - err = deleteCluster(k8sClient, ctx, aeroCluster) + err := deleteCluster(k8sClient, ctx, aeroCluster) Expect(err).ToNot(HaveOccurred()) }, ) - It( - "Should allow cluster operations with random failed pod", func() { - // test with failed pod in between statefulset replicas - testClusterLifecycle(clusterNamespacedName.Name + "-2-0") - }, - ) - - It( - "Should allow cluster operations with sequential(last replica) failed pod", func() { - // test with last replica of statefulset as failed pod - testClusterLifecycle(clusterNamespacedName.Name + "-1-1") - }, - ) - It( "Should allow rack deletion with failed pods in different rack", func() { By("Fail 1-1 aerospike pod") ignorePodName := clusterNamespacedName.Name + "-1-1" pod := &v1.Pod{} - err = k8sClient.Get(ctx, types.NamespacedName{Name: ignorePodName, + err := k8sClient.Get(ctx, types.NamespacedName{Name: ignorePodName, Namespace: clusterNamespacedName.Namespace}, pod) Expect(err).ToNot(HaveOccurred()) @@ -257,17 +335,24 @@ func clusterWithIgnorePodList(ctx goctx.Context) { aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) Expect(err).ToNot(HaveOccurred()) val := intstr.FromInt(1) - aeroCluster.Spec.RackConfig.MaxIgnorableFailedPods = &val - aeroCluster.Spec.RackConfig = asdbv1.RackConfig{Racks: getDummyRackConf(1)} + aeroCluster.Spec.RackConfig.MaxIgnorablePods = &val + aeroCluster.Spec.RackConfig.Racks = getDummyRackConf(1) err = updateCluster(k8sClient, ctx, aeroCluster) Expect(err).ToNot(HaveOccurred()) - By(fmt.Sprintf("Verify pod %s is still in failed state", ignorePodName)) - err = k8sClient.Get(ctx, types.NamespacedName{Name: ignorePodName, - Namespace: clusterNamespacedName.Namespace}, pod) - Expect(err).ToNot(HaveOccurred()) - Expect(*pod.Status.ContainerStatuses[0].Started).To(BeFalse()) - Expect(pod.Status.ContainerStatuses[0].Ready).To(BeFalse()) + By(fmt.Sprintf("Verify if failed pod %s is automatically recovered", ignorePodName)) + Eventually(func() bool { + err = k8sClient.Get(ctx, types.NamespacedName{Name: ignorePodName, + Namespace: clusterNamespacedName.Namespace}, pod) + + return *pod.Status.ContainerStatuses[0].Started && pod.Status.ContainerStatuses[0].Ready + }, 1*time.Minute).Should(BeTrue()) + + Eventually(func() error { + return InterceptGomegaFailure(func() { + validateRoster(k8sClient, ctx, clusterNamespacedName, scNamespace) + }) + }, 4*time.Minute).Should(BeNil()) }, ) }, From ba63462f9dbf3dd402e0c1148fb5ea3cc9099a6e Mon Sep 17 00:00:00 2001 From: Abhisek Dwivedi Date: Fri, 1 Dec 2023 00:12:34 +0530 Subject: [PATCH 13/17] cleanup go.sum --- go.sum | 2 -- 1 file changed, 2 deletions(-) diff --git a/go.sum b/go.sum 
index 25c3d901b..abd92dab8 100644 --- a/go.sum +++ b/go.sum @@ -598,8 +598,6 @@ github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c/go.mod h1:X0CRv0ky0k6m906ixxpzmDRLvX58TFUKS2eePweuyxk= github.com/aerospike/aerospike-client-go/v6 v6.14.0 h1:Z3FcGWJda1sagzdc6Akz4EJ13Pq55Uyn6qtFLrVUDd0= github.com/aerospike/aerospike-client-go/v6 v6.14.0/go.mod h1:/0Wm81GhMqem+9flWcpazPKoRfjFeG6WrQdXGiMNi0A= -github.com/aerospike/aerospike-management-lib v0.0.0-20231107182540-fef71e1f5946 h1:wwCzPj4qk4EfdISK6tzNoEwSLg9vbeqBloNmhfB8mNo= -github.com/aerospike/aerospike-management-lib v0.0.0-20231107182540-fef71e1f5946/go.mod h1:LPOsGG8okRSH4hN9Y8VXFzsfIpBDj2WKEsI/f6wxwaw= github.com/aerospike/aerospike-management-lib v0.0.0-20231129055344-b6aff63f1dbb h1:ykX3ElBNT/VOUhw/+5+jiFnWw3LSbPfl6eRrhQzBBFk= github.com/aerospike/aerospike-management-lib v0.0.0-20231129055344-b6aff63f1dbb/go.mod h1:LPOsGG8okRSH4hN9Y8VXFzsfIpBDj2WKEsI/f6wxwaw= github.com/ajstarks/deck v0.0.0-20200831202436-30c9fc6549a9/go.mod h1:JynElWSGnm/4RlzPXRlREEwqTHAN3T56Bv2ITsFT3gY= From 188013caea631342ec12e19630a089e71d81c4b4 Mon Sep 17 00:00:00 2001 From: Sudhanshu Ranjan Date: Sat, 2 Dec 2023 12:32:45 +0530 Subject: [PATCH 14/17] Fixed tests --- controllers/pod.go | 8 +- controllers/rack.go | 43 +++++--- controllers/reconciler.go | 56 ++++++----- test/cluster_test.go | 205 ++++++++++++++++++-------------------- test/utils.go | 11 +- 5 files changed, 167 insertions(+), 156 deletions(-) diff --git a/controllers/pod.go b/controllers/pod.go index 3fb2f840e..b5ef2a62e 100644 --- a/controllers/pod.go +++ b/controllers/pod.go @@ -650,8 +650,8 @@ func (r *SingleClusterReconciler) cleanupDanglingPodsRack(sts *appsv1.StatefulSe // getIgnorablePods returns pods: // 1. From racksToDelete that are currently not running and can be ignored in stability checks. -// 2. Failed/pending pods identified using maxIgnorablePods field and can be ignored from stability checks. -func (r *SingleClusterReconciler) getIgnorablePods(racksToDelete []asdbv1.Rack, configureRacks []RackState) ( +// 2. Failed/pending pods from the configuredRacks identified using maxIgnorablePods field and can be ignored from stability checks. +func (r *SingleClusterReconciler) getIgnorablePods(racksToDelete []asdbv1.Rack, configuredRacks []RackState) ( sets.Set[string], error, ) { ignorablePodNames := sets.Set[string]{} @@ -670,8 +670,8 @@ func (r *SingleClusterReconciler) getIgnorablePods(racksToDelete []asdbv1.Rack, } } - for idx := range configureRacks { - rack := &configureRacks[idx] + for idx := range configuredRacks { + rack := &configuredRacks[idx] failedAllowed, _ := intstr.GetScaledValueFromIntOrPercent( r.aeroCluster.Spec.RackConfig.MaxIgnorablePods, rack.Size, false, diff --git a/controllers/rack.go b/controllers/rack.go index e03029ec0..66e269349 100644 --- a/controllers/rack.go +++ b/controllers/rack.go @@ -420,31 +420,44 @@ func (r *SingleClusterReconciler) upgradeOrRollingRestartRack(found *appsv1.Stat } if r.aeroCluster.Spec.RackConfig.MaxIgnorablePods != nil { - podList, err := r.getOrderedRackPodList(rackState.Rack.ID) - if err != nil { - return found, reconcileError(fmt.Errorf("failed to list pods: %v", err)) + if res := r.handleNSOrDeviceRemovalForIgnorablePods(rackState, ignorablePodNames); !res.isSuccess { + return found, res } - // Filter ignoredPods to update their dirtyVolumes in the status. 
- // IgnoredPods are skipped from upgrade/rolling restart, and as a result in case of device removal, dirtyVolumes - // are not updated in their pod status. This makes devices un-reusable as they cannot be cleaned up during init phase. - // So, explicitly add dirtyVolumes for ignoredPods, so that they can be cleaned in the init phase. - var ignoredPod []*corev1.Pod + } - for idx := range podList { - pod := podList[idx] + return found, reconcileSuccess() +} + +func (r *SingleClusterReconciler) handleNSOrDeviceRemovalForIgnorablePods( + rackState *RackState, ignorablePodNames sets.Set[string], +) reconcileResult { + podList, err := r.getOrderedRackPodList(rackState.Rack.ID) + if err != nil { + return reconcileError(fmt.Errorf("failed to list pods: %v", err)) + } + // Filter ignoredPods to update their dirtyVolumes in the status. + // IgnoredPods are skipped from upgrade/rolling restart, and as a result in case of device removal, dirtyVolumes + // are not updated in their pod status. This makes devices un-reusable as they cannot be cleaned up during init phase. + // So, explicitly add dirtyVolumes for ignoredPods, so that they can be cleaned in the init phase. + var ignoredPod []*corev1.Pod + + for idx := range podList { + pod := podList[idx] + // Pods, that are not in status are not even initialized, so no need to update dirtyVolumes. + if _, ok := r.aeroCluster.Status.Pods[pod.Name]; ok { if ignorablePodNames.Has(pod.Name) { ignoredPod = append(ignoredPod, pod) } } + } - if len(ignoredPod) > 0 { - if err := r.handleNSOrDeviceRemoval(rackState, ignoredPod); err != nil { - return found, reconcileError(err) - } + if len(ignoredPod) > 0 { + if err := r.handleNSOrDeviceRemoval(rackState, ignoredPod); err != nil { + return reconcileError(err) } } - return found, reconcileSuccess() + return reconcileSuccess() } func (r *SingleClusterReconciler) reconcileRack( diff --git a/controllers/reconciler.go b/controllers/reconciler.go index 4b1df97a9..58b4417bb 100644 --- a/controllers/reconciler.go +++ b/controllers/reconciler.go @@ -218,42 +218,50 @@ func (r *SingleClusterReconciler) Reconcile() (ctrl.Result, error) { // Try to recover pods only when MaxIgnorablePods is set if r.aeroCluster.Spec.RackConfig.MaxIgnorablePods != nil { - podList, gErr := r.getClusterPodList() - if gErr != nil { - r.Log.Error(gErr, "Failed to get cluster pod list") - return reconcile.Result{}, gErr + if res := r.recoverIgnorablePods(); !res.isSuccess { + return res.getResult() } + } - r.Log.Info("Try to recover failed/pending pods if any") + r.Log.Info("Reconcile completed successfully") - var anyPodFailed bool - // Try to recover failed/pending pods by deleting them - for idx := range podList.Items { - if cErr := utils.CheckPodFailed(&podList.Items[idx]); cErr != nil { - anyPodFailed = true + return reconcile.Result{}, nil +} - if err := r.createOrUpdatePodServiceIfNeeded([]string{podList.Items[idx].Name}); err != nil { - return reconcile.Result{}, err - } +func (r *SingleClusterReconciler) recoverIgnorablePods() reconcileResult { + podList, gErr := r.getClusterPodList() + if gErr != nil { + r.Log.Error(gErr, "Failed to get cluster pod list") + return reconcileError(gErr) + } - if err := r.Client.Delete(context.TODO(), &podList.Items[idx]); err != nil { - r.Log.Error(err, "Failed to delete pod", "pod", podList.Items[idx].Name) - return reconcile.Result{}, err - } + r.Log.Info("Try to recover failed/pending pods if any") - r.Log.Info("Deleted pod", "pod", podList.Items[idx].Name) + var anyPodFailed bool + // Try to recover 
failed/pending pods by deleting them + for idx := range podList.Items { + if cErr := utils.CheckPodFailed(&podList.Items[idx]); cErr != nil { + anyPodFailed = true + + if err := r.createOrUpdatePodServiceIfNeeded([]string{podList.Items[idx].Name}); err != nil { + return reconcileError(err) + } + + if err := r.Client.Delete(context.TODO(), &podList.Items[idx]); err != nil { + r.Log.Error(err, "Failed to delete pod", "pod", podList.Items[idx].Name) + return reconcileError(err) } - } - if anyPodFailed { - r.Log.Info("Found failed/pending pod(s), requeuing") - return reconcile.Result{Requeue: true}, nil + r.Log.Info("Deleted pod", "pod", podList.Items[idx].Name) } } - r.Log.Info("Reconcile completed successfully") + if anyPodFailed { + r.Log.Info("Found failed/pending pod(s), requeuing") + return reconcileRequeueAfter(0) + } - return reconcile.Result{}, nil + return reconcileSuccess() } func (r *SingleClusterReconciler) validateAndReconcileAccessControl(ignorablePodNames sets.Set[string]) error { diff --git a/test/cluster_test.go b/test/cluster_test.go index 174b3523a..38f047e69 100644 --- a/test/cluster_test.go +++ b/test/cluster_test.go @@ -133,78 +133,40 @@ func ScaleDownWithMigrateFillDelay(ctx goctx.Context) { } func clusterWithMaxIgnorablePod(ctx goctx.Context) { - Context( - "UpdateClusterWithMaxIgnorablePodAndPendingPod", func() { - clusterNamespacedName := getNamespacedName( - "ignore-pod-cluster", namespace, - ) + var ( + aeroCluster *asdbv1.AerospikeCluster + err error + nodeList = &v1.NodeList{} + podList = &v1.PodList{} + nodeToDrain int + ) - var ( - aeroCluster *asdbv1.AerospikeCluster - err error - nodeList = &v1.NodeList{} - podList = &v1.PodList{} - nodeToDrain int - ) + clusterNamespacedName := getNamespacedName( + "ignore-pod-cluster", namespace, + ) + AfterEach( + func() { + err = deleteCluster(k8sClient, ctx, aeroCluster) + Expect(err).ToNot(HaveOccurred()) + }, + ) + + Context( + "UpdateClusterWithMaxIgnorablePodAndPendingPod", func() { BeforeEach( func() { nodeList, err = getNodeList(ctx, k8sClient) Expect(err).ToNot(HaveOccurred()) - nodeToDrain = len(nodeList.Items) / 2 size := len(nodeList.Items) - nodeToDrain - err = cordonNodes(ctx, k8sClient, nodeList.Items[:nodeToDrain]) - Expect(err).ToNot(HaveOccurred()) - - aeroCluster = createDummyAerospikeCluster(clusterNamespacedName, int32(size)) - nsList := aeroCluster.Spec.AerospikeConfig.Value["namespaces"].([]interface{}) - nsList = append(nsList, getNonSCNamespaceConfig("bar", "/test/dev/xvdf1")) - aeroCluster.Spec.AerospikeConfig.Value["namespaces"] = nsList - - aeroCluster.Spec.Storage.Volumes = append(aeroCluster.Spec.Storage.Volumes, - asdbv1.VolumeSpec{ - Name: "bar", - Source: asdbv1.VolumeSource{ - PersistentVolume: &asdbv1.PersistentVolumeSpec{ - Size: resource.MustParse("1Gi"), - StorageClass: storageClass, - VolumeMode: v1.PersistentVolumeBlock, - }, - }, - Aerospike: &asdbv1.AerospikeServerVolumeAttachment{ - Path: "/test/dev/xvdf1", - }, - }, - ) - racks := getDummyRackConf(1, 2) - aeroCluster.Spec.RackConfig = asdbv1.RackConfig{ - Namespaces: []string{scNamespace}, Racks: racks} - aeroCluster.Spec.PodSpec.MultiPodPerHost = false - err = deployCluster(k8sClient, ctx, aeroCluster) - Expect(err).ToNot(HaveOccurred()) - - // make the node unschedulable and delete the pod to make it pending - By(fmt.Sprintf("Drain the node %s", nodeList.Items[nodeToDrain].Name)) - err = cordonNodes(ctx, k8sClient, []v1.Node{nodeList.Items[nodeToDrain]}) - Expect(err).ToNot(HaveOccurred()) - - podList, err = 
getPodList(aeroCluster, k8sClient) - Expect(err).ToNot(HaveOccurred()) - for idx := range podList.Items { - if podList.Items[idx].Spec.NodeName == nodeList.Items[nodeToDrain].Name { - Expect(k8sClient.Delete(ctx, &podList.Items[idx])).NotTo(HaveOccurred()) - } - } - }, - ) + deployClusterForMaxIgnorablePods(ctx, clusterNamespacedName, size) - AfterEach( - func() { - // Uncordon all nodes - err = uncordonNodes(ctx, k8sClient, nodeList.Items) + By("Scale up 1 pod to make that pod pending due to lack of k8s nodes") + aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) Expect(err).ToNot(HaveOccurred()) - err = deleteCluster(k8sClient, ctx, aeroCluster) + aeroCluster.Spec.Size++ + err = k8sClient.Update(ctx, aeroCluster) Expect(err).ToNot(HaveOccurred()) }, ) @@ -257,34 +219,6 @@ func clusterWithMaxIgnorablePod(ctx goctx.Context) { } }, ) - - It( - "Should allow namespace addition and removal with pending pod", func() { - By("Set MaxIgnorablePod and Rolling restart by removing namespace") - aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) - Expect(err).ToNot(HaveOccurred()) - val := intstr.FromInt(1) - aeroCluster.Spec.RackConfig.MaxIgnorablePods = &val - nsList := aeroCluster.Spec.AerospikeConfig.Value["namespaces"].([]interface{}) - nsList = nsList[:len(nsList)-1] - aeroCluster.Spec.AerospikeConfig.Value["namespaces"] = nsList - err = updateCluster(k8sClient, ctx, aeroCluster) - Expect(err).ToNot(HaveOccurred()) - - err = validateDirtyVolumes(ctx, k8sClient, clusterNamespacedName, []string{"bar"}) - Expect(err).ToNot(HaveOccurred()) - - By("RollingRestart by re-using previously removed namespace storage") - aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) - Expect(err).ToNot(HaveOccurred()) - nsList = aeroCluster.Spec.AerospikeConfig.Value["namespaces"].([]interface{}) - nsList = append(nsList, getNonSCNamespaceConfig("bar", "/test/dev/xvdf1")) - aeroCluster.Spec.AerospikeConfig.Value["namespaces"] = nsList - - err = updateCluster(k8sClient, ctx, aeroCluster) - Expect(err).ToNot(HaveOccurred()) - }, - ) }, ) @@ -294,26 +228,9 @@ func clusterWithMaxIgnorablePod(ctx goctx.Context) { "ignore-pod-cluster", namespace, ) - var ( - aeroCluster *asdbv1.AerospikeCluster - ) - BeforeEach( func() { - aeroCluster = createDummyAerospikeCluster(clusterNamespacedName, 4) - aeroCluster.Spec.AerospikeConfig = getSCAndNonSCAerospikeConfig() - racks := getDummyRackConf(1, 2) - aeroCluster.Spec.RackConfig = asdbv1.RackConfig{ - Namespaces: []string{scNamespace}, Racks: racks} - err := deployCluster(k8sClient, ctx, aeroCluster) - Expect(err).ToNot(HaveOccurred()) - }, - ) - - AfterEach( - func() { - err := deleteCluster(k8sClient, ctx, aeroCluster) - Expect(err).ToNot(HaveOccurred()) + deployClusterForMaxIgnorablePods(ctx, clusterNamespacedName, 4) }, ) @@ -355,8 +272,80 @@ func clusterWithMaxIgnorablePod(ctx goctx.Context) { }, 4*time.Minute).Should(BeNil()) }, ) + + It( + "Should allow namespace addition and removal with failed pod", func() { + By("Fail 1-1 aerospike pod") + ignorePodName := clusterNamespacedName.Name + "-1-1" + pod := &v1.Pod{} + + err := k8sClient.Get(ctx, types.NamespacedName{Name: ignorePodName, + Namespace: clusterNamespacedName.Namespace}, pod) + Expect(err).ToNot(HaveOccurred()) + + pod.Spec.Containers[0].Image = "wrong-image" + err = k8sClient.Update(ctx, pod) + Expect(err).ToNot(HaveOccurred()) + + By("Set MaxIgnorablePod and Rolling restart by removing namespace") + aeroCluster, err = getCluster(k8sClient, ctx, 
clusterNamespacedName) + Expect(err).ToNot(HaveOccurred()) + val := intstr.FromInt(1) + aeroCluster.Spec.RackConfig.MaxIgnorablePods = &val + nsList := aeroCluster.Spec.AerospikeConfig.Value["namespaces"].([]interface{}) + nsList = nsList[:len(nsList)-1] + aeroCluster.Spec.AerospikeConfig.Value["namespaces"] = nsList + err = updateCluster(k8sClient, ctx, aeroCluster) + Expect(err).ToNot(HaveOccurred()) + + err = validateDirtyVolumes(ctx, k8sClient, clusterNamespacedName, []string{"bar"}) + Expect(err).ToNot(HaveOccurred()) + + By("RollingRestart by re-using previously removed namespace storage") + aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) + Expect(err).ToNot(HaveOccurred()) + nsList = aeroCluster.Spec.AerospikeConfig.Value["namespaces"].([]interface{}) + nsList = append(nsList, getNonSCNamespaceConfig("barnew", "/test/dev/xvdf1")) + aeroCluster.Spec.AerospikeConfig.Value["namespaces"] = nsList + + err = updateCluster(k8sClient, ctx, aeroCluster) + Expect(err).ToNot(HaveOccurred()) + }, + ) + }, + ) +} + +func deployClusterForMaxIgnorablePods(ctx goctx.Context, clusterNamespacedName types.NamespacedName, size int) { + By("Deploying cluster") + aeroCluster := createDummyAerospikeCluster(clusterNamespacedName, int32(size)) + + // Add a nonsc namespace. This will be used to test dirty volumes + nsList := aeroCluster.Spec.AerospikeConfig.Value["namespaces"].([]interface{}) + nsList = append(nsList, getNonSCNamespaceConfig("bar", "/test/dev/xvdf1")) + aeroCluster.Spec.AerospikeConfig.Value["namespaces"] = nsList + + aeroCluster.Spec.Storage.Volumes = append(aeroCluster.Spec.Storage.Volumes, + asdbv1.VolumeSpec{ + Name: "bar", + Source: asdbv1.VolumeSource{ + PersistentVolume: &asdbv1.PersistentVolumeSpec{ + Size: resource.MustParse("1Gi"), + StorageClass: storageClass, + VolumeMode: v1.PersistentVolumeBlock, + }, + }, + Aerospike: &asdbv1.AerospikeServerVolumeAttachment{ + Path: "/test/dev/xvdf1", + }, }, ) + racks := getDummyRackConf(1, 2) + aeroCluster.Spec.RackConfig = asdbv1.RackConfig{ + Namespaces: []string{scNamespace}, Racks: racks} + aeroCluster.Spec.PodSpec.MultiPodPerHost = false + err := deployCluster(k8sClient, ctx, aeroCluster) + Expect(err).ToNot(HaveOccurred()) } // Test cluster deployment with all image post 4.9.0 diff --git a/test/utils.go b/test/utils.go index 6046917f2..f00c99b1d 100644 --- a/test/utils.go +++ b/test/utils.go @@ -296,11 +296,12 @@ func isClusterStateValid( return false } - // Validate pods - if len(newCluster.Status.Pods) != replicas { - pkgLog.Info("Cluster status doesn't have pod status for all nodes. Cluster status may not have fully updated") - return false - } + // TODO: This is not valid for tests where maxUnavailablePods flag is used. We can take the param in func to skip this check + // // Validate pods + // if len(newCluster.Status.Pods) != replicas { + // pkgLog.Info("Cluster status doesn't have pod status for all nodes. 
Cluster status may not have fully updated") + // return false + // } for podName := range newCluster.Status.Pods { if newCluster.Status.Pods[podName].Aerospike.NodeID == "" { From cfb86cf7db6df945c2a7804a4c22965b2708d5b8 Mon Sep 17 00:00:00 2001 From: Sudhanshu Ranjan Date: Sat, 2 Dec 2023 13:27:16 +0530 Subject: [PATCH 15/17] Fix lint issues --- controllers/pod.go | 3 ++- test/aero_info.go | 34 ---------------------------------- test/cluster_test.go | 1 + test/utils.go | 3 ++- 4 files changed, 5 insertions(+), 36 deletions(-) diff --git a/controllers/pod.go b/controllers/pod.go index b5ef2a62e..1244e6f3e 100644 --- a/controllers/pod.go +++ b/controllers/pod.go @@ -650,7 +650,8 @@ func (r *SingleClusterReconciler) cleanupDanglingPodsRack(sts *appsv1.StatefulSe // getIgnorablePods returns pods: // 1. From racksToDelete that are currently not running and can be ignored in stability checks. -// 2. Failed/pending pods from the configuredRacks identified using maxIgnorablePods field and can be ignored from stability checks. +// 2. Failed/pending pods from the configuredRacks identified using maxIgnorablePods field and +// can be ignored from stability checks. func (r *SingleClusterReconciler) getIgnorablePods(racksToDelete []asdbv1.Rack, configuredRacks []RackState) ( sets.Set[string], error, ) { diff --git a/test/aero_info.go b/test/aero_info.go index e7092e019..4156a99a6 100644 --- a/test/aero_info.go +++ b/test/aero_info.go @@ -288,40 +288,6 @@ func getNodeList(ctx goctx.Context, k8sClient client.Client) ( return nodeList, nil } -func cordonNodes(ctx goctx.Context, k8sClient client.Client, nodes []corev1.Node) error { - for idx := range nodes { - // fetch the latest node object to avoid object conflict - if err := k8sClient.Get(ctx, types.NamespacedName{Name: nodes[idx].Name}, &nodes[idx]); err != nil { - return err - } - - nodes[idx].Spec.Unschedulable = true - - if err := k8sClient.Update(ctx, &nodes[idx]); err != nil { - return err - } - } - - return nil -} - -func uncordonNodes(ctx goctx.Context, k8sClient client.Client, nodes []corev1.Node) error { - for idx := range nodes { - // fetch the latest node object to avoid object conflict - if err := k8sClient.Get(ctx, types.NamespacedName{Name: nodes[idx].Name}, &nodes[idx]); err != nil { - return err - } - - nodes[idx].Spec.Unschedulable = false - - if err := k8sClient.Update(ctx, &nodes[idx]); err != nil { - return err - } - } - - return nil -} - func getZones(ctx goctx.Context, k8sClient client.Client) ([]string, error) { unqZones := map[string]int{} diff --git a/test/cluster_test.go b/test/cluster_test.go index 38f047e69..db9e4fde7 100644 --- a/test/cluster_test.go +++ b/test/cluster_test.go @@ -318,6 +318,7 @@ func clusterWithMaxIgnorablePod(ctx goctx.Context) { func deployClusterForMaxIgnorablePods(ctx goctx.Context, clusterNamespacedName types.NamespacedName, size int) { By("Deploying cluster") + aeroCluster := createDummyAerospikeCluster(clusterNamespacedName, int32(size)) // Add a nonsc namespace. This will be used to test dirty volumes diff --git a/test/utils.go b/test/utils.go index f00c99b1d..07b885ac0 100644 --- a/test/utils.go +++ b/test/utils.go @@ -296,7 +296,8 @@ func isClusterStateValid( return false } - // TODO: This is not valid for tests where maxUnavailablePods flag is used. We can take the param in func to skip this check + // TODO: This is not valid for tests where maxUnavailablePods flag is used. 
+ // We can take the param in func to skip this check // // Validate pods // if len(newCluster.Status.Pods) != replicas { // pkgLog.Info("Cluster status doesn't have pod status for all nodes. Cluster status may not have fully updated") From b1815676fd5dc2042f83128a182784c376d39ef8 Mon Sep 17 00:00:00 2001 From: Sudhanshu Ranjan Date: Sun, 3 Dec 2023 11:28:31 +0530 Subject: [PATCH 16/17] Fix rack management test --- test/rack_management_test.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/test/rack_management_test.go b/test/rack_management_test.go index 68b0d0bfb..7edd0b93b 100644 --- a/test/rack_management_test.go +++ b/test/rack_management_test.go @@ -804,13 +804,16 @@ var _ = Describe( ) BeforeEach( func() { - aeroCluster := createDummyAerospikeCluster(clusterNamespacedName, 2) + nodes, err := getNodeList(ctx, k8sClient) + Expect(err).ToNot(HaveOccurred()) + + aeroCluster := createDummyAerospikeCluster(clusterNamespacedName, int32(len(nodes.Items))) racks := getDummyRackConf(1, 2) aeroCluster.Spec.RackConfig = asdbv1.RackConfig{Racks: racks} aeroCluster.Spec.PodSpec.MultiPodPerHost = false By("Deploying cluster") - err := deployCluster(k8sClient, ctx, aeroCluster) + err = deployCluster(k8sClient, ctx, aeroCluster) Expect(err).ToNot(HaveOccurred()) }, ) @@ -831,22 +834,19 @@ var _ = Describe( aeroCluster, err := getCluster(k8sClient, ctx, clusterNamespacedName) Expect(err).ToNot(HaveOccurred()) - nodes, err := getNodeList(ctx, k8sClient) - Expect(err).ToNot(HaveOccurred()) - - aeroCluster.Spec.Size = int32(len(nodes.Items) + 1) + aeroCluster.Spec.Size++ // scaleup, no need to wait for long - err = updateClusterWithTO(k8sClient, ctx, aeroCluster, time.Minute*2) + err = updateClusterWithTO(k8sClient, ctx, aeroCluster, time.Minute*1) Expect(err).To(HaveOccurred()) By("Scaling down the cluster size, failed pods should recover") aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) Expect(err).ToNot(HaveOccurred()) - aeroCluster.Spec.Size = int32(len(nodes.Items) - 1) + aeroCluster.Spec.Size-- - err = updateClusterWithTO(k8sClient, ctx, aeroCluster, time.Minute*10) + err = updateClusterWithTO(k8sClient, ctx, aeroCluster, time.Minute*2) Expect(err).ToNot(HaveOccurred()) }) }) From ccffd64b8ff20e3f69b624c6ff1b3b2df589c326 Mon Sep 17 00:00:00 2001 From: Sudhanshu Ranjan Date: Mon, 4 Dec 2023 11:39:36 +0530 Subject: [PATCH 17/17] Fix test --- test/cluster_test.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/cluster_test.go b/test/cluster_test.go index db9e4fde7..697256638 100644 --- a/test/cluster_test.go +++ b/test/cluster_test.go @@ -138,7 +138,6 @@ func clusterWithMaxIgnorablePod(ctx goctx.Context) { err error nodeList = &v1.NodeList{} podList = &v1.PodList{} - nodeToDrain int ) clusterNamespacedName := getNamespacedName( @@ -158,7 +157,7 @@ func clusterWithMaxIgnorablePod(ctx goctx.Context) { func() { nodeList, err = getNodeList(ctx, k8sClient) Expect(err).ToNot(HaveOccurred()) - size := len(nodeList.Items) - nodeToDrain + size := len(nodeList.Items) deployClusterForMaxIgnorablePods(ctx, clusterNamespacedName, size)