diff --git a/api/v1/aerospikecluster_types.go b/api/v1/aerospikecluster_types.go index e7b01dcd3..bcbb7cd37 100644 --- a/api/v1/aerospikecluster_types.go +++ b/api/v1/aerospikecluster_types.go @@ -74,9 +74,10 @@ type AerospikeClusterSpec struct { //nolint:govet // for readability // RosterNodeBlockList is a list of blocked nodeIDs from roster in a strong-consistency setup // +operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Roster Node BlockList" RosterNodeBlockList []string `json:"rosterNodeBlockList,omitempty"` - // IgnorePodList is the list of pods which are ignored by the operator while checking the cluster stability and - // are not considered part of cluster. This is only useful when there are some failed pods and operator is required - // to do some operation on the cluster. If pods in running state are defined in this list, they are not ignored. + // IgnorePodList is a list of pods that the operator will ignore while assessing cluster stability. + // Pods specified in this list are not considered part of the cluster. This is particularly useful when + // there are failed pods and the operator needs to perform certain operations on the cluster. Note that + // running pods included in this list will not be ignored. // +operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Ignore Pod List" IgnorePodList []string `json:"ignorePodList,omitempty"` } @@ -621,9 +622,10 @@ type AerospikeClusterStatusSpec struct { //nolint:govet // for readability SeedsFinderServices SeedsFinderServices `json:"seedsFinderServices,omitempty"` // RosterNodeBlockList is a list of blocked nodeIDs from roster in a strong-consistency setup RosterNodeBlockList []string `json:"rosterNodeBlockList,omitempty"` - // IgnorePodList is the list of pods which are ignored by the operator while checking the cluster stability and - // are not considered part of cluster. This is only useful when there are some failed pods and operator is required - // to do some operation on the cluster. If pods in running state are defined in this list, they are not ignored. + // IgnorePodList is a list of pods that the operator will ignore while assessing cluster stability. + // Pods specified in this list are not considered part of the cluster. This is particularly useful when + // there are failed pods and the operator needs to perform certain operations on the cluster. Note that + // running pods included in this list will not be ignored. IgnorePodList []string `json:"ignorePodList,omitempty"` } diff --git a/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml b/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml index 30b71f75a..6d64a64b9 100644 --- a/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml +++ b/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml @@ -278,12 +278,12 @@ spec: type: string type: object ignorePodList: - description: IgnorePodList is the list of pods which are ignored by - the operator while checking the cluster stability and are not considered - part of cluster. This is only useful when there are some failed - pods and operator is required to do some operation on the cluster. - If pods in running state are defined in this list, they are not - ignored. + description: IgnorePodList is a list of pods that the operator will + ignore while assessing cluster stability. Pods specified in this + list are not considered part of the cluster. This is particularly + useful when there are failed pods and the operator needs to perform + certain operations on the cluster. Note that running pods included + in this list will not be ignored. items: type: string type: array @@ -8897,12 +8897,12 @@ spec: type: string type: object ignorePodList: - description: IgnorePodList is the list of pods which are ignored by - the operator while checking the cluster stability and are not considered - part of cluster. This is only useful when there are some failed - pods and operator is required to do some operation on the cluster. - If pods in running state are defined in this list, they are not - ignored. + description: IgnorePodList is a list of pods that the operator will + ignore while assessing cluster stability. Pods specified in this + list are not considered part of the cluster. This is particularly + useful when there are failed pods and the operator needs to perform + certain operations on the cluster. Note that running pods included + in this list will not be ignored. items: type: string type: array diff --git a/controllers/rack.go b/controllers/rack.go index 9493eddf1..263a4e380 100644 --- a/controllers/rack.go +++ b/controllers/rack.go @@ -661,6 +661,11 @@ func (r *SingleClusterReconciler) upgradeRack(statefulSet *appsv1.StatefulSet, r pod := podList[idx] r.Log.Info("Check if pod needs upgrade or not", "podName", pod.Name) + if ignorablePodNames.Has(pod.Name) { + r.Log.Info("Pod found in ignore pod list, skipping", "podName", pod.Name) + continue + } + if r.isPodUpgraded(pod) { r.Log.Info("Pod doesn't need upgrade", "podName", pod.Name) continue @@ -948,6 +953,11 @@ func (r *SingleClusterReconciler) rollingRestartRack(found *appsv1.StatefulSet, for idx := range podList { pod := podList[idx] + if ignorablePodNames.Has(pod.Name) { + r.Log.Info("Pod found in ignore pod list, skipping", "podName", pod.Name) + continue + } + restartType := restartTypeMap[pod.Name] if restartType == noRestart { r.Log.Info("This Pod doesn't need rolling restart, Skip this", "pod", pod.Name) diff --git a/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml b/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml index 30b71f75a..6d64a64b9 100644 --- a/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml +++ b/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml @@ -278,12 +278,12 @@ spec: type: string type: object ignorePodList: - description: IgnorePodList is the list of pods which are ignored by - the operator while checking the cluster stability and are not considered - part of cluster. This is only useful when there are some failed - pods and operator is required to do some operation on the cluster. - If pods in running state are defined in this list, they are not - ignored. + description: IgnorePodList is a list of pods that the operator will + ignore while assessing cluster stability. Pods specified in this + list are not considered part of the cluster. This is particularly + useful when there are failed pods and the operator needs to perform + certain operations on the cluster. Note that running pods included + in this list will not be ignored. items: type: string type: array @@ -8897,12 +8897,12 @@ spec: type: string type: object ignorePodList: - description: IgnorePodList is the list of pods which are ignored by - the operator while checking the cluster stability and are not considered - part of cluster. This is only useful when there are some failed - pods and operator is required to do some operation on the cluster. - If pods in running state are defined in this list, they are not - ignored. + description: IgnorePodList is a list of pods that the operator will + ignore while assessing cluster stability. Pods specified in this + list are not considered part of the cluster. This is particularly + useful when there are failed pods and the operator needs to perform + certain operations on the cluster. Note that running pods included + in this list will not be ignored. items: type: string type: array diff --git a/test/cluster_test.go b/test/cluster_test.go index 186361170..9c50ad9cf 100644 --- a/test/cluster_test.go +++ b/test/cluster_test.go @@ -139,36 +139,15 @@ func clusterWithIgnorePodList(ctx goctx.Context) { var ( aeroCluster *asdbv1.AerospikeCluster err error - ) - - BeforeEach( - func() { - aeroCluster = createDummyAerospikeCluster(clusterNamespacedName, 4) - racks := getDummyRackConf(1, 2) - aeroCluster.Spec.RackConfig = asdbv1.RackConfig{Racks: racks} - err = deployCluster(k8sClient, ctx, aeroCluster) - Expect(err).ToNot(HaveOccurred()) - }, - ) - - AfterEach( - func() { - err = deleteCluster(k8sClient, ctx, aeroCluster) - Expect(err).ToNot(HaveOccurred()) - }, - ) - It( - "Should allow cluster operations with failed pods", func() { - By("Fail 2-0 aerospike pod") + testClusterLifecycle = func(ignorePodName string) { + By(fmt.Sprintf("Fail %s aerospike pod", ignorePodName)) pod := &v1.Pod{} - ignorePodName := clusterNamespacedName.Name + "-2-0" err = k8sClient.Get(ctx, types.NamespacedName{Name: ignorePodName, Namespace: clusterNamespacedName.Namespace}, pod) Expect(err).ToNot(HaveOccurred()) - // This will lead to pod 2-0 pod in failed state pod.Spec.Containers[0].Image = "wrong-image" err = k8sClient.Update(ctx, pod) Expect(err).ToNot(HaveOccurred()) @@ -203,14 +182,15 @@ func clusterWithIgnorePodList(ctx goctx.Context) { err = updateCluster(k8sClient, ctx, aeroCluster) Expect(err).ToNot(HaveOccurred()) - By("Verify pod 2-0 is still in failed state") + By(fmt.Sprintf("Verify pod %s is still in failed state", ignorePodName)) err = k8sClient.Get(ctx, types.NamespacedName{Name: ignorePodName, Namespace: clusterNamespacedName.Namespace}, pod) Expect(err).ToNot(HaveOccurred()) Expect(*pod.Status.ContainerStatuses[0].Started).To(BeFalse()) Expect(pod.Status.ContainerStatuses[0].Ready).To(BeFalse()) - By("Remove pod from IgnorePodList and verify pod 2-0 is in running state") + By(fmt.Sprintf( + "Remove pod from IgnorePodList and verify pod %s is in running state", ignorePodName)) aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) Expect(err).ToNot(HaveOccurred()) aeroCluster.Spec.IgnorePodList = []string{} @@ -223,6 +203,68 @@ func clusterWithIgnorePodList(ctx goctx.Context) { Expect(*pod.Status.ContainerStatuses[0].Started).To(BeTrue()) Expect(pod.Status.ContainerStatuses[0].Ready).To(BeTrue()) Expect(pod.Spec.Containers[0].Image).To(Equal(newImage)) + } + ) + + BeforeEach( + func() { + aeroCluster = createDummyAerospikeCluster(clusterNamespacedName, 4) + racks := getDummyRackConf(1, 2) + aeroCluster.Spec.RackConfig = asdbv1.RackConfig{Racks: racks} + err = deployCluster(k8sClient, ctx, aeroCluster) + Expect(err).ToNot(HaveOccurred()) + }, + ) + + AfterEach( + func() { + err = deleteCluster(k8sClient, ctx, aeroCluster) + Expect(err).ToNot(HaveOccurred()) + }, + ) + + It( + "Should allow cluster operations with random failed pod", func() { + // test with failed pod in between statefulset replicas + testClusterLifecycle(clusterNamespacedName.Name + "-2-0") + }, + ) + + It( + "Should allow cluster operations with sequential(last replica) failed pod", func() { + // test with last replica of statefulset as failed pod + testClusterLifecycle(clusterNamespacedName.Name + "-1-1") + }, + ) + + It( + "Should allow rack deletion with failed pods in different rack", func() { + By("Fail 1-1 aerospike pod") + ignorePodName := clusterNamespacedName.Name + "-1-1" + pod := &v1.Pod{} + + err = k8sClient.Get(ctx, types.NamespacedName{Name: ignorePodName, + Namespace: clusterNamespacedName.Namespace}, pod) + Expect(err).ToNot(HaveOccurred()) + + pod.Spec.Containers[0].Image = "wrong-image" + err = k8sClient.Update(ctx, pod) + Expect(err).ToNot(HaveOccurred()) + + By("Delete rack with id 2") + aeroCluster, err = getCluster(k8sClient, ctx, clusterNamespacedName) + Expect(err).ToNot(HaveOccurred()) + aeroCluster.Spec.IgnorePodList = []string{ignorePodName} + aeroCluster.Spec.RackConfig = asdbv1.RackConfig{Racks: getDummyRackConf(1)} + err = updateCluster(k8sClient, ctx, aeroCluster) + Expect(err).ToNot(HaveOccurred()) + + By(fmt.Sprintf("Verify pod %s is still in failed state", ignorePodName)) + err = k8sClient.Get(ctx, types.NamespacedName{Name: ignorePodName, + Namespace: clusterNamespacedName.Namespace}, pod) + Expect(err).ToNot(HaveOccurred()) + Expect(*pod.Status.ContainerStatuses[0].Started).To(BeFalse()) + Expect(pod.Status.ContainerStatuses[0].Ready).To(BeFalse()) }, ) },