Skip to content

Commit

Permalink
Added/modified test-cases
Browse files Browse the repository at this point in the history
  • Loading branch information
abhishekdwivedi3060 committed Nov 30, 2023
1 parent ab029be commit 2989ad5
Show file tree
Hide file tree
Showing 12 changed files with 315 additions and 156 deletions.
17 changes: 10 additions & 7 deletions api/v1/aerospikecluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,14 +277,17 @@ type RackConfig struct { //nolint:govet // for readability
// RollingUpdateBatchSize is the percentage/number of rack pods that will be restarted simultaneously
// +optional
RollingUpdateBatchSize *intstr.IntOrString `json:"rollingUpdateBatchSize,omitempty"`
// MaxIgnorableFailedPods is the maximum percentage/number of rack pods that are in pending state due to scheduling
// issues. They are ignored while assessing cluster stability. Failed/pending pods identified using this value are
// not considered part of the cluster.
// This is particularly useful when there are failed/pending pods that cannot be recovered by updating the CR and
// the operator needs to perform certain operations on the cluster like Aerospike config change.
// Reset this value to 0 after the deployment is done, to avoid unintended consequences.
// MaxIgnorablePods is the maximum number/percentage of pending/failed pods in a rack that are ignored while
// assessing cluster stability. Pods identified using this value are not considered part of the cluster.
// Additionally, in SC mode clusters, these pods are removed from the roster.
// This is particularly useful when some pods are stuck in pending/failed state due to any scheduling issues and
// cannot be fixed by simply updating the CR.
// It enables the operator to perform specific operations on the cluster, like changing Aerospike configurations,
// without being hindered by these problematic pods.
// Remember to set MaxIgnorablePods back to 0 once the required operation is done.
// This makes sure that later on, all pods are properly counted when evaluating the cluster stability.
// +optional
MaxIgnorableFailedPods *intstr.IntOrString `json:"maxIgnorableFailedPods,omitempty"`
MaxIgnorablePods *intstr.IntOrString `json:"maxIgnorablePods,omitempty"`
}

// Rack specifies single rack config
Expand Down
8 changes: 4 additions & 4 deletions api/v1/aerospikecluster_validating_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -638,10 +638,10 @@ func (c *AerospikeCluster) validateRackConfig(_ logr.Logger) error {
}
}

// Validate MaxIgnorableFailedPods param
if c.Spec.RackConfig.MaxIgnorableFailedPods != nil {
if err := validateIntOrStringField(c.Spec.RackConfig.MaxIgnorableFailedPods,
"spec.rackConfig.maxIgnorableFailedPods"); err != nil {
// Validate MaxIgnorablePods param
if c.Spec.RackConfig.MaxIgnorablePods != nil {
if err := validateIntOrStringField(c.Spec.RackConfig.MaxIgnorablePods,
"spec.rackConfig.maxIgnorablePods"); err != nil {
return err
}
}
Expand Down
4 changes: 2 additions & 2 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

48 changes: 28 additions & 20 deletions config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4593,19 +4593,23 @@ spec:
Aerospike cluster. Pods will be deployed in given racks based on
given configuration
properties:
maxIgnorableFailedPods:
maxIgnorablePods:
anyOf:
- type: integer
- type: string
description: MaxIgnorableFailedPods is the maximum percentage/number
of rack pods that are in pending state due to scheduling issues.
They are ignored while assessing cluster stability. Failed/pending
pods identified using this value are not considered part of
the cluster. This is particularly useful when there are failed/pending
pods that cannot be recovered by updating the CR and the operator
needs to perform certain operations on the cluster like Aerospike
config change. Reset this value to 0 after the deployment is
done, to avoid unintended consequences.
description: MaxIgnorablePods is the maximum number/percentage
of pending/failed pods in a rack that are ignored while assessing
cluster stability. Pods identified using this value are not
considered part of the cluster. Additionally, in SC mode clusters,
these pods are removed from the roster. This is particularly
useful when some pods are stuck in pending/failed state due
to any scheduling issues and cannot be fixed by simply updating
the CR. It enables the operator to perform specific operations
on the cluster, like changing Aerospike configurations, without
being hindered by these problematic pods. Remember to set MaxIgnorablePods
back to 0 once the required operation is done. This makes sure
that later on, all pods are properly counted when evaluating
the cluster stability.
x-kubernetes-int-or-string: true
namespaces:
description: List of Aerospike namespaces for which rack feature
Expand Down Expand Up @@ -13344,19 +13348,23 @@ spec:
given configuration
nullable: true
properties:
maxIgnorableFailedPods:
maxIgnorablePods:
anyOf:
- type: integer
- type: string
description: MaxIgnorableFailedPods is the maximum percentage/number
of rack pods that are in pending state due to scheduling issues.
They are ignored while assessing cluster stability. Failed/pending
pods identified using this value are not considered part of
the cluster. This is particularly useful when there are failed/pending
pods that cannot be recovered by updating the CR and the operator
needs to perform certain operations on the cluster like Aerospike
config change. Reset this value to 0 after the deployment is
done, to avoid unintended consequences.
description: MaxIgnorablePods is the maximum number/percentage
of pending/failed pods in a rack that are ignored while assessing
cluster stability. Pods identified using this value are not
considered part of the cluster. Additionally, in SC mode clusters,
these pods are removed from the roster. This is particularly
useful when some pods are stuck in pending/failed state due
to any scheduling issues and cannot be fixed by simply updating
the CR. It enables the operator to perform specific operations
on the cluster, like changing Aerospike configurations, without
being hindered by these problematic pods. Remember to set MaxIgnorablePods
back to 0 once the required operation is done. This makes sure
that later on, all pods are properly counted when evaluating
the cluster stability.
x-kubernetes-int-or-string: true
namespaces:
description: List of Aerospike namespaces for which rack feature
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,6 @@ spec:
the Aerospike cluster.
displayName: Aerospike Network Policy
path: aerospikeNetworkPolicy
- description: IgnorePodList is a list of pods that the operator will ignore
while assessing cluster stability. Pods specified in this list are not considered
part of the cluster. This is particularly useful when there are failed pods
and the operator needs to perform certain operations on the cluster. Note
that running pods included in this list will not be ignored.
displayName: Ignore Pod List
path: ignorePodList
- description: Aerospike server image
displayName: Server Image
path: image
Expand Down
2 changes: 1 addition & 1 deletion controllers/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,7 @@ func (r *SingleClusterReconciler) getIgnorablePods(racksToDelete []asdbv1.Rack,
rack := &configureRacks[idx]

failedAllowed, _ := intstr.GetScaledValueFromIntOrPercent(
r.aeroCluster.Spec.RackConfig.MaxIgnorableFailedPods, rack.Size, false,
r.aeroCluster.Spec.RackConfig.MaxIgnorablePods, rack.Size, false,
)

podList, err := r.getRackPodList(rack.Rack.ID)
Expand Down
28 changes: 15 additions & 13 deletions controllers/rack.go
Original file line number Diff line number Diff line change
Expand Up @@ -424,22 +424,24 @@ func (r *SingleClusterReconciler) upgradeOrRollingRestartRack(found *appsv1.Stat
return found, reconcileError(fmt.Errorf("failed to list pods: %v", err))
}

// Filter ignoredPods to update their dirtyVolumes in the status.
// IgnoredPods are skipped from upgrade/rolling restart, and as a result in case of device removal, dirtyVolumes
// are not updated in their pod status. This makes devices un-reusable as they cannot be cleaned up during init phase.
// So, explicitly add dirtyVolumes for ignoredPods, so that they can be cleaned in the init phase.
var ignoredPod []*corev1.Pod
if r.aeroCluster.Spec.RackConfig.MaxIgnorablePods != nil {
// Filter ignoredPods to update their dirtyVolumes in the status.
// IgnoredPods are skipped from upgrade/rolling restart, and as a result in case of device removal, dirtyVolumes
// are not updated in their pod status. This makes devices un-reusable as they cannot be cleaned up during init phase.
// So, explicitly add dirtyVolumes for ignoredPods, so that they can be cleaned in the init phase.
var ignoredPod []*corev1.Pod

for idx := range podList {
pod := podList[idx]
if ignorablePodNames.Has(pod.Name) {
ignoredPod = append(ignoredPod, pod)
for idx := range podList {
pod := podList[idx]
if ignorablePodNames.Has(pod.Name) {
ignoredPod = append(ignoredPod, pod)
}
}
}

if len(ignoredPod) > 0 {
if err := r.handleNSOrDeviceRemoval(rackState, ignoredPod); err != nil {
return found, reconcileError(err)
if len(ignoredPod) > 0 {
if err := r.handleNSOrDeviceRemoval(rackState, ignoredPod); err != nil {
return found, reconcileError(err)
}
}
}

Expand Down
48 changes: 26 additions & 22 deletions controllers/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -216,35 +216,39 @@ func (r *SingleClusterReconciler) Reconcile() (ctrl.Result, error) {
return reconcile.Result{}, err
}

podList, gErr := r.getClusterPodList()
if gErr != nil {
r.Log.Error(gErr, "Failed to get cluster pod list")
return reconcile.Result{}, gErr
}
// Try to recover pods only when MaxIgnorablePods is set
if r.aeroCluster.Spec.RackConfig.MaxIgnorablePods != nil {
podList, gErr := r.getClusterPodList()
if gErr != nil {
r.Log.Error(gErr, "Failed to get cluster pod list")
return reconcile.Result{}, gErr
}

r.Log.Info("Try to recover failed/pending pods if any")
r.Log.Info("Try to recover failed/pending pods if any")

var anyPodFailed bool
// Try to recover failed/pending pods by deleting them
for idx := range podList.Items {
if cErr := utils.CheckPodFailed(&podList.Items[idx]); cErr != nil {
anyPodFailed = true
var anyPodFailed bool
// Try to recover failed/pending pods by deleting them
for idx := range podList.Items {
if cErr := utils.CheckPodFailed(&podList.Items[idx]); cErr != nil {
anyPodFailed = true

if err := r.createOrUpdatePodServiceIfNeeded([]string{podList.Items[idx].Name}); err != nil {
return reconcile.Result{}, err
}
if err := r.createOrUpdatePodServiceIfNeeded([]string{podList.Items[idx].Name}); err != nil {
return reconcile.Result{}, err
}

if err := r.Client.Delete(context.TODO(), &podList.Items[idx]); err != nil {
r.Log.Error(err, "Failed to delete pod", "pod", podList.Items[idx].Name)
return reconcile.Result{}, err
}
if err := r.Client.Delete(context.TODO(), &podList.Items[idx]); err != nil {
r.Log.Error(err, "Failed to delete pod", "pod", podList.Items[idx].Name)
return reconcile.Result{}, err
}

r.Log.Info("Deleted pod", "pod", podList.Items[idx].Name)
r.Log.Info("Deleted pod", "pod", podList.Items[idx].Name)
}
}
}

if anyPodFailed {
return reconcile.Result{Requeue: true}, nil
if anyPodFailed {
r.Log.Info("Found failed/pending pod(s), requeuing")
return reconcile.Result{Requeue: true}, nil
}
}

r.Log.Info("Reconcile completed successfully")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4593,19 +4593,23 @@ spec:
Aerospike cluster. Pods will be deployed in given racks based on
given configuration
properties:
maxIgnorableFailedPods:
maxIgnorablePods:
anyOf:
- type: integer
- type: string
description: MaxIgnorableFailedPods is the maximum percentage/number
of rack pods that are in pending state due to scheduling issues.
They are ignored while assessing cluster stability. Failed/pending
pods identified using this value are not considered part of
the cluster. This is particularly useful when there are failed/pending
pods that cannot be recovered by updating the CR and the operator
needs to perform certain operations on the cluster like Aerospike
config change. Reset this value to 0 after the deployment is
done, to avoid unintended consequences.
description: MaxIgnorablePods is the maximum number/percentage
of pending/failed pods in a rack that are ignored while assessing
cluster stability. Pods identified using this value are not
considered part of the cluster. Additionally, in SC mode clusters,
these pods are removed from the roster. This is particularly
useful when some pods are stuck in pending/failed state due
to any scheduling issues and cannot be fixed by simply updating
the CR. It enables the operator to perform specific operations
on the cluster, like changing Aerospike configurations, without
being hindered by these problematic pods. Remember to set MaxIgnorablePods
back to 0 once the required operation is done. This makes sure
that later on, all pods are properly counted when evaluating
the cluster stability.
x-kubernetes-int-or-string: true
namespaces:
description: List of Aerospike namespaces for which rack feature
Expand Down Expand Up @@ -13344,19 +13348,23 @@ spec:
given configuration
nullable: true
properties:
maxIgnorableFailedPods:
maxIgnorablePods:
anyOf:
- type: integer
- type: string
description: MaxIgnorableFailedPods is the maximum percentage/number
of rack pods that are in pending state due to scheduling issues.
They are ignored while assessing cluster stability. Failed/pending
pods identified using this value are not considered part of
the cluster. This is particularly useful when there are failed/pending
pods that cannot be recovered by updating the CR and the operator
needs to perform certain operations on the cluster like Aerospike
config change. Reset this value to 0 after the deployment is
done, to avoid unintended consequences.
description: MaxIgnorablePods is the maximum number/percentage
of pending/failed pods in a rack that are ignored while assessing
cluster stability. Pods identified using this value are not
considered part of the cluster. Additionally, in SC mode clusters,
these pods are removed from the roster. This is particularly
useful when some pods are stuck in pending/failed state due
to any scheduling issues and cannot be fixed by simply updating
the CR. It enables the operator to perform specific operations
on the cluster, like changing Aerospike configurations, without
being hindered by these problematic pods. Remember to set MaxIgnorablePods
back to 0 once the required operation is done. This makes sure
that later on, all pods are properly counted when evaluating
the cluster stability.
x-kubernetes-int-or-string: true
namespaces:
description: List of Aerospike namespaces for which rack feature
Expand Down
34 changes: 34 additions & 0 deletions test/aero_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,40 @@ func getNodeList(ctx goctx.Context, k8sClient client.Client) (
return nodeList, nil
}

// cordonNodes marks every node in nodes as unschedulable.
//
// Each node is re-fetched through k8sClient right before it is modified, so
// the update is issued against the copy just read; the refreshed object is
// stored back into the nodes slice. The first Get or Update error stops the
// loop and is returned unchanged, leaving the remaining nodes untouched.
func cordonNodes(ctx goctx.Context, k8sClient client.Client, nodes []corev1.Node) error {
	for i := range nodes {
		node := &nodes[i]
		key := types.NamespacedName{Name: node.Name}

		// Refresh the node first to reduce the chance of an update conflict.
		err := k8sClient.Get(ctx, key, node)
		if err != nil {
			return err
		}

		node.Spec.Unschedulable = true

		err = k8sClient.Update(ctx, node)
		if err != nil {
			return err
		}
	}

	return nil
}

// uncordonNodes marks every node in nodes as schedulable again.
//
// Each node is re-fetched through k8sClient right before it is modified, so
// the update is issued against the copy just read; the refreshed object is
// stored back into the nodes slice. The first Get or Update error stops the
// loop and is returned unchanged, leaving the remaining nodes untouched.
func uncordonNodes(ctx goctx.Context, k8sClient client.Client, nodes []corev1.Node) error {
	for i := range nodes {
		node := &nodes[i]
		key := types.NamespacedName{Name: node.Name}

		// Refresh the node first to reduce the chance of an update conflict.
		err := k8sClient.Get(ctx, key, node)
		if err != nil {
			return err
		}

		node.Spec.Unschedulable = false

		err = k8sClient.Update(ctx, node)
		if err != nil {
			return err
		}
	}

	return nil
}

func getZones(ctx goctx.Context, k8sClient client.Client) ([]string, error) {
unqZones := map[string]int{}

Expand Down
22 changes: 22 additions & 0 deletions test/cluster_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
goctx "context"
"errors"
"fmt"
"reflect"
"strconv"
"time"

Expand Down Expand Up @@ -541,6 +542,27 @@ func validateMigrateFillDelay(
return err
}

// validateDirtyVolumes checks that every pod listed in the cluster status
// reports exactly the expected dirty volumes.
//
// The cluster is looked up via getCluster using clusterNamespacedName. For
// each entry in Status.Pods the DirtyVolumes list is compared against
// expectedVolumes; the comparison is order-sensitive. Two empty lists are
// always considered equal, regardless of whether either one is nil.
//
// It returns the lookup error if the cluster cannot be fetched, a mismatch
// error for the first pod whose list differs, and nil otherwise.
func validateDirtyVolumes(
	ctx goctx.Context, k8sClient client.Client,
	clusterNamespacedName types.NamespacedName, expectedVolumes []string,
) error {
	aeroCluster, err := getCluster(k8sClient, ctx, clusterNamespacedName)
	if err != nil {
		return err
	}

	for podName := range aeroCluster.Status.Pods {
		dirtyVolumes := aeroCluster.Status.Pods[podName].DirtyVolumes

		// reflect.DeepEqual reports a nil slice and an empty non-nil slice as
		// different, so "no dirty volumes" must be matched explicitly to avoid
		// a spurious mismatch between nil and []string{}.
		if len(dirtyVolumes) == 0 && len(expectedVolumes) == 0 {
			continue
		}

		if !reflect.DeepEqual(dirtyVolumes, expectedVolumes) {
			return fmt.Errorf(
				"dirtyVolumes mismatch, expected: %v, found %v", expectedVolumes,
				dirtyVolumes,
			)
		}
	}

	return nil
}

func upgradeClusterTest(
k8sClient client.Client, ctx goctx.Context,
clusterNamespacedName types.NamespacedName, image string,
Expand Down
Loading

0 comments on commit 2989ad5

Please sign in to comment.