diff --git a/api/v1/aerospikecluster_types.go b/api/v1/aerospikecluster_types.go index 5fa0f1356..e697515e3 100644 --- a/api/v1/aerospikecluster_types.go +++ b/api/v1/aerospikecluster_types.go @@ -47,6 +47,15 @@ const ( AerospikeClusterError AerospikeClusterPhase = "Error" ) +// +kubebuilder:validation:Enum=Failed;PartiallyFailed;"" +type DynamicConfigUpdateStatus string + +const ( + Failed DynamicConfigUpdateStatus = "Failed" + PartiallyFailed DynamicConfigUpdateStatus = "PartiallyFailed" + Empty DynamicConfigUpdateStatus = "" +) + // NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized. // AerospikeClusterSpec defines the desired state of AerospikeCluster @@ -77,6 +86,11 @@ type AerospikeClusterSpec struct { //nolint:govet // for readability // +operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Aerospike Server Configuration" // +kubebuilder:pruning:PreserveUnknownFields AerospikeConfig *AerospikeConfigSpec `json:"aerospikeConfig"` + // EnableDynamicUpdate enables dynamic config update flow of the operator. + // If enabled, operator will try to update the Aerospike config dynamically. + // In case of inconsistent state during dynamic config update, operator falls back to rolling restart. + // +operator-sdk:csv:customresourcedefinitions:type=spec,displayName="EnableDynamicUpdate" + EnableDynamicUpdate *bool `json:"enableDynamicUpdate,omitempty"` // ValidationPolicy controls validation of the Aerospike cluster resource. // +operator-sdk:csv:customresourcedefinitions:type=spec,displayName="Validation Policy" ValidationPolicy *ValidationPolicySpec `json:"validationPolicy,omitempty"` @@ -872,8 +886,9 @@ type AerospikePodStatus struct { //nolint:govet // for readability // PodSpecHash is ripemd160 hash of PodSpec used by this pod PodSpecHash string `json:"podSpecHash"` - // DynamicConfigFailed is true if aerospike config change failed to apply dynamically. 
- DynamicConfigFailed bool `json:"dynamicConfigFailed,omitempty"` + // DynamicConfigUpdateStatus is the status of dynamic config update operation. + // Empty "" status means successful update. + DynamicConfigUpdateStatus DynamicConfigUpdateStatus `json:"dynamicConfigUpdateStatus,omitempty"` // IsSecurityEnabled is true if security is enabled in the pod IsSecurityEnabled bool `json:"isSecurityEnabled"` diff --git a/api/v1/utils.go b/api/v1/utils.go index f4062b48f..9050862d1 100644 --- a/api/v1/utils.go +++ b/api/v1/utils.go @@ -62,8 +62,8 @@ const ( AerospikeInitContainerName = "aerospike-init" AerospikeInitContainerRegistryEnvVar = "AEROSPIKE_KUBERNETES_INIT_REGISTRY" AerospikeInitContainerDefaultRegistry = "docker.io" - AerospikeInitContainerDefaultRegistryNamespace = "aerospike" - AerospikeInitContainerDefaultRepoAndTag = "aerospike-kubernetes-init:2.2.0-dev2" + AerospikeInitContainerDefaultRegistryNamespace = "abhishekdwivedi3060" + AerospikeInitContainerDefaultRepoAndTag = "aerospike-kubernetes-init:2.2.0-dev3" AerospikeAppLabel = "app" AerospikeCustomResourceLabel = "aerospike.com/cr" AerospikeRackIDLabel = "aerospike.com/rack-id" diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 3bd576ec1..f4a5f97a6 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -169,6 +169,11 @@ func (in *AerospikeClusterSpec) DeepCopyInto(out *AerospikeClusterSpec) { in, out := &in.AerospikeConfig, &out.AerospikeConfig *out = (*in).DeepCopy() } + if in.EnableDynamicUpdate != nil { + in, out := &in.EnableDynamicUpdate, &out.EnableDynamicUpdate + *out = new(bool) + **out = **in + } if in.ValidationPolicy != nil { in, out := &in.ValidationPolicy, &out.ValidationPolicy *out = new(ValidationPolicySpec) diff --git a/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml b/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml index c977eb9b7..b25607fff 100644 --- 
a/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml +++ b/config/crd/bases/asdb.aerospike.com_aerospikeclusters.yaml @@ -281,6 +281,12 @@ spec: - customInterface type: string type: object + enableDynamicUpdate: + description: EnableDynamicUpdate enables dynamic config update flow + of the operator. If enabled, operator will try to update the Aerospike + config dynamically. In case of inconsistent state during dynamic + config update, operator falls back to rolling restart. + type: boolean image: description: Aerospike server image type: string @@ -14238,10 +14244,15 @@ spec: items: type: string type: array - dynamicConfigFailed: - description: DynamicConfigFailed is true if aerospike config - change failed to apply dynamically. - type: boolean + dynamicConfigUpdateStatus: + description: DynamicConfigUpdateStatus is the status of dynamic + config update operation. Empty "" status means successful + update. + enum: + - Failed + - PartiallyFailed + - "" + type: string hostExternalIP: description: HostExternalIP of the K8s host this pod is scheduled on. diff --git a/config/manifests/bases/aerospike-kubernetes-operator.clusterserviceversion.yaml b/config/manifests/bases/aerospike-kubernetes-operator.clusterserviceversion.yaml index 64dcdffae..cd2b711bb 100644 --- a/config/manifests/bases/aerospike-kubernetes-operator.clusterserviceversion.yaml +++ b/config/manifests/bases/aerospike-kubernetes-operator.clusterserviceversion.yaml @@ -47,6 +47,12 @@ spec: the Aerospike cluster. displayName: Aerospike Network Policy path: aerospikeNetworkPolicy + - description: EnableDynamicUpdate enables dynamic config update flow of the + operator. If enabled, operator will try to update the Aerospike config dynamically. + In case of inconsistent state during dynamic config update, operator falls + back to rolling restart. 
+ displayName: EnableDynamicUpdate + path: enableDynamicUpdate - description: Aerospike server image displayName: Server Image path: image diff --git a/controllers/aero_info_calls.go b/controllers/aero_info_calls.go index df016e31a..95cd526c6 100644 --- a/controllers/aero_info_calls.go +++ b/controllers/aero_info_calls.go @@ -348,14 +348,29 @@ func (r *SingleClusterReconciler) setDynamicConfig( r.Log.Info("Generated dynamic config commands", "commands", fmt.Sprintf("%v", asConfCmds), "pod", podName) - if err := deployment.SetConfigCommandsOnHosts(r.Log, r.getClientPolicy(), allHostConns, + if cmdStatus, err := deployment.SetConfigCommandsOnHosts(r.Log, r.getClientPolicy(), allHostConns, []*deployment.HostConn{host}, asConfCmds); err != nil { + errorStatus := asdbv1.Failed + + // Calculate the number of passed commands and based on that set Failed or PartiallyFailed status. + var passedCounter int + + for _, passed := range cmdStatus { + if passed { + passedCounter++ + } + } + + if passedCounter != 0 { + errorStatus = asdbv1.PartiallyFailed + } + var patches []jsonpatch.PatchOperation patch := jsonpatch.PatchOperation{ Operation: "replace", - Path: "/status/pods/" + podName + "/dynamicConfigFailed", - Value: true, + Path: "/status/pods/" + podName + "/dynamicConfigUpdateStatus", + Value: errorStatus, } patches = append(patches, patch) diff --git a/controllers/pod.go b/controllers/pod.go index c2e0f9e33..d6b60983a 100644 --- a/controllers/pod.go +++ b/controllers/pod.go @@ -119,8 +119,11 @@ func (r *SingleClusterReconciler) getRollingRestartTypeMap(rackState *RackState, // If version >= 6.0.0, then we can update config dynamically. if v >= 0 { - // If dynamic commands have failed in previous retry, then we should not try to update config dynamically. - if !podStatus.DynamicConfigFailed { + // If EnableDynamicUpdate is set and dynamic config command exec partially failed in previous try + // then skip dynamic config update and fall back to rolling restart. 
+ // A Failed status (no command passed) is still retried with a dynamic config update. + if asdbv1.GetBool(r.aeroCluster.Spec.EnableDynamicUpdate) && + podStatus.DynamicConfigUpdateStatus != asdbv1.PartiallyFailed { // Fetching all dynamic config change. dynamicConfDiffPerPod[pods[idx].Name], err = r.handleDynamicConfigChange(rackState, pods[idx], version) if err != nil { @@ -135,7 +138,8 @@ func (r *SingleClusterReconciler) getRollingRestartTypeMap(rackState *RackState, restartTypeMap[pods[idx].Name] = r.getRollingRestartTypePod(rackState, pods[idx], confMap, addedNSDevices, len(dynamicConfDiffPerPod[pods[idx].Name]) > 0) - if podStatus.DynamicConfigFailed { + // Fall back to a rolling restart after a partial failure so the pod recovers with the desired Aerospike config + if podStatus.DynamicConfigUpdateStatus == asdbv1.PartiallyFailed { restartTypeMap[pods[idx].Name] = mergeRestartType(restartTypeMap[pods[idx].Name], quickRestart) } } diff --git a/controllers/rack.go b/controllers/rack.go index c442a9c3b..babde9dbf 100644 --- a/controllers/rack.go +++ b/controllers/rack.go @@ -122,6 +122,8 @@ func (r *SingleClusterReconciler) reconcileRacks() reconcileResult { } r.Log.Info("Restarted the failed pods in the Rack", "rackID", state.Rack.ID, "failedPods", failedPods) + // Requeue after 1 second to fetch latest CR object with updated pod status + return reconcileRequeueAfter(1) } } diff --git a/go.mod b/go.mod index df7b4e00c..2f203cc30 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.21 toolchain go1.21.8 require ( - github.com/aerospike/aerospike-management-lib v1.3.1-0.20240404063536-2adfbedf9687 + github.com/aerospike/aerospike-management-lib v1.3.1-0.20240412042741-c7d631bbfa43 github.com/asaskevich/govalidator v0.0.0-20210307081110-f21760c49a8d github.com/evanphx/json-patch v4.12.0+incompatible github.com/go-logr/logr v1.3.0 diff --git a/go.sum b/go.sum index 947f1be32..15fbf217a 100644 --- a/go.sum +++ b/go.sum @@ -4,6 +4,8 @@ 
github.com/aerospike/aerospike-management-lib v1.2.1-0.20240325134810-f8046fe987 github.com/aerospike/aerospike-management-lib v1.2.1-0.20240325134810-f8046fe9872e/go.mod h1:E4dk798IikCp9a8fugpYoeQVIXuvdxogHvt6sKhaORQ= github.com/aerospike/aerospike-management-lib v1.3.1-0.20240404063536-2adfbedf9687 h1:d7oDvHmiKhq4rzcD/w3z9tP3wH0+iaDvxKDk3IYuqeU= github.com/aerospike/aerospike-management-lib v1.3.1-0.20240404063536-2adfbedf9687/go.mod h1:E4dk798IikCp9a8fugpYoeQVIXuvdxogHvt6sKhaORQ= +github.com/aerospike/aerospike-management-lib v1.3.1-0.20240412042741-c7d631bbfa43 h1:7xdFCD3e1rdy5GiznyTrxinShah6cNJAMZZAphGEPZs= +github.com/aerospike/aerospike-management-lib v1.3.1-0.20240412042741-c7d631bbfa43/go.mod h1:E4dk798IikCp9a8fugpYoeQVIXuvdxogHvt6sKhaORQ= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/asaskevich/govalidator v0.0.0-20210307081110-f21760c49a8d h1:Byv0BzEl3/e6D5CLfI0j/7hiIEtvGVFPCZ7Ei2oq8iQ= diff --git a/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml b/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml index c977eb9b7..b25607fff 100644 --- a/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml +++ b/helm-charts/aerospike-kubernetes-operator/crds/customresourcedefinition_aerospikeclusters.asdb.aerospike.com.yaml @@ -281,6 +281,12 @@ spec: - customInterface type: string type: object + enableDynamicUpdate: + description: EnableDynamicUpdate enables dynamic config update flow + of the operator. If enabled, operator will try to update the Aerospike + config dynamically. In case of inconsistent state during dynamic + config update, operator falls back to rolling restart. 
+ type: boolean image: description: Aerospike server image type: string @@ -14238,10 +14244,15 @@ spec: items: type: string type: array - dynamicConfigFailed: - description: DynamicConfigFailed is true if aerospike config - change failed to apply dynamically. - type: boolean + dynamicConfigUpdateStatus: + description: DynamicConfigUpdateStatus is the status of dynamic + config update operation. Empty "" status means successful + update. + enum: + - Failed + - PartiallyFailed + - "" + type: string hostExternalIP: description: HostExternalIP of the K8s host this pod is scheduled on.