From 6272eaaec6bfdfe879fcf5d5a56672f576ac43e7 Mon Sep 17 00:00:00 2001
From: Harrison Affel
Date: Tue, 22 Oct 2024 12:34:45 -0400
Subject: [PATCH] Do not update existing windows plans with new max-failures values

---
 pkg/capr/planner/etcdcreate.go  |  2 +-
 pkg/capr/planner/etcdrestore.go |  2 +-
 pkg/capr/planner/planner.go     | 18 +++++++++---------
 pkg/capr/planner/store.go       | 14 ++++++++++----
 4 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/pkg/capr/planner/etcdcreate.go b/pkg/capr/planner/etcdcreate.go
index 05d94dc0288..57215f4b084 100644
--- a/pkg/capr/planner/etcdcreate.go
+++ b/pkg/capr/planner/etcdcreate.go
@@ -67,7 +67,7 @@ func (p *Planner) runEtcdSnapshotManagementServiceStart(controlPlane *rkev1.RKEC
     // Generate and deliver desired plan for the bootstrap/init node first.
     if err := p.reconcile(controlPlane, tokensSecret, clusterPlan, true, bootstrapTier, isEtcd, isNotInitNodeOrIsDeleting,
         "1", "", controlPlane.Spec.UpgradeStrategy.ControlPlaneDrainOptions,
-        -1, 1, false); err != nil {
+        -1, 1, false, true); err != nil {
         return err
     }

diff --git a/pkg/capr/planner/etcdrestore.go b/pkg/capr/planner/etcdrestore.go
index 2a0d98aa45c..77abe753db0 100644
--- a/pkg/capr/planner/etcdrestore.go
+++ b/pkg/capr/planner/etcdrestore.go
@@ -672,7 +672,7 @@ func (p *Planner) runEtcdRestoreServiceStop(controlPlane *rkev1.RKEControlPlane,
         stopPlan.Instructions = append(stopPlan.Instructions, generateRemoveTLSAndCredDirInstructions(controlPlane)...)
     }
     if !equality.Semantic.DeepEqual(server.Plan.Plan, stopPlan) {
-        if err := p.store.UpdatePlan(server, stopPlan, joinedServer, 0, 0); err != nil {
+        if err := p.store.UpdatePlan(server, stopPlan, joinedServer, 0, 0, true); err != nil {
             return err
         }
         updated = true
diff --git a/pkg/capr/planner/planner.go b/pkg/capr/planner/planner.go
index 970bfdbbc3e..ff66f663e2c 100644
--- a/pkg/capr/planner/planner.go
+++ b/pkg/capr/planner/planner.go
@@ -363,7 +363,7 @@ func (p *Planner) fullReconcile(cp *rkev1.RKEControlPlane, status rkev1.RKEContr
     // select all etcd and then filter to just initNodes so that unavailable count is correct
     err = p.reconcile(cp, clusterSecretTokens, plan, true, bootstrapTier, isEtcd,
         isNotInitNodeOrIsDeleting, "1", "", controlPlaneDrainOptions, -1, 1,
-        false)
+        false, true)
     capr.Bootstrapped.True(&status)
     firstIgnoreError, err = ignoreErrors(firstIgnoreError, err)
     if err != nil {
@@ -384,7 +384,7 @@ func (p *Planner) fullReconcile(cp *rkev1.RKEControlPlane, status rkev1.RKEContr
     // Process all nodes that have the etcd role and are NOT an init node or deleting. Only process 1 node at a time.
     err = p.reconcile(cp, clusterSecretTokens, plan, true, etcdTier, isEtcd,
         isInitNodeOrDeleting, "1", joinServer, controlPlaneDrainOptions,
-        -1, 1, false)
+        -1, 1, false, true)
     firstIgnoreError, err = ignoreErrors(firstIgnoreError, err)
     if err != nil {
         return status, err
@@ -393,7 +393,7 @@ func (p *Planner) fullReconcile(cp *rkev1.RKEControlPlane, status rkev1.RKEContr
     // Process all nodes that have the controlplane role and are NOT an init node or deleting.
     err = p.reconcile(cp, clusterSecretTokens, plan, true, controlPlaneTier, isControlPlane,
         isInitNodeOrDeleting, controlPlaneConcurrency, joinServer, controlPlaneDrainOptions, -1, 1,
-        false)
+        false, true)
     firstIgnoreError, err = ignoreErrors(firstIgnoreError, err)
     if err != nil {
         return status, err
@@ -413,7 +413,7 @@ func (p *Planner) fullReconcile(cp *rkev1.RKEControlPlane, status rkev1.RKEContr
     // Process all nodes that are ONLY linux worker nodes.
     err = p.reconcile(cp, clusterSecretTokens, plan, false, workerTier, isOnlyLinuxWorker,
         isInitNodeOrDeleting, workerConcurrency, "", workerDrainOptions, -1, 1,
-        false)
+        false, true)
     firstIgnoreError, err = ignoreErrors(firstIgnoreError, err)
     if err != nil {
         return status, err
@@ -438,7 +438,7 @@ func (p *Planner) fullReconcile(cp *rkev1.RKEControlPlane, status rkev1.RKEContr
     err = p.reconcile(cp, clusterSecretTokens, plan, false, workerTier, isOnlyWindowsWorker,
         isInitNodeOrDeleting, workerConcurrency, "", workerDrainOptions, windowsMaxFailures,
-        windowsMaxFailureThreshold, resetFailureCountOnRestart)
+        windowsMaxFailureThreshold, resetFailureCountOnRestart, false)
     firstIgnoreError, err = ignoreErrors(firstIgnoreError, err)
     if err != nil {
         return status, err
@@ -857,7 +857,7 @@ type reconcilable struct {

 func (p *Planner) reconcile(controlPlane *rkev1.RKEControlPlane, tokensSecret plan.Secret, clusterPlan *plan.Plan,
     required bool, tierName string, include, exclude roleFilter, maxUnavailable, forcedJoinURL string, drainOptions rkev1.DrainOptions,
-    maxFailures, failureThreshold int, resetFailureCountOnSystemAgentRestart bool) error {
+    maxFailures, failureThreshold int, resetFailureCountOnSystemAgentRestart, overwriteFailureValues bool) error {
     var (
         ready, outOfSync, nonReady, errMachines, draining, uncordoned []string
         messages = map[string][]string{}
@@ -929,14 +929,14 @@ func (p *Planner) reconcile(controlPlane *rkev1.RKEControlPlane, tokensSecret pl
             logrus.Debugf("[planner] rkecluster %s/%s reconcile tier %s - setting initial plan for machine %s/%s", controlPlane.Namespace, controlPlane.Name, tierName, r.entry.Machine.Namespace, r.entry.Machine.Name)
             logrus.Tracef("[planner] rkecluster %s/%s reconcile tier %s - initial plan for machine %s/%s new: %+v", controlPlane.Namespace, controlPlane.Name, tierName, r.entry.Machine.Namespace, r.entry.Machine.Name, r.desiredPlan)
             outOfSync = append(outOfSync, r.entry.Machine.Name)
-            if err := p.store.UpdatePlan(r.entry, r.desiredPlan, r.joinedURL, maxFailures, failureThreshold); err != nil {
+            if err := p.store.UpdatePlan(r.entry, r.desiredPlan, r.joinedURL, maxFailures, failureThreshold, overwriteFailureValues); err != nil {
                 return err
             }
         } else if r.minorChange {
             logrus.Debugf("[planner] rkecluster %s/%s reconcile tier %s - minor plan change detected for machine %s/%s, updating plan immediately", controlPlane.Namespace, controlPlane.Name, tierName, r.entry.Machine.Namespace, r.entry.Machine.Name)
             logrus.Tracef("[planner] rkecluster %s/%s reconcile tier %s - minor plan change for machine %s/%s old: %+v, new: %+v", controlPlane.Namespace, controlPlane.Name, tierName, r.entry.Machine.Namespace, r.entry.Machine.Name, r.entry.Plan.Plan, r.desiredPlan)
             outOfSync = append(outOfSync, r.entry.Machine.Name)
-            if err := p.store.UpdatePlan(r.entry, r.desiredPlan, r.joinedURL, maxFailures, failureThreshold); err != nil {
+            if err := p.store.UpdatePlan(r.entry, r.desiredPlan, r.joinedURL, maxFailures, failureThreshold, overwriteFailureValues); err != nil {
                 return err
             }
         } else if r.change {
@@ -960,7 +960,7 @@ func (p *Planner) reconcile(controlPlane *rkev1.RKEControlPlane, tokensSecret pl
             // Drain is done (or didn't need to be done) and there are no errors, so the plan should be updated to enact the reason the node was drained.
             logrus.Debugf("[planner] rkecluster %s/%s reconcile tier %s - major plan change for machine %s/%s", controlPlane.Namespace, controlPlane.Name, tierName, r.entry.Machine.Namespace, r.entry.Machine.Name)
             logrus.Tracef("[planner] rkecluster %s/%s reconcile tier %s - major plan change for machine %s/%s old: %+v, new: %+v", controlPlane.Namespace, controlPlane.Name, tierName, r.entry.Machine.Namespace, r.entry.Machine.Name, r.entry.Plan.Plan, r.desiredPlan)
-            if err = p.store.UpdatePlan(r.entry, r.desiredPlan, r.joinedURL, maxFailures, failureThreshold); err != nil {
+            if err = p.store.UpdatePlan(r.entry, r.desiredPlan, r.joinedURL, maxFailures, failureThreshold, overwriteFailureValues); err != nil {
                 return err
             } else if r.entry.Metadata.Annotations[capr.DrainDoneAnnotation] != "" {
                 messages[r.entry.Machine.Name] = append(messages[r.entry.Machine.Name], "drain completed")
diff --git a/pkg/capr/planner/store.go b/pkg/capr/planner/store.go
index 4bcc6980a82..add2c92952b 100644
--- a/pkg/capr/planner/store.go
+++ b/pkg/capr/planner/store.go
@@ -357,7 +357,7 @@ func (p *PlanStore) getPlanSecretFromMachine(machine *capi.Machine) (*corev1.Sec

 // UpdatePlan should not be called directly as it will not block further progress if the plan is not in sync
 // maxFailures is the number of attempts the system-agent will make to run the plan (in a failed state). failureThreshold is used to determine when the plan has failed.
-func (p *PlanStore) UpdatePlan(entry *planEntry, newNodePlan plan.NodePlan, joinedTo string, maxFailures, failureThreshold int) error {
+func (p *PlanStore) UpdatePlan(entry *planEntry, newNodePlan plan.NodePlan, joinedTo string, maxFailures, failureThreshold int, overwriteFailureKeys bool) error {
     if maxFailures < failureThreshold && failureThreshold != -1 && maxFailures != -1 {
         return fmt.Errorf("failureThreshold (%d) cannot be greater than maxFailures (%d)", failureThreshold, maxFailures)
     }
@@ -400,15 +400,21 @@ func (p *PlanStore) UpdatePlan(entry *planEntry, newNodePlan plan.NodePlan, join
     // If the plan is being updated, then delete the probe-statuses so their healthy status will be reported as healthy only when they pass.
     delete(secret.Data, "probe-statuses")

+    _, maxFailuresHasBeenSet := secret.Data["max-failures"]
     secret.Data["plan"] = data
     if maxFailures > 0 || maxFailures == -1 {
-        secret.Data["max-failures"] = []byte(strconv.Itoa(maxFailures))
+        if !maxFailuresHasBeenSet || maxFailuresHasBeenSet && overwriteFailureKeys {
+            secret.Data["max-failures"] = []byte(strconv.Itoa(maxFailures))
+        }
     } else {
         delete(secret.Data, "max-failures")
     }

+    _, failureThresholdHasBeenSet := secret.Data["failure-threshold"]
     if failureThreshold > 0 || failureThreshold == -1 {
-        secret.Data["failure-threshold"] = []byte(strconv.Itoa(failureThreshold))
+        if !failureThresholdHasBeenSet || failureThresholdHasBeenSet && overwriteFailureKeys {
+            secret.Data["failure-threshold"] = []byte(strconv.Itoa(failureThreshold))
+        }
     } else {
         delete(secret.Data, "failure-threshold")
     }
@@ -477,7 +483,7 @@ func (p *PlanStore) removePlanSecretLabel(entry *planEntry, key string) error {
 // assignAndCheckPlan assigns the given newPlan to the designated server in the planEntry, and will return nil if the plan is assigned and in sync.
 func assignAndCheckPlan(store *PlanStore, msg string, entry *planEntry, newPlan plan.NodePlan, joinedTo string, failureThreshold, maxRetries int) error {
     if entry.Plan == nil || !equality.Semantic.DeepEqual(entry.Plan.Plan, newPlan) {
-        if err := store.UpdatePlan(entry, newPlan, joinedTo, failureThreshold, maxRetries); err != nil {
+        if err := store.UpdatePlan(entry, newPlan, joinedTo, failureThreshold, maxRetries, true); err != nil {
             return err
         }
         return errWaiting(fmt.Sprintf("starting %s", msg))
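
Note (not part of the patch): the standalone Go sketch below only illustrates the gating behavior UpdatePlan now applies to the plan secret's "max-failures" and "failure-threshold" keys, i.e. an existing value is kept unless the caller asks to overwrite it. The key names and the secret.Data map shape mirror the diff above; setFailureKey and main are simplified stand-ins introduced here purely for illustration.

package main

import (
	"fmt"
	"strconv"
)

// setFailureKey mimics the conditional write the patch adds in UpdatePlan:
// a positive or -1 value is written only if the key is unset or overwrite is
// requested; any other value removes the key.
func setFailureKey(data map[string][]byte, key string, value int, overwrite bool) {
	_, alreadySet := data[key]
	if value > 0 || value == -1 {
		if !alreadySet || overwrite {
			data[key] = []byte(strconv.Itoa(value))
		}
	} else {
		delete(data, key)
	}
}

func main() {
	data := map[string][]byte{"max-failures": []byte("5")}

	// Windows worker tier passes overwrite=false, so an existing value is preserved.
	setFailureKey(data, "max-failures", 3, false)
	fmt.Println(string(data["max-failures"])) // 5

	// All other callers pass overwrite=true and replace the stored value.
	setFailureKey(data, "max-failures", 3, true)
	fmt.Println(string(data["max-failures"])) // 3
}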