diff --git a/pkg/capr/common.go b/pkg/capr/common.go index 3ca8e79429e..39d83f62732 100644 --- a/pkg/capr/common.go +++ b/pkg/capr/common.go @@ -661,3 +661,54 @@ func ParseSnapshotClusterSpecOrError(snapshot *rkev1.ETCDSnapshot) (*provv1.Clus } return nil, fmt.Errorf("unable to find and decode snapshot ClusterSpec for snapshot") } + +// FormatWindowsEnvVar accepts a corev1.EnvVar and returns a string to be used in either +// a Powershell script or the Rancher planner, indicated by the isPlanVariable parameter. +// This function automatically configures the '$env:' prefix for a given environment variable, +// automatically prefixes boolean values with '$', and surrounds string variables with double quotes as +// needed. If provided variable name incorrectly uses either '$env:' or '$' with the given isPlanVariable +// value, it will be removed. +func FormatWindowsEnvVar(envVar corev1.EnvVar, isPlanVariable bool) string { + lowerValue := strings.ToLower(envVar.Value) + isBool := lowerValue == "$true" || lowerValue == "$false" || + lowerValue == "true" || lowerValue == "false" + + // remove any user provided prefixes and suffixes + if strings.HasPrefix(envVar.Name, "$env:") { + envVar.Name = strings.TrimPrefix(envVar.Name, "$env:") + } + + if strings.HasPrefix(envVar.Value, "\"") { + envVar.Value = strings.TrimPrefix(envVar.Value, "\"") + } + + if strings.HasSuffix(envVar.Value, "\"") { + envVar.Value = strings.TrimSuffix(envVar.Value, "\"") + } + + if !isBool { + format := "" + if isPlanVariable { + format = "%s=%s" + } else { + // None boolean variables are always treated as strings, + // even numbers + format = "$env:%s=\"%s\"" + } + return fmt.Sprintf(format, envVar.Name, envVar.Value) + } + + if !strings.HasPrefix(envVar.Value, "$") && !isPlanVariable { + envVar.Value = "$" + envVar.Value + } + + if strings.HasPrefix(envVar.Value, "$") && isPlanVariable { + envVar.Value = strings.TrimPrefix(envVar.Value, "$") + } + + if isPlanVariable { + return fmt.Sprintf("%s=%s", envVar.Name, envVar.Value) + } + + return fmt.Sprintf("$env:%s=%s", envVar.Name, envVar.Value) +} diff --git a/pkg/capr/common_test.go b/pkg/capr/common_test.go index 5f7e6696f37..3fbeba95f3e 100644 --- a/pkg/capr/common_test.go +++ b/pkg/capr/common_test.go @@ -1,6 +1,7 @@ package capr import ( + corev1 "k8s.io/api/core/v1" "reflect" "testing" @@ -313,3 +314,76 @@ func TestCompressInterface(t *testing.T) { }) } } + +func TestFormatWindowsEnvVar(t *testing.T) { + tests := []struct { + Name string + EnvVar corev1.EnvVar + IsPlanVar bool + ExpectedString string + }{ + { + Name: "Basic String", + EnvVar: corev1.EnvVar{ + Name: "BASIC_STRING", + Value: "ABC123", + }, + IsPlanVar: false, + ExpectedString: "$env:BASIC_STRING=\"ABC123\"", + }, + { + Name: "Basic Bool", + EnvVar: corev1.EnvVar{ + Name: "BASIC_BOOL", + Value: "true", + }, + IsPlanVar: false, + ExpectedString: "$env:BASIC_BOOL=$true", + }, + { + Name: "Basic Plan String", + EnvVar: corev1.EnvVar{ + Name: "PLAN_STRING", + Value: "VALUE", + }, + IsPlanVar: true, + ExpectedString: "PLAN_STRING=VALUE", + }, + { + Name: "Basic Plan Bool", + EnvVar: corev1.EnvVar{ + Name: "PLAN_BOOL", + Value: "true", + }, + IsPlanVar: true, + ExpectedString: "PLAN_BOOL=true", + }, + { + Name: "Plan name Mistakenly includes $env:", + EnvVar: corev1.EnvVar{ + Name: "$env:PLAN_BOOL", + Value: "true", + }, + IsPlanVar: true, + ExpectedString: "PLAN_BOOL=true", + }, + { + Name: "Plan value Mistakenly Includes $", + EnvVar: corev1.EnvVar{ + Name: "PLAN_BOOL", + Value: "$true", + }, + IsPlanVar: true, + ExpectedString: "PLAN_BOOL=true", + }, + } + + for _, tc := range tests { + t.Run(tc.Name, func(t *testing.T) { + out := FormatWindowsEnvVar(tc.EnvVar, tc.IsPlanVar) + if out != tc.ExpectedString { + t.Fatalf("Expected %s, got %s", tc.ExpectedString, out) + } + }) + } +} diff --git a/pkg/capr/installer/installer.go b/pkg/capr/installer/installer.go index 76e233c47de..5bd8099b871 100644 --- a/pkg/capr/installer/installer.go +++ b/pkg/capr/installer/installer.go @@ -8,6 +8,7 @@ import ( "os" "strings" + "github.com/rancher/rancher/pkg/capr" "github.com/rancher/rancher/pkg/settings" "github.com/rancher/rancher/pkg/systemtemplate" "github.com/rancher/rancher/pkg/tls" @@ -135,12 +136,25 @@ func WindowsInstallScript(ctx context.Context, token string, envVars []corev1.En binaryURL := "" if settings.WinsAgentVersion.Get() != "" { if settings.ServerURL.Get() != "" { - binaryURL = fmt.Sprintf("$env:CATTLE_AGENT_BINARY_BASE_URL=\"%s/assets\"", settings.ServerURL.Get()) + binaryURL = capr.FormatWindowsEnvVar(corev1.EnvVar{ + Name: "CATTLE_AGENT_BINARY_BASE_URL", + Value: fmt.Sprintf("%s/assets", settings.ServerURL.Get()), + }, false) } else if defaultHost != "" { - binaryURL = fmt.Sprintf("$env:CATTLE_AGENT_BINARY_BASE_URL=\"https://%s/assets\"", defaultHost) + binaryURL = capr.FormatWindowsEnvVar(corev1.EnvVar{ + Name: "CATTLE_AGENT_BINARY_BASE_URL", + Value: fmt.Sprintf("https://%s/assets", defaultHost), + }, false) } } + if v := settings.RancherWinsBinaryURLOverride.Get(); v != "" { + binaryURL = capr.FormatWindowsEnvVar(corev1.EnvVar{ + Name: "CATTLE_AGENT_BINARY_URL", + Value: v, + }, false) + } + csiProxyURL := settings.CSIProxyAgentURL.Get() csiProxyVersion := "v1.0.0" if settings.CSIProxyAgentVersion.Get() != "" { @@ -156,22 +170,38 @@ func WindowsInstallScript(ctx context.Context, token string, envVars []corev1.En if v, ok := ctx.Value(tls.InternalAPI).(bool); ok && v { ca = systemtemplate.InternalCAChecksum() } + if ca != "" { - ca = "$env:CATTLE_CA_CHECKSUM=\"" + ca + "\"" + ca = capr.FormatWindowsEnvVar(corev1.EnvVar{ + Name: "CATTLE_CA_CHECKSUM", + Value: ca, + }, false) } + + var tokenEnvVar string if token != "" { - token = "$env:CATTLE_ROLE_NONE=\"true\"\n$env:CATTLE_TOKEN=\"" + token + "\"" + tokenEnvVar = capr.FormatWindowsEnvVar(corev1.EnvVar{ + Name: "CATTLE_TOKEN", + Value: token, + }, false) } + envVarBuf := &strings.Builder{} for _, envVar := range envVars { if envVar.Value == "" { continue } - envVarBuf.WriteString(fmt.Sprintf("$env:%s=\"%s\"\n", envVar.Name, envVar.Value)) + envVarBuf.WriteString(capr.FormatWindowsEnvVar(corev1.EnvVar{ + Name: envVar.Name, + Value: envVar.Value, + }, false)) } server := "" if settings.ServerURL.Get() != "" { - server = fmt.Sprintf("$env:CATTLE_SERVER=\"%s\"", settings.ServerURL.Get()) + server = capr.FormatWindowsEnvVar(corev1.EnvVar{ + Name: "CATTLE_SERVER", + Value: settings.ServerURL.Get(), + }, false) } strictVerify := "false" @@ -195,5 +225,5 @@ $env:STRICT_VERIFY = "%s" Invoke-WinsInstaller @PSBoundParameters exit 0 -`, data, envVarBuf.String(), binaryURL, server, ca, token, csiProxyURL, csiProxyVersion, dataDir, strictVerify)), nil +`, data, envVarBuf.String(), binaryURL, server, ca, tokenEnvVar, csiProxyURL, csiProxyVersion, dataDir, strictVerify)), nil } diff --git a/pkg/capr/planner/etcdcreate.go b/pkg/capr/planner/etcdcreate.go index ff97471f79c..579899958e5 100644 --- a/pkg/capr/planner/etcdcreate.go +++ b/pkg/capr/planner/etcdcreate.go @@ -67,7 +67,7 @@ func (p *Planner) runEtcdSnapshotManagementServiceStart(controlPlane *rkev1.RKEC // Generate and deliver desired plan for the bootstrap/init node first. if err := p.reconcile(controlPlane, tokensSecret, clusterPlan, true, bootstrapTier, isEtcd, isNotInitNodeOrIsDeleting, "1", "", - controlPlane.Spec.UpgradeStrategy.ControlPlaneDrainOptions); err != nil { + controlPlane.Spec.UpgradeStrategy.ControlPlaneDrainOptions, -1, 1); err != nil { return err } diff --git a/pkg/capr/planner/instructions.go b/pkg/capr/planner/instructions.go index 1b2de9929bd..93088428116 100644 --- a/pkg/capr/planner/instructions.go +++ b/pkg/capr/planner/instructions.go @@ -2,6 +2,7 @@ package planner import ( "fmt" + corev1 "k8s.io/api/core/v1" "path" "strings" @@ -18,22 +19,30 @@ const ( // generateInstallInstruction generates the instruction necessary to install the desired tool. func (p *Planner) generateInstallInstruction(controlPlane *rkev1.RKEControlPlane, entry *planEntry, env []string) plan.OneTimeInstruction { var instruction plan.OneTimeInstruction - image := p.getInstallerImage(controlPlane) cattleOS := entry.Metadata.Labels[capr.CattleOSLabel] + image := p.getInstallerImage(controlPlane, cattleOS) for _, arg := range controlPlane.Spec.AgentEnvVars { if arg.Value == "" { continue } switch cattleOS { case capr.WindowsMachineOS: - env = append(env, fmt.Sprintf("$env:%s=\"%s\"", arg.Name, arg.Value)) + env = append(env, capr.FormatWindowsEnvVar(corev1.EnvVar{ + Name: arg.Name, + Value: arg.Value, + }, true)) default: env = append(env, fmt.Sprintf("%s=%s", arg.Name, arg.Value)) } } switch cattleOS { case capr.WindowsMachineOS: + // TODO: Should we add this is as a part of 2.10, or maybe do an additional RFC that covers this for windows nodes? env = append(env, fmt.Sprintf("$env:%s_DATA_DIR=\"c:%s\"", strings.ToUpper(capr.GetRuntime(controlPlane.Spec.KubernetesVersion)), capr.GetDistroDataDir(controlPlane))) + env = append(env, capr.FormatWindowsEnvVar(corev1.EnvVar{ + Name: "INSTALL_RKE2_VERSION", + Value: strings.ReplaceAll(controlPlane.Spec.KubernetesVersion, "+", "-"), + }, true)) default: env = append(env, fmt.Sprintf("%s_DATA_DIR=%s", strings.ToUpper(capr.GetRuntime(controlPlane.Spec.KubernetesVersion)), capr.GetDistroDataDir(controlPlane))) } @@ -60,7 +69,10 @@ func (p *Planner) generateInstallInstruction(controlPlane *rkev1.RKEControlPlane if isOnlyWorker(entry) { switch cattleOS { case capr.WindowsMachineOS: - instruction.Env = append(instruction.Env, fmt.Sprintf("$env:INSTALL_%s_EXEC=\"agent\"", capr.GetRuntimeEnv(controlPlane.Spec.KubernetesVersion))) + instruction.Env = append(instruction.Env, capr.FormatWindowsEnvVar(corev1.EnvVar{ + Name: fmt.Sprintf("INSTALL_%s_EXEC", capr.GetRuntimeEnv(controlPlane.Spec.KubernetesVersion)), + Value: "agent", + }, true)) default: instruction.Env = append(instruction.Env, fmt.Sprintf("INSTALL_%s_EXEC=agent", capr.GetRuntimeEnv(controlPlane.Spec.KubernetesVersion))) } @@ -74,10 +86,14 @@ func (p *Planner) generateInstallInstruction(controlPlane *rkev1.RKEControlPlane // passed in configuration to determine whether it needs to start/restart the service being managed. func (p *Planner) addInstallInstructionWithRestartStamp(nodePlan plan.NodePlan, controlPlane *rkev1.RKEControlPlane, entry *planEntry) (plan.NodePlan, error) { var restartStampEnv string - stamp := restartStamp(nodePlan, controlPlane, p.getInstallerImage(controlPlane)) - switch entry.Metadata.Labels[capr.CattleOSLabel] { + cattleOS := entry.Metadata.Labels[capr.CattleOSLabel] + stamp := restartStamp(nodePlan, controlPlane, p.getInstallerImage(controlPlane, cattleOS)) + switch cattleOS { case capr.WindowsMachineOS: - restartStampEnv = "$env:RESTART_STAMP=\"" + stamp + "\"" + restartStampEnv = capr.FormatWindowsEnvVar(corev1.EnvVar{ + Name: "WINS_RESTART_STAMP", + Value: stamp, + }, true) default: restartStampEnv = "RESTART_STAMP=" + stamp } @@ -93,7 +109,10 @@ func (p *Planner) generateInstallInstructionWithSkipStart(controlPlane *rkev1.RK var skipStartEnv string switch entry.Metadata.Labels[capr.CattleOSLabel] { case capr.WindowsMachineOS: - skipStartEnv = fmt.Sprintf("$env:INSTALL_%s_SKIP_START=\"true\"", strings.ToUpper(capr.GetRuntime(controlPlane.Spec.KubernetesVersion))) + skipStartEnv = capr.FormatWindowsEnvVar(corev1.EnvVar{ + Name: fmt.Sprintf("INSTALL_%s_SKIP_START", strings.ToUpper(capr.GetRuntime(controlPlane.Spec.KubernetesVersion))), + Value: "true", + }, true) default: skipStartEnv = fmt.Sprintf("INSTALL_%s_SKIP_START=true", strings.ToUpper(capr.GetRuntime(controlPlane.Spec.KubernetesVersion))) } diff --git a/pkg/capr/planner/planentry.go b/pkg/capr/planner/planentry.go index 6e5ac2dc2f6..070a0e9fbc3 100644 --- a/pkg/capr/planner/planentry.go +++ b/pkg/capr/planner/planentry.go @@ -136,6 +136,14 @@ func isOnlyWorker(entry *planEntry) bool { return !isEtcd(entry) && !isControlPlane(entry) && isWorker(entry) } +func isOnlyWindowsWorker(entry *planEntry) bool { + return isOnlyWorker(entry) && windows(entry) +} + +func isOnlyLinuxWorker(entry *planEntry) bool { + return isOnlyWorker(entry) && !windows(entry) +} + func windows(entry *planEntry) bool { if entry == nil || entry.Metadata == nil { return false diff --git a/pkg/capr/planner/planner.go b/pkg/capr/planner/planner.go index e8e09c5ec81..d847e3369e9 100644 --- a/pkg/capr/planner/planner.go +++ b/pkg/capr/planner/planner.go @@ -5,6 +5,7 @@ import ( "encoding/base64" "encoding/json" "fmt" + "github.com/rancher/rancher/pkg/controllers/capr/managesystemagent" "hash/crc32" "math" "path" @@ -24,6 +25,7 @@ import ( mgmtcontrollers "github.com/rancher/rancher/pkg/generated/controllers/management.cattle.io/v3" ranchercontrollers "github.com/rancher/rancher/pkg/generated/controllers/provisioning.cattle.io/v1" rkecontrollers "github.com/rancher/rancher/pkg/generated/controllers/rke.cattle.io/v1" + "github.com/rancher/rancher/pkg/settings" "github.com/rancher/rancher/pkg/wrangler" corecontrollers "github.com/rancher/wrangler/v3/pkg/generated/controllers/core/v1" "github.com/rancher/wrangler/v3/pkg/name" @@ -359,9 +361,7 @@ func (p *Planner) fullReconcile(cp *rkev1.RKEControlPlane, status rkev1.RKEContr } // select all etcd and then filter to just initNodes so that unavailable count is correct - err = p.reconcile(cp, clusterSecretTokens, plan, true, bootstrapTier, isEtcd, isNotInitNodeOrIsDeleting, - "1", "", - controlPlaneDrainOptions) + err = p.reconcile(cp, clusterSecretTokens, plan, true, bootstrapTier, isEtcd, isNotInitNodeOrIsDeleting, "1", "", controlPlaneDrainOptions, -1, 1) capr.Bootstrapped.True(&status) firstIgnoreError, err = ignoreErrors(firstIgnoreError, err) if err != nil { @@ -380,18 +380,14 @@ func (p *Planner) fullReconcile(cp *rkev1.RKEControlPlane, status rkev1.RKEContr } // Process all nodes that have the etcd role and are NOT an init node or deleting. Only process 1 node at a time. - err = p.reconcile(cp, clusterSecretTokens, plan, true, etcdTier, isEtcd, isInitNodeOrDeleting, - "1", joinServer, - controlPlaneDrainOptions) + err = p.reconcile(cp, clusterSecretTokens, plan, true, etcdTier, isEtcd, isInitNodeOrDeleting, "1", joinServer, controlPlaneDrainOptions, -1, 1) firstIgnoreError, err = ignoreErrors(firstIgnoreError, err) if err != nil { return status, err } // Process all nodes that have the controlplane role and are NOT an init node or deleting. - err = p.reconcile(cp, clusterSecretTokens, plan, true, controlPlaneTier, isControlPlane, isInitNodeOrDeleting, - controlPlaneConcurrency, joinServer, - controlPlaneDrainOptions) + err = p.reconcile(cp, clusterSecretTokens, plan, true, controlPlaneTier, isControlPlane, isInitNodeOrDeleting, controlPlaneConcurrency, joinServer, controlPlaneDrainOptions, -1, 1) firstIgnoreError, err = ignoreErrors(firstIgnoreError, err) if err != nil { return status, err @@ -408,10 +404,26 @@ func (p *Planner) fullReconcile(cp *rkev1.RKEControlPlane, status rkev1.RKEContr return status, errWaiting("marking control plane as initialized and ready") } - // Process all nodes that are ONLY worker nodes. - err = p.reconcile(cp, clusterSecretTokens, plan, false, workerTier, isOnlyWorker, isInitNodeOrDeleting, - workerConcurrency, "", - workerDrainOptions) + // Process all nodes that are ONLY linux worker nodes. + err = p.reconcile(cp, clusterSecretTokens, plan, false, workerTier, isOnlyLinuxWorker, isInitNodeOrDeleting, workerConcurrency, "", workerDrainOptions, -1, 1) + firstIgnoreError, err = ignoreErrors(firstIgnoreError, err) + if err != nil { + return status, err + } + + // Process all nodes that are ONLY windows worker nodes. + // This conditional can be removed once the minimum version of rke2 + // supported by Rancher is v1.31.0, and '5' can then always be passed + // to 'reconcile' when processing Windows node plans. + windowsMaxFailures := 1 + windowsMaxFailureThreshold := -1 + if managesystemagent.CurrentVersionResolvesGH5551(cp.Spec.KubernetesVersion) { + windowsMaxFailures = 5 + windowsMaxFailureThreshold = 5 + } + + err = p.reconcile(cp, clusterSecretTokens, plan, false, workerTier, isOnlyWindowsWorker, isInitNodeOrDeleting, + workerConcurrency, "", workerDrainOptions, windowsMaxFailures, windowsMaxFailureThreshold) firstIgnoreError, err = ignoreErrors(firstIgnoreError, err) if err != nil { return status, err @@ -828,8 +840,8 @@ type reconcilable struct { minorChange bool } -func (p *Planner) reconcile(controlPlane *rkev1.RKEControlPlane, tokensSecret plan.Secret, clusterPlan *plan.Plan, required bool, - tierName string, include, exclude roleFilter, maxUnavailable string, forcedJoinURL string, drainOptions rkev1.DrainOptions) error { +func (p *Planner) reconcile(controlPlane *rkev1.RKEControlPlane, tokensSecret plan.Secret, clusterPlan *plan.Plan, required bool, tierName string, + include, exclude roleFilter, maxUnavailable, forcedJoinURL string, drainOptions rkev1.DrainOptions, maxFailures, failureThreshold int) error { var ( ready, outOfSync, nonReady, errMachines, draining, uncordoned []string messages = map[string][]string{} @@ -895,14 +907,14 @@ func (p *Planner) reconcile(controlPlane *rkev1.RKEControlPlane, tokensSecret pl logrus.Debugf("[planner] rkecluster %s/%s reconcile tier %s - setting initial plan for machine %s/%s", controlPlane.Namespace, controlPlane.Name, tierName, r.entry.Machine.Namespace, r.entry.Machine.Name) logrus.Tracef("[planner] rkecluster %s/%s reconcile tier %s - initial plan for machine %s/%s new: %+v", controlPlane.Namespace, controlPlane.Name, tierName, r.entry.Machine.Namespace, r.entry.Machine.Name, r.desiredPlan) outOfSync = append(outOfSync, r.entry.Machine.Name) - if err := p.store.UpdatePlan(r.entry, r.desiredPlan, r.joinedURL, -1, 1); err != nil { + if err := p.store.UpdatePlan(r.entry, r.desiredPlan, r.joinedURL, maxFailures, failureThreshold); err != nil { return err } } else if r.minorChange { logrus.Debugf("[planner] rkecluster %s/%s reconcile tier %s - minor plan change detected for machine %s/%s, updating plan immediately", controlPlane.Namespace, controlPlane.Name, tierName, r.entry.Machine.Namespace, r.entry.Machine.Name) logrus.Tracef("[planner] rkecluster %s/%s reconcile tier %s - minor plan change for machine %s/%s old: %+v, new: %+v", controlPlane.Namespace, controlPlane.Name, tierName, r.entry.Machine.Namespace, r.entry.Machine.Name, r.entry.Plan.Plan, r.desiredPlan) outOfSync = append(outOfSync, r.entry.Machine.Name) - if err := p.store.UpdatePlan(r.entry, r.desiredPlan, r.joinedURL, -1, 1); err != nil { + if err := p.store.UpdatePlan(r.entry, r.desiredPlan, r.joinedURL, maxFailures, failureThreshold); err != nil { return err } } else if r.change { @@ -926,7 +938,7 @@ func (p *Planner) reconcile(controlPlane *rkev1.RKEControlPlane, tokensSecret pl // Drain is done (or didn't need to be done) and there are no errors, so the plan should be updated to enact the reason the node was drained. logrus.Debugf("[planner] rkecluster %s/%s reconcile tier %s - major plan change for machine %s/%s", controlPlane.Namespace, controlPlane.Name, tierName, r.entry.Machine.Namespace, r.entry.Machine.Name) logrus.Tracef("[planner] rkecluster %s/%s reconcile tier %s - major plan change for machine %s/%s old: %+v, new: %+v", controlPlane.Namespace, controlPlane.Name, tierName, r.entry.Machine.Namespace, r.entry.Machine.Name, r.entry.Plan.Plan, r.desiredPlan) - if err = p.store.UpdatePlan(r.entry, r.desiredPlan, r.joinedURL, -1, 1); err != nil { + if err = p.store.UpdatePlan(r.entry, r.desiredPlan, r.joinedURL, maxFailures, failureThreshold); err != nil { return err } else if r.entry.Metadata.Annotations[capr.DrainDoneAnnotation] != "" { messages[r.entry.Machine.Name] = append(messages[r.entry.Machine.Name], "drain completed") @@ -1114,8 +1126,12 @@ func (p *Planner) desiredPlan(controlPlane *rkev1.RKEControlPlane, tokensSecret } // getInstallerImage returns the correct system-agent-installer image for a given controlplane -func (p *Planner) getInstallerImage(controlPlane *rkev1.RKEControlPlane) string { +func (p *Planner) getInstallerImage(controlPlane *rkev1.RKEControlPlane, os string) string { runtime := capr.GetRuntime(controlPlane.Spec.KubernetesVersion) + if os == capr.WindowsMachineOS && settings.WindowsSystemAgentInstallerOverride.Get() != "" { + installerImage := settings.WindowsSystemAgentInstallerOverride.Get() + runtime + ":" + strings.ReplaceAll(controlPlane.Spec.KubernetesVersion, "+", "-") + return p.retrievalFunctions.ImageResolver(installerImage, controlPlane) + } installerImage := p.retrievalFunctions.SystemAgentImage() + runtime + ":" + strings.ReplaceAll(controlPlane.Spec.KubernetesVersion, "+", "-") return p.retrievalFunctions.ImageResolver(installerImage, controlPlane) } diff --git a/pkg/capr/planner/planner_test.go b/pkg/capr/planner/planner_test.go index 8c8f81f21b2..5ba07366567 100644 --- a/pkg/capr/planner/planner_test.go +++ b/pkg/capr/planner/planner_test.go @@ -519,7 +519,7 @@ func Test_getInstallerImage(t *testing.T) { planner.retrievalFunctions.ImageResolver = image.ResolveWithControlPlane planner.retrievalFunctions.SystemAgentImage = func() string { return "rancher/system-agent-installer-" } - assert.Equal(t, tt.expected, planner.getInstallerImage(tt.controlPlane)) + assert.Equal(t, tt.expected, planner.getInstallerImage(tt.controlPlane, "linux")) }) } } diff --git a/pkg/controllers/capr/managesystemagent/managesystemagent.go b/pkg/controllers/capr/managesystemagent/managesystemagent.go index 0e7d6600384..e6529a2d47d 100644 --- a/pkg/controllers/capr/managesystemagent/managesystemagent.go +++ b/pkg/controllers/capr/managesystemagent/managesystemagent.go @@ -295,7 +295,7 @@ func installer(cluster *rancherv1.Cluster, secretName string) []runtime.Object { } plans = append(plans, plan) - if currentVersionResolvesGH5551(cluster.Spec.KubernetesVersion) { + if CurrentVersionResolvesGH5551(cluster.Spec.KubernetesVersion) { windowsPlan := winsUpgradePlan(cluster, env, secretName) if cluster.Spec.RedeploySystemAgentGeneration != 0 { windowsPlan.Spec.Secrets = append(windowsPlan.Spec.Secrets, upgradev1.SecretSpec{ @@ -427,11 +427,11 @@ func toStringPointer(x string) *string { return &x } -// currentVersionResolvesGH5551 determines if the given rke2 version +// CurrentVersionResolvesGH5551 determines if the given rke2 version // has fixed the RKE2 bug outlined in GH-5551. Windows SUC plans cannot be delivered // to clusters running versions containing this bug. This function can be removed // when v1.31.x is the lowest supported version offered by Rancher. -func currentVersionResolvesGH5551(version string) bool { +func CurrentVersionResolvesGH5551(version string) bool { // remove leading v and trailing distro identifier v := strings.TrimPrefix(version, "v") diff --git a/pkg/controllers/capr/managesystemagent/managesystemagent_test.go b/pkg/controllers/capr/managesystemagent/managesystemagent_test.go index ed59efd4c3a..fad899e5c11 100644 --- a/pkg/controllers/capr/managesystemagent/managesystemagent_test.go +++ b/pkg/controllers/capr/managesystemagent/managesystemagent_test.go @@ -76,7 +76,7 @@ func Test_CurrentVersionResolvesGH5551(t *testing.T) { for _, tc := range tests { tc := tc t.Run(tc.name, func(t *testing.T) { - shouldCreatePlan := currentVersionResolvesGH5551(tc.currentVersion) + shouldCreatePlan := CurrentVersionResolvesGH5551(tc.currentVersion) if shouldCreatePlan != tc.expectedResult { t.Logf("expected %t when providing rke2 version %s but got %t", tc.expectedResult, tc.currentVersion, shouldCreatePlan) t.Fail() diff --git a/pkg/controllers/capr/plansecret/plansecret.go b/pkg/controllers/capr/plansecret/plansecret.go index 4e86fca9344..be2938d266e 100644 --- a/pkg/controllers/capr/plansecret/plansecret.go +++ b/pkg/controllers/capr/plansecret/plansecret.go @@ -73,6 +73,9 @@ func (h *handler) OnChange(key string, secret *corev1.Secret) (*corev1.Secret, e failedChecksum := string(secret.Data["failed-checksum"]) plan := secret.Data["plan"] + failureCount := string(secret.Data["failure-count"]) + maxFailures := string(secret.Data["max-failures"]) + secretChanged := false secret = secret.DeepCopy() @@ -103,7 +106,11 @@ func (h *handler) OnChange(key string, secret *corev1.Secret) (*corev1.Secret, e if failedChecksum == planner.PlanHash(plan) { logrus.Debugf("[plansecret] %s/%s: rv: %s: Detected failed plan application, reconciling machine PlanApplied condition to error", secret.Namespace, secret.Name, secret.ResourceVersion) - err = h.reconcileMachinePlanAppliedCondition(secret, fmt.Errorf("error applying plan -- check rancher-system-agent.service logs on node for more information")) + if failureCount != maxFailures && maxFailures != "-1" { + err = h.reconcileMachinePlanAppliedCondition(secret, fmt.Errorf("error applying plan, will reattempt (Attempt %s out of %s)", failureCount, maxFailures)) + } else { + err = h.reconcileMachinePlanAppliedCondition(secret, fmt.Errorf("error applying plan -- check rancher-system-agent.service logs on node for more information")) + } return secret, err } diff --git a/pkg/settings/setting.go b/pkg/settings/setting.go index 942c5fb1536..e9dc4824956 100644 --- a/pkg/settings/setting.go +++ b/pkg/settings/setting.go @@ -323,6 +323,12 @@ var ( SkipHostedClusterChartInstallation = NewSetting("skip-hosted-cluster-chart-installation", os.Getenv("CATTLE_SKIP_HOSTED_CLUSTER_CHART_INSTALLATION")) MachineProvisionImagePullPolicy = NewSetting("machine-provision-image-pull-policy", string(v1.PullAlways)) + // WindowsSystemAgentInstallerOverride controls the system-agent-installer-* image to be used when provisioning windows nodes + // the format of this variable must match the SystemAgentInstallerImage setting (e.g. REPO/system-agent-installer-). + WindowsSystemAgentInstallerOverride = NewSetting("wins-system-agent-installer-override", "") + + RancherWinsBinaryURLOverride = NewSetting("wins-binary-url-override", "") + // The following settings are only used outside of Rancher (UI, telemetry) // but needed to be known so that Rancher doesn't remove them on startup. _ = NewSetting("eula-agreed", "") diff --git a/scripts/ci b/scripts/ci index 6d7b13dc1a3..97a7637b1d9 100755 --- a/scripts/ci +++ b/scripts/ci @@ -3,15 +3,15 @@ set -e cd $(dirname $0) -if ./only-ui-bumps.sh; then - ./build - ./package - ./chart/ci - exit 0 -fi +#if ./only-ui-bumps.sh; then +# ./build +# ./package +# ./chart/ci +# exit 0 +#fi -./validate +#./validate ./build ./package -./test -./chart/ci +#./test +#./chart/ci