From 7f259ef43ef989ed2a9ae074ee235b6f6c39581a Mon Sep 17 00:00:00 2001 From: Harrison Affel Date: Thu, 10 Oct 2024 11:58:05 -0400 Subject: [PATCH] Properly format windows env-vars, redeploy windows plans if they fail up to 5 times --- pkg/apis/rke.cattle.io/v1/plan/plan.go | 13 ++-- pkg/capr/common.go | 51 +++++++++++++ pkg/capr/common_test.go | 74 +++++++++++++++++++ pkg/capr/installer/installer.go | 38 ++++++++-- pkg/capr/planner/etcdcreate.go | 6 +- pkg/capr/planner/etcdrestore.go | 6 +- pkg/capr/planner/instructions.go | 56 +++++++++----- pkg/capr/planner/instructions_test.go | 22 +++--- pkg/capr/planner/planentry.go | 8 ++ pkg/capr/planner/planner.go | 59 ++++++++++----- pkg/capr/planner/planner_test.go | 14 ++-- .../managesystemagent/managesystemagent.go | 6 +- .../managesystemagent_test.go | 2 +- pkg/controllers/capr/plansecret/plansecret.go | 7 +- 14 files changed, 285 insertions(+), 77 deletions(-) diff --git a/pkg/apis/rke.cattle.io/v1/plan/plan.go b/pkg/apis/rke.cattle.io/v1/plan/plan.go index 068f3a2049d..04ca64d2275 100644 --- a/pkg/apis/rke.cattle.io/v1/plan/plan.go +++ b/pkg/apis/rke.cattle.io/v1/plan/plan.go @@ -47,12 +47,13 @@ type Secret struct { } type OneTimeInstruction struct { - Name string `json:"name,omitempty"` - Image string `json:"image,omitempty"` - Env []string `json:"env,omitempty"` - Args []string `json:"args,omitempty"` - Command string `json:"command,omitempty"` - SaveOutput bool `json:"saveOutput,omitempty"` + Name string `json:"name,omitempty"` + Image string `json:"image,omitempty"` + Env []string `json:"env,omitempty"` + Args []string `json:"args,omitempty"` + Command string `json:"command,omitempty"` + SaveOutput bool `json:"saveOutput,omitempty"` + ResetFailureCountOnServiceRestart bool `json:"resetFailuresOnServiceRestart,omitempty"` } type PeriodicInstruction struct { diff --git a/pkg/capr/common.go b/pkg/capr/common.go index 75b3a4cba86..780f1fbbd7f 100644 --- a/pkg/capr/common.go +++ b/pkg/capr/common.go @@ -674,3 +674,54 
@@ func PreBootstrap(mgmtCluster *v3.Cluster) bool { return !v3.ClusterConditionPreBootstrapped.IsTrue(mgmtCluster) } + +// FormatWindowsEnvVar accepts a corev1.EnvVar and returns a string to be used in either +// a Powershell script or the Rancher planner, indicated by the isPlanVariable parameter. +// This function automatically configures the '$env:' prefix for a given environment variable, +// automatically prefixes boolean values with '$', and surrounds string variables with double quotes as +// needed. If the provided variable name incorrectly uses either '$env:' or '$' for the given isPlanVariable +// value, it will be removed. +func FormatWindowsEnvVar(envVar corev1.EnvVar, isPlanVariable bool) string { + lowerValue := strings.ToLower(envVar.Value) + isBool := lowerValue == "$true" || lowerValue == "$false" || + lowerValue == "true" || lowerValue == "false" + + // remove any user provided prefixes and suffixes + if strings.HasPrefix(envVar.Name, "$env:") { + envVar.Name = strings.TrimPrefix(envVar.Name, "$env:") + } + + if strings.HasPrefix(envVar.Value, "\"") { + envVar.Value = strings.TrimPrefix(envVar.Value, "\"") + } + + if strings.HasSuffix(envVar.Value, "\"") { + envVar.Value = strings.TrimSuffix(envVar.Value, "\"") + } + + if !isBool { + format := "" + if isPlanVariable { + format = "%s=%s" + } else { + // Non-boolean variables are always treated as strings, + // even numbers + format = "$env:%s=\"%s\"" + } + return fmt.Sprintf(format, envVar.Name, envVar.Value) + } + + if !strings.HasPrefix(envVar.Value, "$") && !isPlanVariable { + envVar.Value = "$" + envVar.Value + } + + if strings.HasPrefix(envVar.Value, "$") && isPlanVariable { + envVar.Value = strings.TrimPrefix(envVar.Value, "$") + } + + if isPlanVariable { + return fmt.Sprintf("%s=%s", envVar.Name, envVar.Value) + } + + return fmt.Sprintf("$env:%s=%s", envVar.Name, envVar.Value) +} diff --git a/pkg/capr/common_test.go b/pkg/capr/common_test.go index 5f7e6696f37..3fbeba95f3e 100644 --- 
a/pkg/capr/common_test.go +++ b/pkg/capr/common_test.go @@ -1,6 +1,7 @@ package capr import ( + corev1 "k8s.io/api/core/v1" "reflect" "testing" @@ -313,3 +314,76 @@ func TestCompressInterface(t *testing.T) { }) } } + +func TestFormatWindowsEnvVar(t *testing.T) { + tests := []struct { + Name string + EnvVar corev1.EnvVar + IsPlanVar bool + ExpectedString string + }{ + { + Name: "Basic String", + EnvVar: corev1.EnvVar{ + Name: "BASIC_STRING", + Value: "ABC123", + }, + IsPlanVar: false, + ExpectedString: "$env:BASIC_STRING=\"ABC123\"", + }, + { + Name: "Basic Bool", + EnvVar: corev1.EnvVar{ + Name: "BASIC_BOOL", + Value: "true", + }, + IsPlanVar: false, + ExpectedString: "$env:BASIC_BOOL=$true", + }, + { + Name: "Basic Plan String", + EnvVar: corev1.EnvVar{ + Name: "PLAN_STRING", + Value: "VALUE", + }, + IsPlanVar: true, + ExpectedString: "PLAN_STRING=VALUE", + }, + { + Name: "Basic Plan Bool", + EnvVar: corev1.EnvVar{ + Name: "PLAN_BOOL", + Value: "true", + }, + IsPlanVar: true, + ExpectedString: "PLAN_BOOL=true", + }, + { + Name: "Plan name Mistakenly includes $env:", + EnvVar: corev1.EnvVar{ + Name: "$env:PLAN_BOOL", + Value: "true", + }, + IsPlanVar: true, + ExpectedString: "PLAN_BOOL=true", + }, + { + Name: "Plan value Mistakenly Includes $", + EnvVar: corev1.EnvVar{ + Name: "PLAN_BOOL", + Value: "$true", + }, + IsPlanVar: true, + ExpectedString: "PLAN_BOOL=true", + }, + } + + for _, tc := range tests { + t.Run(tc.Name, func(t *testing.T) { + out := FormatWindowsEnvVar(tc.EnvVar, tc.IsPlanVar) + if out != tc.ExpectedString { + t.Fatalf("Expected %s, got %s", tc.ExpectedString, out) + } + }) + } +} diff --git a/pkg/capr/installer/installer.go b/pkg/capr/installer/installer.go index 76e233c47de..15c651cf41a 100644 --- a/pkg/capr/installer/installer.go +++ b/pkg/capr/installer/installer.go @@ -8,6 +8,7 @@ import ( "os" "strings" + "github.com/rancher/rancher/pkg/capr" "github.com/rancher/rancher/pkg/settings" "github.com/rancher/rancher/pkg/systemtemplate" 
"github.com/rancher/rancher/pkg/tls" @@ -135,9 +136,15 @@ func WindowsInstallScript(ctx context.Context, token string, envVars []corev1.En binaryURL := "" if settings.WinsAgentVersion.Get() != "" { if settings.ServerURL.Get() != "" { - binaryURL = fmt.Sprintf("$env:CATTLE_AGENT_BINARY_BASE_URL=\"%s/assets\"", settings.ServerURL.Get()) + binaryURL = capr.FormatWindowsEnvVar(corev1.EnvVar{ + Name: "CATTLE_AGENT_BINARY_BASE_URL", + Value: fmt.Sprintf("%s/assets", settings.ServerURL.Get()), + }, false) } else if defaultHost != "" { - binaryURL = fmt.Sprintf("$env:CATTLE_AGENT_BINARY_BASE_URL=\"https://%s/assets\"", defaultHost) + binaryURL = capr.FormatWindowsEnvVar(corev1.EnvVar{ + Name: "CATTLE_AGENT_BINARY_BASE_URL", + Value: fmt.Sprintf("https://%s/assets", defaultHost), + }, false) } } @@ -156,22 +163,38 @@ func WindowsInstallScript(ctx context.Context, token string, envVars []corev1.En if v, ok := ctx.Value(tls.InternalAPI).(bool); ok && v { ca = systemtemplate.InternalCAChecksum() } + if ca != "" { - ca = "$env:CATTLE_CA_CHECKSUM=\"" + ca + "\"" + ca = capr.FormatWindowsEnvVar(corev1.EnvVar{ + Name: "CATTLE_CA_CHECKSUM", + Value: ca, + }, false) } + + var tokenEnvVar string if token != "" { - token = "$env:CATTLE_ROLE_NONE=\"true\"\n$env:CATTLE_TOKEN=\"" + token + "\"" + tokenEnvVar = capr.FormatWindowsEnvVar(corev1.EnvVar{ + Name: "CATTLE_TOKEN", + Value: token, + }, false) } + envVarBuf := &strings.Builder{} for _, envVar := range envVars { if envVar.Value == "" { continue } - envVarBuf.WriteString(fmt.Sprintf("$env:%s=\"%s\"\n", envVar.Name, envVar.Value)) + envVarBuf.WriteString(capr.FormatWindowsEnvVar(corev1.EnvVar{ + Name: envVar.Name, + Value: envVar.Value, + }, false) + "\n") } server := "" if settings.ServerURL.Get() != "" { - server = fmt.Sprintf("$env:CATTLE_SERVER=\"%s\"", settings.ServerURL.Get()) + server = capr.FormatWindowsEnvVar(corev1.EnvVar{ + Name: "CATTLE_SERVER", + Value: settings.ServerURL.Get(), + }, false) } strictVerify := "false" @@ -192,8
+215,9 @@ $env:CSI_PROXY_URL = "%s" $env:CSI_PROXY_VERSION = "%s" $env:CSI_PROXY_KUBELET_PATH = "C:%s/bin/kubelet.exe" $env:STRICT_VERIFY = "%s" +$env:CATTLE_ROLE_NONE="true" Invoke-WinsInstaller @PSBoundParameters exit 0 -`, data, envVarBuf.String(), binaryURL, server, ca, token, csiProxyURL, csiProxyVersion, dataDir, strictVerify)), nil +`, data, envVarBuf.String(), binaryURL, server, ca, tokenEnvVar, csiProxyURL, csiProxyVersion, dataDir, strictVerify)), nil } diff --git a/pkg/capr/planner/etcdcreate.go b/pkg/capr/planner/etcdcreate.go index ff97471f79c..bf3fed3fc00 100644 --- a/pkg/capr/planner/etcdcreate.go +++ b/pkg/capr/planner/etcdcreate.go @@ -65,9 +65,7 @@ func (p *Planner) runEtcdSnapshotCreate(controlPlane *rkev1.RKEControlPlane, tok // Notably, this function will blatantly ignore drain and concurrency options, as during an etcd snapshot operation, there is no necessity to drain nodes. func (p *Planner) runEtcdSnapshotManagementServiceStart(controlPlane *rkev1.RKEControlPlane, tokensSecret plan.Secret, clusterPlan *plan.Plan, include roleFilter, operation string) error { // Generate and deliver desired plan for the bootstrap/init node first. 
- if err := p.reconcile(controlPlane, tokensSecret, clusterPlan, true, bootstrapTier, isEtcd, isNotInitNodeOrIsDeleting, - "1", "", - controlPlane.Spec.UpgradeStrategy.ControlPlaneDrainOptions); err != nil { + if err := p.reconcile(controlPlane, tokensSecret, clusterPlan, true, bootstrapTier, isEtcd, isNotInitNodeOrIsDeleting, "1", "", controlPlane.Spec.UpgradeStrategy.ControlPlaneDrainOptions, -1, 1, false); err != nil { return err } @@ -84,7 +82,7 @@ func (p *Planner) runEtcdSnapshotManagementServiceStart(controlPlane *rkev1.RKEC if isInitNodeOrDeleting(entry) { continue } - plan, joinedServer, err := p.desiredPlan(controlPlane, tokensSecret, entry, joinServer) + plan, joinedServer, err := p.desiredPlan(controlPlane, tokensSecret, entry, joinServer, false) if err != nil { return err } diff --git a/pkg/capr/planner/etcdrestore.go b/pkg/capr/planner/etcdrestore.go index 2a0d98aa45c..798fbb19076 100644 --- a/pkg/capr/planner/etcdrestore.go +++ b/pkg/capr/planner/etcdrestore.go @@ -230,7 +230,7 @@ func (p *Planner) runEtcdSnapshotPostRestorePodCleanupPlan(controlPlane *rkev1.R } initNode := initNodes[0] - initNodePlan, _, err := p.desiredPlan(controlPlane, tokensSecret, initNode, "") + initNodePlan, _, err := p.desiredPlan(controlPlane, tokensSecret, initNode, "", false) if err != nil { return err } @@ -262,7 +262,7 @@ func (p *Planner) runEtcdSnapshotPostRestorePodCleanupPlan(controlPlane *rkev1.R controlPlaneEntry := controlPlaneEntries[0] - firstControlPlanePlan, joinedServer, err := p.desiredPlan(controlPlane, tokensSecret, controlPlaneEntry, joinServer) + firstControlPlanePlan, joinedServer, err := p.desiredPlan(controlPlane, tokensSecret, controlPlaneEntry, joinServer, false) if err != nil { return err } @@ -280,7 +280,7 @@ func (p *Planner) runEtcdSnapshotPostRestoreNodeCleanupPlan(controlPlane *rkev1. 
} initNode := initNodes[0] - initNodePlan, _, err := p.desiredPlan(controlPlane, tokensSecret, initNode, "") + initNodePlan, _, err := p.desiredPlan(controlPlane, tokensSecret, initNode, "", false) if err != nil { return err } diff --git a/pkg/capr/planner/instructions.go b/pkg/capr/planner/instructions.go index 1b2de9929bd..532cf374955 100644 --- a/pkg/capr/planner/instructions.go +++ b/pkg/capr/planner/instructions.go @@ -8,6 +8,7 @@ import ( rkev1 "github.com/rancher/rancher/pkg/apis/rke.cattle.io/v1" "github.com/rancher/rancher/pkg/apis/rke.cattle.io/v1/plan" "github.com/rancher/rancher/pkg/capr" + corev1 "k8s.io/api/core/v1" ) const ( @@ -16,7 +17,7 @@ const ( ) // generateInstallInstruction generates the instruction necessary to install the desired tool. -func (p *Planner) generateInstallInstruction(controlPlane *rkev1.RKEControlPlane, entry *planEntry, env []string) plan.OneTimeInstruction { +func (p *Planner) generateInstallInstruction(controlPlane *rkev1.RKEControlPlane, entry *planEntry, env []string, resetFailureCountOnSystemAgentRestart bool) plan.OneTimeInstruction { var instruction plan.OneTimeInstruction image := p.getInstallerImage(controlPlane) cattleOS := entry.Metadata.Labels[capr.CattleOSLabel] @@ -26,14 +27,22 @@ func (p *Planner) generateInstallInstruction(controlPlane *rkev1.RKEControlPlane } switch cattleOS { case capr.WindowsMachineOS: - env = append(env, fmt.Sprintf("$env:%s=\"%s\"", arg.Name, arg.Value)) + env = append(env, capr.FormatWindowsEnvVar(corev1.EnvVar{ + Name: arg.Name, + Value: arg.Value, + }, true)) default: env = append(env, fmt.Sprintf("%s=%s", arg.Name, arg.Value)) } } switch cattleOS { case capr.WindowsMachineOS: + // TODO: Properly format the data dir when adding full support for Windows nodes env = append(env, fmt.Sprintf("$env:%s_DATA_DIR=\"c:%s\"", strings.ToUpper(capr.GetRuntime(controlPlane.Spec.KubernetesVersion)), capr.GetDistroDataDir(controlPlane))) + env = append(env, capr.FormatWindowsEnvVar(corev1.EnvVar{ + 
Name: "INSTALL_RKE2_VERSION", + Value: strings.ReplaceAll(controlPlane.Spec.KubernetesVersion, "+", "-"), + }, true)) default: env = append(env, fmt.Sprintf("%s_DATA_DIR=%s", strings.ToUpper(capr.GetRuntime(controlPlane.Spec.KubernetesVersion)), capr.GetDistroDataDir(controlPlane))) } @@ -41,26 +50,31 @@ func (p *Planner) generateInstallInstruction(controlPlane *rkev1.RKEControlPlane switch cattleOS { case capr.WindowsMachineOS: instruction = plan.OneTimeInstruction{ - Name: "install", - Image: image, - Command: "powershell.exe", - Args: []string{"-File", "run.ps1"}, - Env: env, + Name: "install", + Image: image, + Command: "powershell.exe", + Args: []string{"-File", "run.ps1"}, + ResetFailureCountOnServiceRestart: resetFailureCountOnSystemAgentRestart, + Env: env, } default: instruction = plan.OneTimeInstruction{ - Name: "install", - Image: image, - Command: "sh", - Args: []string{"-c", "run.sh"}, - Env: env, + Name: "install", + Image: image, + Command: "sh", + Args: []string{"-c", "run.sh"}, + ResetFailureCountOnServiceRestart: resetFailureCountOnSystemAgentRestart, + Env: env, } } if isOnlyWorker(entry) { switch cattleOS { case capr.WindowsMachineOS: - instruction.Env = append(instruction.Env, fmt.Sprintf("$env:INSTALL_%s_EXEC=\"agent\"", capr.GetRuntimeEnv(controlPlane.Spec.KubernetesVersion))) + instruction.Env = append(instruction.Env, capr.FormatWindowsEnvVar(corev1.EnvVar{ + Name: fmt.Sprintf("INSTALL_%s_EXEC", capr.GetRuntimeEnv(controlPlane.Spec.KubernetesVersion)), + Value: "agent", + }, true)) default: instruction.Env = append(instruction.Env, fmt.Sprintf("INSTALL_%s_EXEC=agent", capr.GetRuntimeEnv(controlPlane.Spec.KubernetesVersion))) } @@ -72,17 +86,20 @@ func (p *Planner) generateInstallInstruction(controlPlane *rkev1.RKEControlPlane // addInstallInstructionWithRestartStamp will generate an instruction and append it to the node plan that executes the `run.sh` or `run.ps1` // from the installer image based on the control plane configuration. 
It will generate a restart stamp based on the // passed in configuration to determine whether it needs to start/restart the service being managed. -func (p *Planner) addInstallInstructionWithRestartStamp(nodePlan plan.NodePlan, controlPlane *rkev1.RKEControlPlane, entry *planEntry) (plan.NodePlan, error) { +func (p *Planner) addInstallInstructionWithRestartStamp(nodePlan plan.NodePlan, controlPlane *rkev1.RKEControlPlane, entry *planEntry, resetFailureCountOnSystemAgentRestart bool) (plan.NodePlan, error) { var restartStampEnv string stamp := restartStamp(nodePlan, controlPlane, p.getInstallerImage(controlPlane)) switch entry.Metadata.Labels[capr.CattleOSLabel] { case capr.WindowsMachineOS: - restartStampEnv = "$env:RESTART_STAMP=\"" + stamp + "\"" + restartStampEnv = capr.FormatWindowsEnvVar(corev1.EnvVar{ + Name: "WINS_RESTART_STAMP", + Value: stamp, + }, true) default: restartStampEnv = "RESTART_STAMP=" + stamp } instEnv := []string{restartStampEnv} - nodePlan.Instructions = append(nodePlan.Instructions, p.generateInstallInstruction(controlPlane, entry, instEnv)) + nodePlan.Instructions = append(nodePlan.Instructions, p.generateInstallInstruction(controlPlane, entry, instEnv, resetFailureCountOnSystemAgentRestart)) return nodePlan, nil } @@ -93,12 +110,15 @@ func (p *Planner) generateInstallInstructionWithSkipStart(controlPlane *rkev1.RK var skipStartEnv string switch entry.Metadata.Labels[capr.CattleOSLabel] { case capr.WindowsMachineOS: - skipStartEnv = fmt.Sprintf("$env:INSTALL_%s_SKIP_START=\"true\"", strings.ToUpper(capr.GetRuntime(controlPlane.Spec.KubernetesVersion))) + skipStartEnv = capr.FormatWindowsEnvVar(corev1.EnvVar{ + Name: fmt.Sprintf("INSTALL_%s_SKIP_START", strings.ToUpper(capr.GetRuntime(controlPlane.Spec.KubernetesVersion))), + Value: "true", + }, true) default: skipStartEnv = fmt.Sprintf("INSTALL_%s_SKIP_START=true", strings.ToUpper(capr.GetRuntime(controlPlane.Spec.KubernetesVersion))) } instEnv := []string{skipStartEnv} - return 
p.generateInstallInstruction(controlPlane, entry, instEnv) + return p.generateInstallInstruction(controlPlane, entry, instEnv, false) } func (p *Planner) addInitNodePeriodicInstruction(nodePlan plan.NodePlan, controlPlane *rkev1.RKEControlPlane) (plan.NodePlan, error) { diff --git a/pkg/capr/planner/instructions_test.go b/pkg/capr/planner/instructions_test.go index ee1582d6555..2299c4782de 100644 --- a/pkg/capr/planner/instructions_test.go +++ b/pkg/capr/planner/instructions_test.go @@ -50,7 +50,7 @@ func TestPlanner_generateInstallInstruction(t *testing.T) { command: "powershell.exe", scriptName: "run.ps1", envs: []string{}, - expectedEnvsLen: 2, + expectedEnvsLen: 3, image: "my/custom-image-", expectedImage: "my/custom-image-rke2:v1.21.5-rke2r2", }, @@ -77,8 +77,8 @@ func TestPlanner_generateInstallInstruction(t *testing.T) { os: "windows", command: "powershell.exe", scriptName: "run.ps1", - envs: []string{"$env:HTTP_PROXY", "$env:HTTPS_PROXY", "$env:INSTALL_RKE2_EXEC"}, - expectedEnvsLen: 4, + envs: []string{"HTTP_PROXY", "HTTPS_PROXY", "INSTALL_RKE2_EXEC"}, + expectedEnvsLen: 5, image: "my/custom-image-", expectedImage: "my/custom-image-rke2:v1.21.5-rke2r2", }, @@ -97,7 +97,7 @@ func TestPlanner_generateInstallInstruction(t *testing.T) { planner.retrievalFunctions.SystemAgentImage = func() string { return tt.args.image } planner.retrievalFunctions.ImageResolver = image.ResolveWithControlPlane // act - p := planner.generateInstallInstruction(controlPlane, entry, []string{}) + p := planner.generateInstallInstruction(controlPlane, entry, []string{}, false) // assert a.NotNil(p) @@ -106,7 +106,7 @@ func TestPlanner_generateInstallInstruction(t *testing.T) { a.Equal(p.Image, tt.args.expectedImage) a.Equal(tt.args.expectedEnvsLen, len(p.Env)) for _, e := range tt.args.envs { - a.True(findEnv(p.Env, e), "couldn't find %s in environment", e) + a.True(findEnvName(p.Env, e), "couldn't find %s in environment", e) } }) } @@ -149,7 +149,7 @@ func 
TestPlanner_addInstallInstructionWithRestartStamp(t *testing.T) { os: "windows", command: "powershell.exe", scriptName: "run.ps1", - envs: []string{"$env:RESTART_STAMP"}, + envs: []string{"WINS_RESTART_STAMP"}, image: "my/custom-image-", expectedImage: "my/custom-image-rke2:v1.21.5-rke2r2", }, @@ -166,7 +166,7 @@ func TestPlanner_addInstallInstructionWithRestartStamp(t *testing.T) { entry := createTestPlanEntry(tt.args.os) // act - p, err := planner.addInstallInstructionWithRestartStamp(plan.NodePlan{}, controlPlane, entry) + p, err := planner.addInstallInstructionWithRestartStamp(plan.NodePlan{}, controlPlane, entry, false) // assert a.Nil(err) @@ -181,7 +181,7 @@ func TestPlanner_addInstallInstructionWithRestartStamp(t *testing.T) { a.Contains(instruction.Args, tt.args.scriptName) a.GreaterOrEqual(len(instruction.Env), 1) for _, e := range tt.args.envs { - a.True(findEnv(instruction.Env, e), "couldn't find %s in environment", e) + a.True(findEnvName(instruction.Env, e), "couldn't find %s in environment", e) } }) } @@ -211,7 +211,7 @@ func TestPlanner_generateInstallInstructionWithSkipStart(t *testing.T) { os: "linux", command: "sh", scriptName: "run.sh", - envs: []string{"INSTALL_RKE2_SKIP_START=true"}, + envs: []string{"INSTALL_RKE2_SKIP_START"}, image: "my/custom-image-", expectedImage: "my/custom-image-rke2:v1.21.5-rke2r2", }, @@ -224,7 +224,7 @@ func TestPlanner_generateInstallInstructionWithSkipStart(t *testing.T) { os: "windows", command: "powershell.exe", scriptName: "run.ps1", - envs: []string{"$env:INSTALL_RKE2_SKIP_START=\"true\""}, + envs: []string{"INSTALL_RKE2_SKIP_START"}, image: "my/custom-image-", expectedImage: "my/custom-image-rke2:v1.21.5-rke2r2", }, @@ -253,7 +253,7 @@ func TestPlanner_generateInstallInstructionWithSkipStart(t *testing.T) { a.Contains(p.Args, tt.args.scriptName) a.GreaterOrEqual(len(p.Env), 1) for _, e := range tt.args.envs { - a.True(findEnv(p.Env, e), "couldn't find %s in environment", e) + a.True(findEnvName(p.Env, e), 
"couldn't find %s in environment", e) } }) } diff --git a/pkg/capr/planner/planentry.go b/pkg/capr/planner/planentry.go index 6e5ac2dc2f6..070a0e9fbc3 100644 --- a/pkg/capr/planner/planentry.go +++ b/pkg/capr/planner/planentry.go @@ -136,6 +136,14 @@ func isOnlyWorker(entry *planEntry) bool { return !isEtcd(entry) && !isControlPlane(entry) && isWorker(entry) } +func isOnlyWindowsWorker(entry *planEntry) bool { + return isOnlyWorker(entry) && windows(entry) +} + +func isOnlyLinuxWorker(entry *planEntry) bool { + return isOnlyWorker(entry) && !windows(entry) +} + func windows(entry *planEntry) bool { if entry == nil || entry.Metadata == nil { return false diff --git a/pkg/capr/planner/planner.go b/pkg/capr/planner/planner.go index efad35e2def..756a962047f 100644 --- a/pkg/capr/planner/planner.go +++ b/pkg/capr/planner/planner.go @@ -20,6 +20,7 @@ import ( rkev1 "github.com/rancher/rancher/pkg/apis/rke.cattle.io/v1" "github.com/rancher/rancher/pkg/apis/rke.cattle.io/v1/plan" "github.com/rancher/rancher/pkg/capr" + "github.com/rancher/rancher/pkg/controllers/capr/managesystemagent" capicontrollers "github.com/rancher/rancher/pkg/generated/controllers/cluster.x-k8s.io/v1beta1" mgmtcontrollers "github.com/rancher/rancher/pkg/generated/controllers/management.cattle.io/v3" ranchercontrollers "github.com/rancher/rancher/pkg/generated/controllers/provisioning.cattle.io/v1" @@ -361,8 +362,8 @@ func (p *Planner) fullReconcile(cp *rkev1.RKEControlPlane, status rkev1.RKEContr // select all etcd and then filter to just initNodes so that unavailable count is correct err = p.reconcile(cp, clusterSecretTokens, plan, true, bootstrapTier, isEtcd, isNotInitNodeOrIsDeleting, - "1", "", - controlPlaneDrainOptions) + "1", "", controlPlaneDrainOptions, -1, 1, + false) capr.Bootstrapped.True(&status) firstIgnoreError, err = ignoreErrors(firstIgnoreError, err) if err != nil { @@ -382,8 +383,8 @@ func (p *Planner) fullReconcile(cp *rkev1.RKEControlPlane, status rkev1.RKEContr // Process all 
nodes that have the etcd role and are NOT an init node or deleting. Only process 1 node at a time. err = p.reconcile(cp, clusterSecretTokens, plan, true, etcdTier, isEtcd, isInitNodeOrDeleting, - "1", joinServer, - controlPlaneDrainOptions) + "1", joinServer, controlPlaneDrainOptions, + -1, 1, false) firstIgnoreError, err = ignoreErrors(firstIgnoreError, err) if err != nil { return status, err @@ -391,8 +392,8 @@ func (p *Planner) fullReconcile(cp *rkev1.RKEControlPlane, status rkev1.RKEContr // Process all nodes that have the controlplane role and are NOT an init node or deleting. err = p.reconcile(cp, clusterSecretTokens, plan, true, controlPlaneTier, isControlPlane, isInitNodeOrDeleting, - controlPlaneConcurrency, joinServer, - controlPlaneDrainOptions) + controlPlaneConcurrency, joinServer, controlPlaneDrainOptions, -1, 1, + false) firstIgnoreError, err = ignoreErrors(firstIgnoreError, err) if err != nil { return status, err @@ -409,10 +410,31 @@ func (p *Planner) fullReconcile(cp *rkev1.RKEControlPlane, status rkev1.RKEContr return status, errWaiting("marking control plane as initialized and ready") } - // Process all nodes that are ONLY worker nodes. - err = p.reconcile(cp, clusterSecretTokens, plan, false, workerTier, isOnlyWorker, isInitNodeOrDeleting, - workerConcurrency, "", - workerDrainOptions) + // Process all nodes that are ONLY linux worker nodes. + err = p.reconcile(cp, clusterSecretTokens, plan, false, workerTier, isOnlyLinuxWorker, isInitNodeOrDeleting, + workerConcurrency, "", workerDrainOptions, -1, 1, + false) + firstIgnoreError, err = ignoreErrors(firstIgnoreError, err) + if err != nil { + return status, err + } + + // Process all nodes that are ONLY windows worker nodes. + // This conditional can be removed once the minimum version of rke2 + // supported by Rancher is v1.31.0, and '5' can then always be passed + // to 'reconcile' when processing Windows node plans. 
+ resetFailureCountOnRestart := false + windowsMaxFailures := 1 + windowsMaxFailureThreshold := -1 + if managesystemagent.CurrentVersionResolvesGH5551(cp.Spec.KubernetesVersion) { + windowsMaxFailures = 5 + windowsMaxFailureThreshold = 5 + resetFailureCountOnRestart = true + } + + err = p.reconcile(cp, clusterSecretTokens, plan, false, workerTier, isOnlyWindowsWorker, isInitNodeOrDeleting, + workerConcurrency, "", workerDrainOptions, windowsMaxFailures, + windowsMaxFailureThreshold, resetFailureCountOnRestart) firstIgnoreError, err = ignoreErrors(firstIgnoreError, err) if err != nil { return status, err @@ -829,8 +851,9 @@ type reconcilable struct { minorChange bool } -func (p *Planner) reconcile(controlPlane *rkev1.RKEControlPlane, tokensSecret plan.Secret, clusterPlan *plan.Plan, required bool, - tierName string, include, exclude roleFilter, maxUnavailable string, forcedJoinURL string, drainOptions rkev1.DrainOptions) error { +func (p *Planner) reconcile(controlPlane *rkev1.RKEControlPlane, tokensSecret plan.Secret, clusterPlan *plan.Plan, required bool, tierName string, + include, exclude roleFilter, maxUnavailable, forcedJoinURL string, drainOptions rkev1.DrainOptions, + maxFailures, failureThreshold int, resetFailureCountOnSystemAgentRestart bool) error { var ( ready, outOfSync, nonReady, errMachines, draining, uncordoned []string messages = map[string][]string{} @@ -851,7 +874,7 @@ func (p *Planner) reconcile(controlPlane *rkev1.RKEControlPlane, tokensSecret pl } logrus.Debugf("[planner] rkecluster %s/%s reconcile tier %s - rendering desired plan for machine %s/%s with join URL: (%s)", controlPlane.Namespace, controlPlane.Name, tierName, entry.Machine.Namespace, entry.Machine.Name, joinURL) - plan, joinedURL, err := p.desiredPlan(controlPlane, tokensSecret, entry, joinURL) + plan, joinedURL, err := p.desiredPlan(controlPlane, tokensSecret, entry, joinURL, resetFailureCountOnSystemAgentRestart) if err != nil { return err } @@ -901,14 +924,14 @@ func (p 
*Planner) reconcile(controlPlane *rkev1.RKEControlPlane, tokensSecret pl logrus.Debugf("[planner] rkecluster %s/%s reconcile tier %s - setting initial plan for machine %s/%s", controlPlane.Namespace, controlPlane.Name, tierName, r.entry.Machine.Namespace, r.entry.Machine.Name) logrus.Tracef("[planner] rkecluster %s/%s reconcile tier %s - initial plan for machine %s/%s new: %+v", controlPlane.Namespace, controlPlane.Name, tierName, r.entry.Machine.Namespace, r.entry.Machine.Name, r.desiredPlan) outOfSync = append(outOfSync, r.entry.Machine.Name) - if err := p.store.UpdatePlan(r.entry, r.desiredPlan, r.joinedURL, -1, 1); err != nil { + if err := p.store.UpdatePlan(r.entry, r.desiredPlan, r.joinedURL, maxFailures, failureThreshold); err != nil { return err } } else if r.minorChange { logrus.Debugf("[planner] rkecluster %s/%s reconcile tier %s - minor plan change detected for machine %s/%s, updating plan immediately", controlPlane.Namespace, controlPlane.Name, tierName, r.entry.Machine.Namespace, r.entry.Machine.Name) logrus.Tracef("[planner] rkecluster %s/%s reconcile tier %s - minor plan change for machine %s/%s old: %+v, new: %+v", controlPlane.Namespace, controlPlane.Name, tierName, r.entry.Machine.Namespace, r.entry.Machine.Name, r.entry.Plan.Plan, r.desiredPlan) outOfSync = append(outOfSync, r.entry.Machine.Name) - if err := p.store.UpdatePlan(r.entry, r.desiredPlan, r.joinedURL, -1, 1); err != nil { + if err := p.store.UpdatePlan(r.entry, r.desiredPlan, r.joinedURL, maxFailures, failureThreshold); err != nil { return err } } else if r.change { @@ -932,7 +955,7 @@ func (p *Planner) reconcile(controlPlane *rkev1.RKEControlPlane, tokensSecret pl // Drain is done (or didn't need to be done) and there are no errors, so the plan should be updated to enact the reason the node was drained. 
logrus.Debugf("[planner] rkecluster %s/%s reconcile tier %s - major plan change for machine %s/%s", controlPlane.Namespace, controlPlane.Name, tierName, r.entry.Machine.Namespace, r.entry.Machine.Name) logrus.Tracef("[planner] rkecluster %s/%s reconcile tier %s - major plan change for machine %s/%s old: %+v, new: %+v", controlPlane.Namespace, controlPlane.Name, tierName, r.entry.Machine.Namespace, r.entry.Machine.Name, r.entry.Plan.Plan, r.desiredPlan) - if err = p.store.UpdatePlan(r.entry, r.desiredPlan, r.joinedURL, -1, 1); err != nil { + if err = p.store.UpdatePlan(r.entry, r.desiredPlan, r.joinedURL, maxFailures, failureThreshold); err != nil { return err } else if r.entry.Metadata.Annotations[capr.DrainDoneAnnotation] != "" { messages[r.entry.Machine.Name] = append(messages[r.entry.Machine.Name], "drain completed") @@ -1083,7 +1106,7 @@ func (p *Planner) generatePlanWithConfigFiles(controlPlane *rkev1.RKEControlPlan return plan.NodePlan{}, map[string]interface{}{}, "", nil } -func (p *Planner) desiredPlan(controlPlane *rkev1.RKEControlPlane, tokensSecret plan.Secret, entry *planEntry, joinServer string) (plan.NodePlan, string, error) { +func (p *Planner) desiredPlan(controlPlane *rkev1.RKEControlPlane, tokensSecret plan.Secret, entry *planEntry, joinServer string, resetFailureCountOnSystemAgentRestart bool) (plan.NodePlan, string, error) { nodePlan, config, joinedTo, err := p.generatePlanWithConfigFiles(controlPlane, tokensSecret, entry, joinServer, true) if err != nil { return nodePlan, joinedTo, err @@ -1096,7 +1119,7 @@ func (p *Planner) desiredPlan(controlPlane *rkev1.RKEControlPlane, tokensSecret nodePlan.Probes = probes // Add instruction last because it hashes config content - nodePlan, err = p.addInstallInstructionWithRestartStamp(nodePlan, controlPlane, entry) + nodePlan, err = p.addInstallInstructionWithRestartStamp(nodePlan, controlPlane, entry, resetFailureCountOnSystemAgentRestart) if err != nil { return nodePlan, joinedTo, err } diff --git 
a/pkg/capr/planner/planner_test.go b/pkg/capr/planner/planner_test.go index c48cb073a0d..71c2e3de517 100644 --- a/pkg/capr/planner/planner_test.go +++ b/pkg/capr/planner/planner_test.go @@ -124,7 +124,7 @@ func TestPlanner_addInstruction(t *testing.T) { os: "windows", command: "powershell.exe", scriptName: "run.ps1", - envs: []string{"$env:RESTART_STAMP", "$env:INSTALL_RKE2_EXEC"}, + envs: []string{"WINS_RESTART_STAMP", "INSTALL_RKE2_EXEC"}, }, }, { @@ -150,7 +150,7 @@ func TestPlanner_addInstruction(t *testing.T) { planner.retrievalFunctions.ImageResolver = image.ResolveWithControlPlane planner.retrievalFunctions.GetBootstrapManifests = func(cp *rkev1.RKEControlPlane) ([]plan.File, error) { return nil, nil } // act - p, err := planner.addInstallInstructionWithRestartStamp(plan.NodePlan{}, controlPlane, entry) + p, err := planner.addInstallInstructionWithRestartStamp(plan.NodePlan{}, controlPlane, entry, false) // assert a.Nil(err) @@ -162,7 +162,7 @@ func TestPlanner_addInstruction(t *testing.T) { a.Contains(instruction.Image, tt.args.expectedVersion) a.Contains(instruction.Args, tt.args.scriptName) for _, e := range tt.args.envs { - a.True(findEnv(instruction.Env, e), "couldn't find %s in environment", e) + a.True(findEnvName(instruction.Env, e), "couldn't find %s in environment", e) } }) } @@ -213,9 +213,13 @@ func createTestPlanEntryWithoutRoles(os string) *planEntry { return entry } -func findEnv(s []string, v string) bool { +func findEnvName(s []string, v string) bool { for _, item := range s { - if strings.Contains(item, v) { + split := strings.Split(item, "=") + if len(split) != 2 { + return false + } + if split[0] == v { return true } } diff --git a/pkg/controllers/capr/managesystemagent/managesystemagent.go b/pkg/controllers/capr/managesystemagent/managesystemagent.go index 0e7d6600384..e6529a2d47d 100644 --- a/pkg/controllers/capr/managesystemagent/managesystemagent.go +++ b/pkg/controllers/capr/managesystemagent/managesystemagent.go @@ -295,7 +295,7 @@ 
func installer(cluster *rancherv1.Cluster, secretName string) []runtime.Object { } plans = append(plans, plan) - if currentVersionResolvesGH5551(cluster.Spec.KubernetesVersion) { + if CurrentVersionResolvesGH5551(cluster.Spec.KubernetesVersion) { windowsPlan := winsUpgradePlan(cluster, env, secretName) if cluster.Spec.RedeploySystemAgentGeneration != 0 { windowsPlan.Spec.Secrets = append(windowsPlan.Spec.Secrets, upgradev1.SecretSpec{ @@ -427,11 +427,11 @@ func toStringPointer(x string) *string { return &x } -// currentVersionResolvesGH5551 determines if the given rke2 version +// CurrentVersionResolvesGH5551 determines if the given rke2 version // has fixed the RKE2 bug outlined in GH-5551. Windows SUC plans cannot be delivered // to clusters running versions containing this bug. This function can be removed // when v1.31.x is the lowest supported version offered by Rancher. -func currentVersionResolvesGH5551(version string) bool { +func CurrentVersionResolvesGH5551(version string) bool { // remove leading v and trailing distro identifier v := strings.TrimPrefix(version, "v") diff --git a/pkg/controllers/capr/managesystemagent/managesystemagent_test.go b/pkg/controllers/capr/managesystemagent/managesystemagent_test.go index ed59efd4c3a..fad899e5c11 100644 --- a/pkg/controllers/capr/managesystemagent/managesystemagent_test.go +++ b/pkg/controllers/capr/managesystemagent/managesystemagent_test.go @@ -76,7 +76,7 @@ func Test_CurrentVersionResolvesGH5551(t *testing.T) { for _, tc := range tests { tc := tc t.Run(tc.name, func(t *testing.T) { - shouldCreatePlan := currentVersionResolvesGH5551(tc.currentVersion) + shouldCreatePlan := CurrentVersionResolvesGH5551(tc.currentVersion) if shouldCreatePlan != tc.expectedResult { t.Logf("expected %t when providing rke2 version %s but got %t", tc.expectedResult, tc.currentVersion, shouldCreatePlan) t.Fail() diff --git a/pkg/controllers/capr/plansecret/plansecret.go b/pkg/controllers/capr/plansecret/plansecret.go index 
4e86fca9344..ec61a920781 100644 --- a/pkg/controllers/capr/plansecret/plansecret.go +++ b/pkg/controllers/capr/plansecret/plansecret.go @@ -73,6 +73,9 @@ func (h *handler) OnChange(key string, secret *corev1.Secret) (*corev1.Secret, e failedChecksum := string(secret.Data["failed-checksum"]) plan := secret.Data["plan"] + failureCount := string(secret.Data["failure-count"]) + maxFailures := string(secret.Data["max-failures"]) + secretChanged := false secret = secret.DeepCopy() @@ -103,7 +106,9 @@ func (h *handler) OnChange(key string, secret *corev1.Secret) (*corev1.Secret, e if failedChecksum == planner.PlanHash(plan) { logrus.Debugf("[plansecret] %s/%s: rv: %s: Detected failed plan application, reconciling machine PlanApplied condition to error", secret.Namespace, secret.Name, secret.ResourceVersion) - err = h.reconcileMachinePlanAppliedCondition(secret, fmt.Errorf("error applying plan -- check rancher-system-agent.service logs on node for more information")) + if maxFailures == "-1" || failureCount == maxFailures { + err = h.reconcileMachinePlanAppliedCondition(secret, fmt.Errorf("error applying plan -- check rancher-system-agent.service logs on node for more information")) + } return secret, err }