feat: Improve rolling upgrade speed and add SLOW_MODE env var to retain old behavior
TwiN committed Oct 8, 2022
1 parent d5a77ee commit 2b913fd
Showing 3 changed files with 15 additions and 5 deletions.
5 changes: 4 additions & 1 deletion README.md
@@ -53,7 +53,10 @@ Therefore, this application will not run into any issues if it is restarted, res
| EXECUTION_TIMEOUT | Maximum execution duration before timing out in seconds | no | `900` |
| POD_TERMINATION_GRACE_PERIOD | How long to wait for a pod to terminate in seconds; 0 means "delete immediately"; set to a negative value to use the pod's terminationGracePeriodSeconds. | no | `-1` |
| METRICS_PORT | Port to bind metrics server to | no | `8080` |
| METRICS | Expose metrics in Promtheus format at `:${METRICS_PORT}/metrics` | no | `""` |
| METRICS | Expose metrics in Prometheus format at `:${METRICS_PORT}/metrics` | no | `""` |
| SLOW_MODE | If enabled, every time a node is terminated during an execution, the current execution will stop rather than continuing to the next ASG | no | `false` |

**NOTE:** Only one of `CLUSTER_NAME`, `AUTODISCOVERY_TAGS` or `AUTO_SCALING_GROUP_NAMES` must be set.


## Metrics
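The new `SLOW_MODE` flag is parsed like the existing `DEBUG` toggle: a case-insensitive comparison against `true`, defaulting to `false` when the variable is unset (see the `config/config.go` change below). A minimal standalone sketch of that parsing behavior:

```go
package main

import (
	"fmt"
	"os"
	"strings"
)

func main() {
	// Mirrors the parsing added in config/config.go: anything other than "true"
	// (case-insensitive), including an unset variable, leaves slow mode disabled.
	slowMode := strings.ToLower(os.Getenv("SLOW_MODE")) == "true"
	fmt.Println("slow mode enabled:", slowMode)
}
```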
7 changes: 5 additions & 2 deletions config/config.go
@@ -26,6 +26,7 @@ const (
EnvPodTerminationGracePeriod = "POD_TERMINATION_GRACE_PERIOD"
EnvMetrics = "METRICS"
EnvMetricsPort = "METRICS_PORT"
EnvSlowMode = "SLOW_MODE"
)

type config struct {
@@ -41,13 +41,15 @@ type config struct {
PodTerminationGracePeriod int // Defaults to -1
Metrics bool // Defaults to false
MetricsPort int // Defaults to 8080
SlowMode bool // Defaults to false
}

// Initialize is used to initialize the application's configuration
func Initialize() error {
cfg = &config{
Environment: strings.ToLower(os.Getenv(EnvEnvironment)),
Debug: strings.ToLower(os.Getenv(EnvDebug)) == "true",
SlowMode: strings.ToLower(os.Getenv(EnvSlowMode)) == "true",
}
if clusterName := os.Getenv(EnvClusterName); len(clusterName) > 0 {
cfg.AutodiscoveryTags = fmt.Sprintf("k8s.io/cluster-autoscaler/%s=owned,k8s.io/cluster-autoscaler/enabled=true", clusterName)
@@ -103,8 +106,8 @@ func Initialize() error {
log.Printf("Environment variable '%s' not specified, defaulting to 20 seconds", EnvExecutionInterval)
cfg.ExecutionInterval = time.Second * 20
}
if executionTImeout := os.Getenv(EnvExecutionTimeout); len(executionTImeout) > 0 {
if timeout, err := strconv.Atoi(executionTImeout); err != nil {
if executionTimeout := os.Getenv(EnvExecutionTimeout); len(executionTimeout) > 0 {
if timeout, err := strconv.Atoi(executionTimeout); err != nil {
return fmt.Errorf("environment variable '%s' must be an integer", EnvExecutionTimeout)
} else {
cfg.ExecutionTimeout = time.Second * time.Duration(timeout)
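A quick way to sanity-check the new field is a unit test around `Initialize()` and `Get()`. The sketch below is hypothetical (it is not part of this commit); it assumes the module path `github.com/TwiN/aws-eks-asg-rolling-update-handler` and that setting `CLUSTER_NAME` is enough to satisfy the requirement that one of the cluster-identification variables be provided:

```go
package config_test

import (
	"testing"

	"github.com/TwiN/aws-eks-asg-rolling-update-handler/config" // assumed module path
)

func TestInitializeSlowMode(t *testing.T) {
	t.Setenv("CLUSTER_NAME", "test-cluster") // assumed to satisfy the cluster-identification requirement
	t.Setenv("SLOW_MODE", "TRUE")            // parsing is case-insensitive
	if err := config.Initialize(); err != nil {
		t.Fatal(err)
	}
	if !config.Get().SlowMode {
		t.Error("expected SlowMode to be enabled when SLOW_MODE=TRUE")
	}
}
```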
8 changes: 6 additions & 2 deletions main.go
@@ -117,7 +117,6 @@ func DoHandleRollingUpgrade(client k8s.ClientAPI, ec2Service ec2iface.EC2API, au
log.Printf("[%s] Skipping because unable to separate outdated instances from updated instances: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), err.Error())
continue
}
fmt.Printf("[%s] Found %d outdated instances and %d updated instances\n", aws.StringValue(autoScalingGroup.AutoScalingGroupName), len(outdatedInstances), len(updatedInstances))
metrics.Server.UpdatedNodes.WithLabelValues(aws.StringValue(autoScalingGroup.AutoScalingGroupName)).Set(float64(len(updatedInstances)))
metrics.Server.OutdatedNodes.WithLabelValues(aws.StringValue(autoScalingGroup.AutoScalingGroupName)).Set(float64(len(outdatedInstances)))
if config.Get().Debug {
@@ -204,7 +203,12 @@ func DoHandleRollingUpgrade(client k8s.ClientAPI, ec2Service ec2iface.EC2API, au
// As a result, we return here to make sure that multiple old instances didn't use the same updated
// instances to calculate resources available
log.Printf("[%s][%s] Node has been drained and scheduled for termination successfully", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId))
return true
if config.Get().SlowMode {
// If SlowMode is enabled, we'll return after draining a node and wait for the next execution
return true
}
// Move on to the next ASG
break
} else {
// Don't increase the ASG if the node has already been drained or scheduled for termination
if minutesSinceDrained != -1 || minutesSinceTerminated != -1 {
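The behavioral change above boils down to `break` versus `return`: by default the handler now moves on to the next ASG after draining a node, while `SLOW_MODE=true` retains the old behavior of ending the execution after the first drained node and waiting for the next execution. A distilled, self-contained sketch of that control flow (ASG names are illustrative):

```go
package main

import "fmt"

// rollOneExecution drains at most one outdated node per ASG in fast mode,
// but stops after the first drained node when slow mode is enabled.
func rollOneExecution(asgs []string, slowMode bool) {
	for _, asg := range asgs {
		fmt.Printf("[%s] draining one outdated node\n", asg)
		if slowMode {
			// Remaining ASGs are handled on the next execution.
			return
		}
		// Fast mode: continue with the next ASG within the same execution.
	}
}

func main() {
	asgs := []string{"asg-a", "asg-b", "asg-c"}
	rollOneExecution(asgs, false) // drains up to one node in each of the three ASGs
	rollOneExecution(asgs, true)  // drains a node in asg-a only, then stops
}
```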
