feat: Improve rolling upgrade speed and add SLOW_MODE env var to retain old behavior
TwiN committed Oct 8, 2022
1 parent d5a77ee commit 2b913fd
Showing 3 changed files with 15 additions and 5 deletions.
5 changes: 4 additions & 1 deletion README.md
@@ -53,7 +53,10 @@ Therefore, this application will not run into any issues if it is restarted, res
| EXECUTION_TIMEOUT | Maximum execution duration before timing out in seconds | no | `900` |
| POD_TERMINATION_GRACE_PERIOD | How long to wait for a pod to terminate in seconds; 0 means "delete immediately"; set to a negative value to use the pod's terminationGracePeriodSeconds. | no | `-1` |
| METRICS_PORT | Port to bind metrics server to | no | `8080` |
| METRICS | Expose metrics in Promtheus format at `:${METRICS_PORT}/metrics` | no | `""` |
| METRICS | Expose metrics in Prometheus format at `:${METRICS_PORT}/metrics` | no | `""` |
| SLOW_MODE | If enabled, every time a node is terminated during an execution, the current execution will stop rather than continuing to the next ASG | no | `false` |

**NOTE:** Only one of `CLUSTER_NAME`, `AUTODISCOVERY_TAGS` or `AUTO_SCALING_GROUP_NAMES` must be set.


## Metrics
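The new `SLOW_MODE` flag is parsed like the existing `DEBUG` toggle: a case-insensitive comparison against `true`, defaulting to `false` when the variable is unset (see the `config/config.go` change below). A minimal standalone sketch of that parsing behavior:

```go
package main

import (
	"fmt"
	"os"
	"strings"
)

func main() {
	// Mirrors the parsing added in config/config.go: anything other than "true"
	// (case-insensitive), including an unset variable, leaves slow mode disabled.
	slowMode := strings.ToLower(os.Getenv("SLOW_MODE")) == "true"
	fmt.Println("slow mode enabled:", slowMode)
}
```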
7 changes: 5 additions & 2 deletions config/config.go
@@ -26,6 +26,7 @@ const (
EnvPodTerminationGracePeriod = "POD_TERMINATION_GRACE_PERIOD"
EnvMetrics = "METRICS"
EnvMetricsPort = "METRICS_PORT"
EnvSlowMode = "SLOW_MODE"
)

type config struct {
@@ -41,13 +41,15 @@ type config struct {
PodTerminationGracePeriod int // Defaults to -1
Metrics bool // Defaults to false
MetricsPort int // Defaults to 8080
SlowMode bool // Defaults to false
}

// Initialize is used to initialize the application's configuration
func Initialize() error {
cfg = &config{
Environment: strings.ToLower(os.Getenv(EnvEnvironment)),
Debug: strings.ToLower(os.Getenv(EnvDebug)) == "true",
SlowMode: strings.ToLower(os.Getenv(EnvSlowMode)) == "true",
}
if clusterName := os.Getenv(EnvClusterName); len(clusterName) > 0 {
cfg.AutodiscoveryTags = fmt.Sprintf("k8s.io/cluster-autoscaler/%s=owned,k8s.io/cluster-autoscaler/enabled=true", clusterName)
@@ -103,8 +106,8 @@ func Initialize() error {
log.Printf("Environment variable '%s' not specified, defaulting to 20 seconds", EnvExecutionInterval)
cfg.ExecutionInterval = time.Second * 20
}
if executionTImeout := os.Getenv(EnvExecutionTimeout); len(executionTImeout) > 0 {
if timeout, err := strconv.Atoi(executionTImeout); err != nil {
if executionTimeout := os.Getenv(EnvExecutionTimeout); len(executionTimeout) > 0 {
if timeout, err := strconv.Atoi(executionTimeout); err != nil {
return fmt.Errorf("environment variable '%s' must be an integer", EnvExecutionTimeout)
} else {
cfg.ExecutionTimeout = time.Second * time.Duration(timeout)
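A quick way to sanity-check the new field is a unit test around `Initialize()` and `Get()`. The sketch below is hypothetical (it is not part of this commit); it assumes the module path `github.com/TwiN/aws-eks-asg-rolling-update-handler` and that setting `CLUSTER_NAME` is enough to satisfy the requirement that one of the cluster-identification variables be provided:

```go
package config_test

import (
	"testing"

	"github.com/TwiN/aws-eks-asg-rolling-update-handler/config" // assumed module path
)

func TestInitializeSlowMode(t *testing.T) {
	t.Setenv("CLUSTER_NAME", "test-cluster") // assumed to satisfy the cluster-identification requirement
	t.Setenv("SLOW_MODE", "TRUE")            // parsing is case-insensitive
	if err := config.Initialize(); err != nil {
		t.Fatal(err)
	}
	if !config.Get().SlowMode {
		t.Error("expected SlowMode to be enabled when SLOW_MODE=TRUE")
	}
}
```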
8 changes: 6 additions & 2 deletions main.go
@@ -117,7 +117,6 @@ func DoHandleRollingUpgrade(client k8s.ClientAPI, ec2Service ec2iface.EC2API, au
log.Printf("[%s] Skipping because unable to separate outdated instances from updated instances: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), err.Error())
continue
}
fmt.Printf("[%s] Found %d outdated instances and %d updated instances\n", aws.StringValue(autoScalingGroup.AutoScalingGroupName), len(outdatedInstances), len(updatedInstances))
metrics.Server.UpdatedNodes.WithLabelValues(aws.StringValue(autoScalingGroup.AutoScalingGroupName)).Set(float64(len(updatedInstances)))
metrics.Server.OutdatedNodes.WithLabelValues(aws.StringValue(autoScalingGroup.AutoScalingGroupName)).Set(float64(len(outdatedInstances)))
if config.Get().Debug {
@@ -204,7 +203,12 @@ func DoHandleRollingUpgrade(client k8s.ClientAPI, ec2Service ec2iface.EC2API, au
// As a result, we return here to make sure that multiple old instances didn't use the same updated
// instances to calculate resources available
log.Printf("[%s][%s] Node has been drained and scheduled for termination successfully", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId))
return true
if config.Get().SlowMode {
// If SlowMode is enabled, we'll return after draining a node and wait for the next execution
return true
}
// Move on to the next ASG
break
} else {
// Don't increase the ASG if the node has already been drained or scheduled for termination
if minutesSinceDrained != -1 || minutesSinceTerminated != -1 {
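The behavioral change above boils down to `break` versus `return`: by default the handler now moves on to the next ASG after draining a node, while `SLOW_MODE=true` retains the old behavior of ending the execution after the first drained node and waiting for the next execution. A distilled, self-contained sketch of that control flow (ASG names are illustrative):

```go
package main

import "fmt"

// rollOneExecution drains at most one outdated node per ASG in fast mode,
// but stops after the first drained node when slow mode is enabled.
func rollOneExecution(asgs []string, slowMode bool) {
	for _, asg := range asgs {
		fmt.Printf("[%s] draining one outdated node\n", asg)
		if slowMode {
			// Remaining ASGs are handled on the next execution.
			return
		}
		// Fast mode: continue with the next ASG within the same execution.
	}
}

func main() {
	asgs := []string{"asg-a", "asg-b", "asg-c"}
	rollOneExecution(asgs, false) // drains up to one node in each of the three ASGs
	rollOneExecution(asgs, true)  // drains a node in asg-a only, then stops
}
```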
