Skip to content

Commit

Permalink
Add exponential backoff when apps are failing that has a ceiling of s…
Browse files Browse the repository at this point in the history
…yncPeriod
  • Loading branch information
ewrenn8 committed Jan 23, 2021
1 parent 0daa6e6 commit d74a210
Show file tree
Hide file tree
Showing 8 changed files with 325 additions and 77 deletions.
3 changes: 3 additions & 0 deletions pkg/apis/kappctrl/v1alpha1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ type AppStatus struct {
Deploy *AppStatusDeploy `json:"deploy,omitempty"`
Inspect *AppStatusInspect `json:"inspect,omitempty"`

ConsecutiveReconcileSuccesses int `json:"consecutiveReconcileSuccesses,omitempty"`
ConsecutiveReconcileFailures int `json:"consecutiveReconcileFailures,omitempty"`

ObservedGeneration int64 `json:"observedGeneration"`
Conditions []AppCondition `json:"conditions"`

Expand Down
81 changes: 80 additions & 1 deletion pkg/apis/kappctrl/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

72 changes: 6 additions & 66 deletions pkg/app/app_reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ import (
"github.com/vmware-tanzu/carvel-kapp-controller/pkg/memdir"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

Expand All @@ -37,7 +36,7 @@ func (a *App) Reconcile() (reconcile.Result, error) {

err = a.reconcileDelete()

case a.shouldReconcile(time.Now()):
case NewReconcileTimer(a.app).IsReadyAt(time.Now()):
a.log.Info("Started deploy")
defer func() { a.log.Info("Completed deploy") }()

Expand All @@ -47,7 +46,7 @@ func (a *App) Reconcile() (reconcile.Result, error) {
a.log.Info("Reconcile noop")
}

return a.requeueIfNecessary(), err
return reconcile.Result{RequeueAfter: NewReconcileTimer(a.app).DurationUntilReady(err)}, err
}

func (a *App) reconcileDelete() error {
Expand Down Expand Up @@ -221,13 +220,17 @@ func (a *App) setReconcileCompleted(result exec.CmdRunResult) {
Status: corev1.ConditionTrue,
Message: result.ErrorStr(),
})
a.app.Status.ConsecutiveReconcileFailures++
a.app.Status.ConsecutiveReconcileSuccesses = 0
a.app.Status.FriendlyDescription = fmt.Sprintf("Reconcile failed: %s", result.ErrorStr())
} else {
a.app.Status.Conditions = append(a.app.Status.Conditions, v1alpha1.AppCondition{
Type: v1alpha1.ReconcileSucceeded,
Status: corev1.ConditionTrue,
Message: "",
})
a.app.Status.ConsecutiveReconcileSuccesses++
a.app.Status.ConsecutiveReconcileFailures = 0
a.app.Status.FriendlyDescription = "Reconcile succeeded"
}
}
Expand Down Expand Up @@ -258,69 +261,6 @@ func (a *App) setDeleteCompleted(result exec.CmdRunResult) {
}
}

// syncPeriod returns the interval between successful reconciles. The value
// from the App spec is honored only when it is longer than the 30 second
// default; otherwise the default is used.
func (a *App) syncPeriod() time.Duration {
	const floor = 30 * time.Second

	configured := a.app.Spec.SyncPeriod
	if configured == nil || configured.Duration <= floor {
		return floor
	}
	return configured.Duration
}

// requeueIfNecessary picks the delay before the next reconcile attempt.
//
// If the app would already be due for reconciliation shortDelay from now, it
// is requeued after shortDelay. Otherwise it is requeued after the sync
// period with its last five seconds replaced by a random value in [5s, 10s),
// i.e. somewhere in [syncPeriod, syncPeriod+5s). That keeps the delay at or
// above the sync period so we do not requeue without work to do.
func (a *App) requeueIfNecessary() reconcile.Result {
	const (
		shortDelay = 4 * time.Second
		jitterBase = 5 * time.Second
	)

	if a.shouldReconcile(time.Now().Add(shortDelay)) {
		return reconcile.Result{RequeueAfter: shortDelay}
	}

	// Subtract a full five seconds (a bare 5 would be five nanoseconds)
	// before adding the jittered five-to-ten seconds back.
	longerDelay := a.syncPeriod() - jitterBase + wait.Jitter(jitterBase, 1.0)
	return reconcile.Result{RequeueAfter: longerDelay}
}

// shouldReconcile reports whether the app is due for a reconcile at timeAt.
//
// A changed spec (generation mismatch) always triggers a reconcile, even when
// the app is canceled or paused. Otherwise a canceled or paused app is never
// reconciled, an app with no recorded deploy always is, and an app that has
// deployed before is reconciled once enough time has passed since its last
// deploy: more than 3 seconds after a failure, or more than the sync period
// in any case.
func (a *App) shouldReconcile(timeAt time.Time) bool {
	const retryAfterFailure = 3 * time.Second
	resyncAfterSuccess := a.syncPeriod()

	previous := a.app.Status.Deploy
	switch {
	case a.app.Status.ObservedGeneration != a.app.Generation:
		// Resource spec changed.
		return true
	case a.app.Spec.Canceled || a.app.Spec.Paused:
		// No reconciliation until unpaused.
		return false
	case previous == nil:
		// Never deployed.
		return true
	}

	elapsed := timeAt.UTC().Sub(previous.UpdatedAt.Time)

	// A failed previous deploy is retried on the short interval.
	for _, cond := range a.app.Status.Conditions {
		if cond.Type == v1alpha1.ReconcileFailed && elapsed > retryAfterFailure {
			return true
		}
	}

	// Otherwise fall back to the regular sync interval.
	return elapsed > resyncAfterSuccess
}

// removeAllConditions clears every condition recorded in the app's status by
// resetting the slice to nil.
func (a *App) removeAllConditions() {
a.app.Status.Conditions = nil
}
Expand Down
86 changes: 86 additions & 0 deletions pkg/app/reconcile_timer.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
package app

import (
"math"
"time"

"github.com/vmware-tanzu/carvel-kapp-controller/pkg/apis/kappctrl/v1alpha1"
"k8s.io/apimachinery/pkg/util/wait"
)

// ReconcileTimer answers scheduling questions for a single App: whether it is
// ready to be reconciled at a given time, and how long to wait before the
// next attempt. It holds its own App value rather than a pointer.
type ReconcileTimer struct {
app v1alpha1.App
}

// NewReconcileTimer builds a ReconcileTimer around a deep copy of app, so the
// timer does not share state with the value passed in.
func NewReconcileTimer(app v1alpha1.App) ReconcileTimer {
	snapshot := app.DeepCopy()
	return ReconcileTimer{app: *snapshot}
}

// DurationUntilReady returns how long to wait before the next reconcile.
//
// When err is non-nil or the app carries a ReconcileFailed condition, the
// failure backoff period is returned as-is. Otherwise the regular sync period
// is returned with jitter applied.
func (rt ReconcileTimer) DurationUntilReady(err error) time.Duration {
	failed := err != nil || rt.hasReconcileStatus(v1alpha1.ReconcileFailed)
	if !failed {
		return rt.applyJitter(rt.syncPeriod())
	}
	return rt.failureSyncPeriod()
}

// IsReadyAt reports whether the app is due for a reconcile at timeAt.
//
// A changed spec (generation mismatch) always makes the app ready, even when
// it is canceled or paused. Otherwise a canceled or paused app is never
// ready, and an app with no recorded fetch always is. For an app that has
// fetched before, readiness depends on the time elapsed since that fetch: at
// least the failure backoff period when a ReconcileFailed condition is
// present, or at least the sync period in any case.
func (rt ReconcileTimer) IsReadyAt(timeAt time.Time) bool {
	previous := rt.app.Status.Fetch
	switch {
	case rt.app.Status.ObservedGeneration != rt.app.Generation:
		// Resource spec changed.
		return true
	case rt.app.Spec.Canceled || rt.app.Spec.Paused:
		// No reconciliation until unpaused.
		return false
	case previous == nil:
		// Nothing has been fetched yet.
		return true
	}

	elapsed := timeAt.UTC().Sub(previous.UpdatedAt.Time)

	// After a failure, retry as soon as the backoff period has passed.
	if rt.hasReconcileStatus(v1alpha1.ReconcileFailed) && elapsed >= rt.failureSyncPeriod() {
		return true
	}

	// Otherwise wait out the regular sync period.
	return elapsed >= rt.syncPeriod()
}

// syncPeriod returns the interval between successful reconciles. The value
// from the App spec is honored only when it is longer than the 30 second
// default; otherwise the default is used.
func (rt ReconcileTimer) syncPeriod() time.Duration {
	const floor = 30 * time.Second

	configured := rt.app.Spec.SyncPeriod
	if configured == nil || configured.Duration <= floor {
		return floor
	}
	return configured.Duration
}

// failureSyncPeriod returns the delay to wait before retrying after failed
// reconciles: 2^ConsecutiveReconcileFailures seconds, capped at syncPeriod().
func (rt ReconcileTimer) failureSyncPeriod() time.Duration {
	// Clamp the exponent before converting to a Duration. time.Duration is an
	// int64 count of nanoseconds, so 2^n seconds no longer fits once n reaches
	// 34; the wrapped product can be negative, which would slip under the
	// ceiling check below and be returned as the backoff. 2^32 seconds is far
	// beyond any realistic sync period, so the clamp does not change results
	// for counts that previously computed correctly.
	const maxExponent = 32

	ceiling := rt.syncPeriod()
	exponent := math.Min(float64(rt.app.Status.ConsecutiveReconcileFailures), maxExponent)
	backoff := time.Duration(math.Exp2(exponent)) * time.Second
	if backoff < ceiling {
		return backoff
	}
	return ceiling
}

// hasReconcileStatus reports whether any condition in the app's status has
// the type c.
func (rt ReconcileTimer) hasReconcileStatus(c v1alpha1.AppConditionType) bool {
	conditions := rt.app.Status.Conditions
	for i := range conditions {
		if conditions[i].Type == c {
			return true
		}
	}
	return false
}

// applyJitter removes a five second window from t and adds back
// wait.Jitter(window, 1.0) in its place, so repeated calls with the same t
// can return different durations.
func (rt ReconcileTimer) applyJitter(t time.Duration) time.Duration {
	const window = 5 * time.Second

	jittered := wait.Jitter(window, 1.0)
	return t - window + jittered
}
Loading

0 comments on commit d74a210

Please sign in to comment.