Skip to content

Commit

Permalink
CO-2897 trigger controller restart instead of crashing the agent
Browse files Browse the repository at this point in the history
  • Loading branch information
Tiberiu Gal committed Mar 1, 2024
1 parent 6b00b06 commit d01dde6
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 6 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ require (
github.com/stretchr/testify v1.8.4
go.uber.org/goleak v1.2.1
golang.org/x/net v0.18.0
golang.org/x/sync v0.4.0
k8s.io/api v0.28.3
k8s.io/apimachinery v0.28.3
k8s.io/client-go v0.28.3
Expand Down
13 changes: 7 additions & 6 deletions internal/services/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
karpenter "github.com/aws/karpenter/pkg/apis/v1beta1"
"github.com/samber/lo"
"github.com/sirupsen/logrus"
"golang.org/x/sync/errgroup"
appsv1 "k8s.io/api/apps/v1"
authorizationv1 "k8s.io/api/authorization/v1"
autoscalingv1 "k8s.io/api/autoscaling/v1"
Expand Down Expand Up @@ -157,7 +158,7 @@ func New(

func (c *Controller) Run(ctx context.Context) error {
defer c.queue.ShutDown()
var controllerRuntimeError error
g := new(errgroup.Group)
ctx, cancel := context.WithCancel(ctx)
defer cancel()

Expand Down Expand Up @@ -206,16 +207,15 @@ func (c *Controller) Run(ctx context.Context) error {
}, dur, ctx.Done())
}()

go func() {
g.Go(func() error {
if err := c.collectInitialSnapshot(ctx); err != nil {
const maxItems = 5
queueContent := c.debugQueueContent(maxItems)
log := c.log.WithField("queue_content", queueContent)
log.Errorf("error while collecting initial snapshot: %v", err)
c.log.Infof("restarting agent after failure to collect initial snapshot")
controllerRuntimeError = err
c.triggerRestart()
return
return err
}

// Since both initial snapshot collection and event handlers writes to the same delta queue add
Expand All @@ -229,7 +229,8 @@ func (c *Controller) Run(ctx context.Context) error {
wait.Until(func() {
c.send(ctx)
}, c.cfg.Interval, ctx.Done())
}()
return nil
})

go c.startConditionalInformersWithWatcher(ctx, c.conditionalInformers)

Expand All @@ -240,7 +241,7 @@ func (c *Controller) Run(ctx context.Context) error {

c.pollQueueUntilShutdown()

return controllerRuntimeError
return g.Wait()
}

func (c *Controller) startConditionalInformersWithWatcher(ctx context.Context, conditionalInformers []conditionalInformer) {
Expand Down

0 comments on commit d01dde6

Please sign in to comment.