Skip to content

Commit

Permalink
Merge pull request #159 from castai/fix-CO-2897-Agent-Handle-Crashing
Browse files Browse the repository at this point in the history
fix: log error and push for clean restart instead of crashing the agent
  • Loading branch information
Tiberiu Gal authored Mar 7, 2024
2 parents 5333562 + d01dde6 commit 31ce7f8
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 8 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ require (
github.com/stretchr/testify v1.8.4
go.uber.org/goleak v1.2.1
golang.org/x/net v0.18.0
golang.org/x/sync v0.4.0
k8s.io/api v0.28.3
k8s.io/apimachinery v0.28.3
k8s.io/client-go v0.28.3
Expand Down
18 changes: 10 additions & 8 deletions internal/services/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
karpenter "github.com/aws/karpenter/pkg/apis/v1beta1"
"github.com/samber/lo"
"github.com/sirupsen/logrus"
"golang.org/x/sync/errgroup"
appsv1 "k8s.io/api/apps/v1"
authorizationv1 "k8s.io/api/authorization/v1"
autoscalingv1 "k8s.io/api/autoscaling/v1"
Expand Down Expand Up @@ -157,7 +158,7 @@ func New(

func (c *Controller) Run(ctx context.Context) error {
defer c.queue.ShutDown()

g := new(errgroup.Group)
ctx, cancel := context.WithCancel(ctx)
defer cancel()

Expand Down Expand Up @@ -206,15 +207,15 @@ func (c *Controller) Run(ctx context.Context) error {
}, dur, ctx.Done())
}()

go func() {
g.Go(func() error {
if err := c.collectInitialSnapshot(ctx); err != nil {
const maxItems = 5
queueContent := c.debugQueueContent(maxItems)
log := c.log.WithField("queue_content", queueContent)
// Crash agent in case it's not able to collect full snapshot from informers cache.
// TODO (CO-1632): refactor crashing to "normal" exit or healthz metric; abruptly
// stopping the agent does not give it a chance to release leader lock.
log.Fatalf("error while collecting initial snapshot: %v", err)
log.Errorf("error while collecting initial snapshot: %v", err)
c.log.Infof("restarting agent after failure to collect initial snapshot")
c.triggerRestart()
return err
}

// Since both initial snapshot collection and event handlers writes to the same delta queue add
Expand All @@ -228,7 +229,8 @@ func (c *Controller) Run(ctx context.Context) error {
wait.Until(func() {
c.send(ctx)
}, c.cfg.Interval, ctx.Done())
}()
return nil
})

go c.startConditionalInformersWithWatcher(ctx, c.conditionalInformers)

Expand All @@ -239,7 +241,7 @@ func (c *Controller) Run(ctx context.Context) error {

c.pollQueueUntilShutdown()

return nil
return g.Wait()
}

func (c *Controller) startConditionalInformersWithWatcher(ctx context.Context, conditionalInformers []conditionalInformer) {
Expand Down

0 comments on commit 31ce7f8

Please sign in to comment.