Skip to content

Commit

Permalink
Add 'node get' retry (#86)
Browse files Browse the repository at this point in the history
add node get retry

Co-authored-by: Mangirdas <[email protected]>
  • Loading branch information
mjudeikis and mjudeikis authored Sep 28, 2023
1 parent a81c1f5 commit d57bac9
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 3 deletions.
21 changes: 21 additions & 0 deletions actions/kubernetes_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"time"

"github.com/cenkalti/backoff/v4"
"github.com/sirupsen/logrus"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
apitypes "k8s.io/apimachinery/pkg/types"
Expand Down Expand Up @@ -45,6 +46,26 @@ func patchNode(ctx context.Context, clientset kubernetes.Interface, node *v1.Nod
return nil
}

// getNodeForPatching fetches the node named nodeName from the API server,
// retrying the Get call when it fails.
//
// The Get is retried at most 3 times with exponential backoff, and every
// failure that is going to be retried is reported as a warning through log.
// The backoff is bound to ctx, so a cancelled or expired context ends the
// retry loop instead of waiting out the remaining intervals.
//
// It returns the node on success. On failure it returns a nil node and the
// error reported by the retry loop.
func getNodeForPatching(ctx context.Context, log logrus.FieldLogger, clientset kubernetes.Interface, nodeName string) (*v1.Node, error) {
	var node *v1.Node

	// getNode performs a single lookup and stores the result in node.
	getNode := func() error {
		var err error
		node, err = clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
		return err
	}

	logRetry := func(err error, _ time.Duration) {
		log.Warnf("getting node, will retry: %v", err)
	}

	// Wrap the policy with ctx (as defaultBackoff does) so that retries stop
	// as soon as the caller's context is done.
	b := backoff.WithContext(backoff.WithMaxRetries(backoff.NewExponentialBackOff(), 3), ctx)
	if err := backoff.RetryNotify(getNode, b, logRetry); err != nil {
		return nil, err
	}
	return node, nil
}

// defaultBackoff returns a retry policy that waits a constant 500ms between
// attempts, allows at most 5 retries, and is bound to ctx.
func defaultBackoff(ctx context.Context) backoff.BackOffContext {
	constant := backoff.NewConstantBackOff(500 * time.Millisecond) // nolint:gomnd
	limited := backoff.WithMaxRetries(constant, 5)                 // nolint:gomnd
	return backoff.WithContext(limited, ctx)
}
9 changes: 6 additions & 3 deletions actions/patch_node_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import (
"github.com/sirupsen/logrus"
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"

"github.com/castai/cluster-controller/castai"
Expand Down Expand Up @@ -56,10 +55,14 @@ func (h *patchNodeHandler) Handle(ctx context.Context, action *castai.ClusterAct
"id": action.ID,
})

node, err := h.clientset.CoreV1().Nodes().Get(ctx, req.NodeName, metav1.GetOptions{})
// On GKE we noticed that a node is sometimes reported as not found even though it is in the cluster:
// the node was returned by a watch, but a subsequent get request returns not found.
// In theory this should not happen, as get should be consistent with the API server state.
// But we have seen it happen, so we retry the get request.
node, err := getNodeForPatching(ctx, h.log, h.clientset, req.NodeName)
if err != nil {
if apierrors.IsNotFound(err) {
log.Info("node not found, skipping patch")
log.WithError(err).Infof("node not found, skipping patch")
return nil
}
return err
Expand Down

0 comments on commit d57bac9

Please sign in to comment.