diff --git a/actions/kubernetes_helpers.go b/actions/kubernetes_helpers.go
index 635f026e..df9bed51 100644
--- a/actions/kubernetes_helpers.go
+++ b/actions/kubernetes_helpers.go
@@ -7,6 +7,7 @@ import (
 	"time"
 
 	"github.com/cenkalti/backoff/v4"
+	"github.com/sirupsen/logrus"
 	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	apitypes "k8s.io/apimachinery/pkg/types"
@@ -45,6 +46,29 @@ func patchNode(ctx context.Context, clientset kubernetes.Interface, node *v1.Nod
 	return nil
 }
 
+// getNodeForPatching fetches the node named nodeName, retrying a failed Get
+// up to 3 times with exponential backoff. Each failed attempt that will be
+// retried is logged as a warning on log. It returns a nil node together with
+// an error once the retries are exhausted or ctx is done.
+func getNodeForPatching(ctx context.Context, log logrus.FieldLogger, clientset kubernetes.Interface, nodeName string) (*v1.Node, error) {
+	logRetry := func(err error, _ time.Duration) {
+		log.Warnf("getting node, will retry: %v", err)
+	}
+	var node *v1.Node
+	// Bind the backoff to ctx (as defaultBackoff does) so that cancellation
+	// stops the retry loop instead of sleeping through the remaining attempts.
+	b := backoff.WithContext(backoff.WithMaxRetries(backoff.NewExponentialBackOff(), 3), ctx) // nolint:gomnd
+	err := backoff.RetryNotify(func() error {
+		var err error
+		node, err = clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
+		return err
+	}, b, logRetry)
+	if err != nil {
+		return nil, err
+	}
+	return node, nil
+}
+
 func defaultBackoff(ctx context.Context) backoff.BackOffContext {
 	return backoff.WithContext(backoff.WithMaxRetries(backoff.NewConstantBackOff(500*time.Millisecond), 5), ctx) // nolint:gomnd
 }
diff --git a/actions/patch_node_handler.go b/actions/patch_node_handler.go
index 50e57711..83f78430 100644
--- a/actions/patch_node_handler.go
+++ b/actions/patch_node_handler.go
@@ -10,7 +10,6 @@ import (
 	"github.com/sirupsen/logrus"
 	v1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/kubernetes"
 
 	"github.com/castai/cluster-controller/castai"
@@ -56,10 +55,14 @@ func (h *patchNodeHandler) Handle(ctx context.Context, action *castai.ClusterAct
 		"id": action.ID,
 	})
 
-	node, err := h.clientset.CoreV1().Nodes().Get(ctx, req.NodeName, metav1.GetOptions{})
+	// On GKE we noticed that a node is sometimes reported as not found even though it is in
+	// the cluster: it was returned by a watch, yet a subsequent get request returns not found.
+	// In theory this should not happen, as get should be consistent with the API server state,
+	// but we have seen it happen, so we retry the get request.
+	node, err := getNodeForPatching(ctx, h.log, h.clientset, req.NodeName)
 	if err != nil {
 		if apierrors.IsNotFound(err) {
-			log.Info("node not found, skipping patch")
+			log.WithError(err).Info("node not found, skipping patch")
 			return nil
 		}
 		return err