From 03fbcafe68e186f6c82aee81bea23f5a2a7b15df Mon Sep 17 00:00:00 2001
From: Markus Rudy
Date: Tue, 5 Mar 2024 09:14:01 +0100
Subject: [PATCH] bootstrapper: bounded retry of k8s join (#2968)

---
 .../internal/joinclient/joinclient.go       | 11 ++++++-
 .../internal/joinclient/joinclient_test.go  | 33 ++++++++++++++++---
 2 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/bootstrapper/internal/joinclient/joinclient.go b/bootstrapper/internal/joinclient/joinclient.go
index 110b52a662..8f44fa1153 100644
--- a/bootstrapper/internal/joinclient/joinclient.go
+++ b/bootstrapper/internal/joinclient/joinclient.go
@@ -150,6 +150,7 @@ func (c *JoinClient) Start(cleaner cleaner) {
 			return
 		} else if isUnrecoverable(err) {
 			c.log.With(slog.Any("error", err)).Error("Unrecoverable error occurred")
+			// TODO(burgerdev): this should eventually lead to a full node reset
 			return
 		}
 		c.log.With(slog.Any("error", err)).Warn("Join failed for all available endpoints")
@@ -310,7 +311,15 @@ func (c *JoinClient) startNodeAndJoin(ticket *joinproto.IssueJoinTicketResponse,
 		CACertHashes: []string{ticket.DiscoveryTokenCaCertHash},
 	}
 
-	if err := c.joiner.JoinCluster(ctx, btd, c.role, ticket.KubernetesComponents, c.log); err != nil {
+	// We currently cannot recover from any failure in this function. Joining the k8s cluster
+	// sometimes fails transiently, and we don't want to brick the node because of that.
+	for i := 0; i < 3; i++ {
+		err = c.joiner.JoinCluster(ctx, btd, c.role, ticket.KubernetesComponents, c.log)
+		if err != nil {
+			c.log.Error("failed to join k8s cluster", "role", c.role, "attempt", i, "error", err)
+		}
+	}
+	if err != nil {
 		return fmt.Errorf("joining Kubernetes cluster: %w", err)
 	}
 
diff --git a/bootstrapper/internal/joinclient/joinclient_test.go b/bootstrapper/internal/joinclient/joinclient_test.go
index 4684b2eb49..d22ed4fb97 100644
--- a/bootstrapper/internal/joinclient/joinclient_test.go
+++ b/bootstrapper/internal/joinclient/joinclient_test.go
@@ -62,6 +62,7 @@ func TestClient(t *testing.T) {
 		apiAnswers    []any
 		wantLock      bool
 		wantJoin      bool
+		wantNumJoins  int
 	}{
 		"on worker: metadata self: errors occur": {
 			role: role.Worker,
@@ -168,12 +169,26 @@ func TestClient(t *testing.T) {
 				listAnswer{instances: peers},
 				issueJoinTicketAnswer{},
 			},
-			clusterJoiner: &stubClusterJoiner{joinClusterErr: someErr},
+			clusterJoiner: &stubClusterJoiner{numBadCalls: -1, joinClusterErr: someErr},
 			nodeLock:      newFakeLock(),
 			disk:          &stubDisk{},
 			wantJoin:      true,
 			wantLock:      true,
 		},
+		"on control plane: joinCluster fails transiently": {
+			role: role.ControlPlane,
+			apiAnswers: []any{
+				selfAnswer{instance: controlSelf},
+				listAnswer{instances: peers},
+				issueJoinTicketAnswer{},
+			},
+			clusterJoiner: &stubClusterJoiner{numBadCalls: 1, joinClusterErr: someErr},
+			nodeLock:      newFakeLock(),
+			disk:          &stubDisk{},
+			wantJoin:      true,
+			wantLock:      true,
+			wantNumJoins:  2,
+		},
 		"on control plane: node already locked": {
 			role: role.ControlPlane,
 			apiAnswers: []any{
@@ -250,9 +265,12 @@ func TestClient(t *testing.T) {
 			client.Stop()
 			if tc.wantJoin {
-				assert.True(tc.clusterJoiner.joinClusterCalled)
+				assert.Greater(tc.clusterJoiner.joinClusterCalled, 0)
 			} else {
-				assert.False(tc.clusterJoiner.joinClusterCalled)
+				assert.Equal(0, tc.clusterJoiner.joinClusterCalled)
+			}
+			if tc.wantNumJoins > 0 {
+				assert.GreaterOrEqual(tc.clusterJoiner.joinClusterCalled, tc.wantNumJoins)
 			}
 			if tc.wantLock {
 				assert.False(client.nodeLock.TryLockOnce(nil)) // lock should be locked
 			}
@@ -398,12 +416,17 @@ type issueJoinTicketAnswer struct {
 }
 
 type stubClusterJoiner struct {
-	joinClusterCalled bool
+	joinClusterCalled int
+	numBadCalls       int
 	joinClusterErr    error
 }
 
 func (j *stubClusterJoiner) JoinCluster(context.Context, *kubeadm.BootstrapTokenDiscovery, role.Role, components.Components, *slog.Logger) error {
-	j.joinClusterCalled = true
+	j.joinClusterCalled++
+	if j.numBadCalls == 0 {
+		return nil
+	}
+	j.numBadCalls--
 	return j.joinClusterErr
 }
 