From 03fbcafe68e186f6c82aee81bea23f5a2a7b15df Mon Sep 17 00:00:00 2001
From: Markus Rudy
Date: Tue, 5 Mar 2024 09:14:01 +0100
Subject: [PATCH] bootstrapper: bounded retry of k8s join (#2968)

---
 .../internal/joinclient/joinclient.go       | 11 ++++++-
 .../internal/joinclient/joinclient_test.go  | 33 ++++++++++++++++---
 2 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/bootstrapper/internal/joinclient/joinclient.go b/bootstrapper/internal/joinclient/joinclient.go
index 110b52a662..8f44fa1153 100644
--- a/bootstrapper/internal/joinclient/joinclient.go
+++ b/bootstrapper/internal/joinclient/joinclient.go
@@ -150,6 +150,7 @@ func (c *JoinClient) Start(cleaner cleaner) {
 			return
 		} else if isUnrecoverable(err) {
 			c.log.With(slog.Any("error", err)).Error("Unrecoverable error occurred")
+			// TODO(burgerdev): this should eventually lead to a full node reset
 			return
 		}
 		c.log.With(slog.Any("error", err)).Warn("Join failed for all available endpoints")
@@ -310,7 +311,15 @@ func (c *JoinClient) startNodeAndJoin(ticket *joinproto.IssueJoinTicketResponse,
 		CACertHashes: []string{ticket.DiscoveryTokenCaCertHash},
 	}
 
-	if err := c.joiner.JoinCluster(ctx, btd, c.role, ticket.KubernetesComponents, c.log); err != nil {
+	// We currently cannot recover from any failure in this function. Joining the k8s cluster
+	// sometimes fails transiently, and we don't want to brick the node because of that.
+	for i := 0; i < 3; i++ {
+		err = c.joiner.JoinCluster(ctx, btd, c.role, ticket.KubernetesComponents, c.log)
+		if err != nil {
+			c.log.Error("failed to join k8s cluster", "role", c.role, "attempt", i, "error", err)
+		}
+	}
+	if err != nil {
 		return fmt.Errorf("joining Kubernetes cluster: %w", err)
 	}
 
diff --git a/bootstrapper/internal/joinclient/joinclient_test.go b/bootstrapper/internal/joinclient/joinclient_test.go
index 4684b2eb49..d22ed4fb97 100644
--- a/bootstrapper/internal/joinclient/joinclient_test.go
+++ b/bootstrapper/internal/joinclient/joinclient_test.go
@@ -62,6 +62,7 @@ func TestClient(t *testing.T) {
 		apiAnswers    []any
 		wantLock      bool
 		wantJoin      bool
+		wantNumJoins  int
 	}{
 		"on worker: metadata self: errors occur": {
 			role: role.Worker,
@@ -168,12 +169,26 @@ func TestClient(t *testing.T) {
 				listAnswer{instances: peers},
 				issueJoinTicketAnswer{},
 			},
-			clusterJoiner: &stubClusterJoiner{joinClusterErr: someErr},
+			clusterJoiner: &stubClusterJoiner{numBadCalls: -1, joinClusterErr: someErr},
 			nodeLock:      newFakeLock(),
 			disk:          &stubDisk{},
 			wantJoin:      true,
 			wantLock:      true,
 		},
+		"on control plane: joinCluster fails transiently": {
+			role: role.ControlPlane,
+			apiAnswers: []any{
+				selfAnswer{instance: controlSelf},
+				listAnswer{instances: peers},
+				issueJoinTicketAnswer{},
+			},
+			clusterJoiner: &stubClusterJoiner{numBadCalls: 1, joinClusterErr: someErr},
+			nodeLock:      newFakeLock(),
+			disk:          &stubDisk{},
+			wantJoin:      true,
+			wantLock:      true,
+			wantNumJoins:  2,
+		},
 		"on control plane: node already locked": {
 			role: role.ControlPlane,
 			apiAnswers: []any{
@@ -250,9 +265,12 @@ func TestClient(t *testing.T) {
 			client.Stop()
 			if tc.wantJoin {
-				assert.True(tc.clusterJoiner.joinClusterCalled)
+				assert.Greater(tc.clusterJoiner.joinClusterCalled, 0)
 			} else {
-				assert.False(tc.clusterJoiner.joinClusterCalled)
+				assert.Equal(0, tc.clusterJoiner.joinClusterCalled)
+			}
+			if tc.wantNumJoins > 0 {
+				assert.GreaterOrEqual(tc.clusterJoiner.joinClusterCalled, tc.wantNumJoins)
 			}
 			if tc.wantLock {
 				assert.False(client.nodeLock.TryLockOnce(nil)) // lock should be locked
 			}
@@ -398,12 +416,17 @@ type issueJoinTicketAnswer struct {
 }
 
 type stubClusterJoiner struct {
-	joinClusterCalled bool
+	joinClusterCalled int
+	numBadCalls       int
 	joinClusterErr    error
 }
 
 func (j *stubClusterJoiner) JoinCluster(context.Context, *kubeadm.BootstrapTokenDiscovery, role.Role, components.Components, *slog.Logger) error {
-	j.joinClusterCalled = true
+	j.joinClusterCalled++
+	if j.numBadCalls == 0 {
+		return nil
+	}
+	j.numBadCalls--
 	return j.joinClusterErr
 }
 