From 0654df0ea7b832a2888c6904970e0f2215908d08 Mon Sep 17 00:00:00 2001 From: hc-github-team-nomad-core <82989552+hc-github-team-nomad-core@users.noreply.github.com> Date: Fri, 29 Nov 2024 09:54:27 +0100 Subject: [PATCH] agent: Fix a bug where retry_join was not retrying (#24566) Co-authored-by: James Rasell --- .changelog/24561.txt | 3 ++ command/agent/retry_join.go | 11 ++++-- command/agent/retry_join_test.go | 63 ++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 4 deletions(-) create mode 100644 .changelog/24561.txt diff --git a/.changelog/24561.txt b/.changelog/24561.txt new file mode 100644 index 00000000000..55507d5ebde --- /dev/null +++ b/.changelog/24561.txt @@ -0,0 +1,3 @@ +```release-note:bug +agent: Fixed a bug where `retry_join` gave up after a single failure, rather than retrying until max attempts had been reached +``` diff --git a/command/agent/retry_join.go b/command/agent/retry_join.go index 228dfc44dcd..af326b93fe4 100644 --- a/command/agent/retry_join.go +++ b/command/agent/retry_join.go @@ -158,7 +158,7 @@ func (r *retryJoiner) Validate(config *Config) error { return nil } -// retryJoin is used to handle retrying a join until it succeeds or all retries +// RetryJoin is used to handle retrying a join until it succeeds or all retries // are exhausted. func (r *retryJoiner) RetryJoin(serverJoin *ServerJoin) { if len(serverJoin.RetryJoin) == 0 { @@ -176,13 +176,16 @@ func (r *retryJoiner) RetryJoin(serverJoin *ServerJoin) { var err error for _, addr := range serverJoin.RetryJoin { + + // If auto-discovery returns an error, log the error and + // fall-through, so we reach the retry logic and loop back around + // for another go. servers, err := r.autoDiscover.Addrs(addr, r.logger) if err != nil { r.logger.Error("discovering join addresses failed", "join_config", addr, "error", err) - return + } else { + addrs = append(addrs, servers...) } - - addrs = append(addrs, servers...) } if len(addrs) > 0 { diff --git a/command/agent/retry_join_test.go b/command/agent/retry_join_test.go index bbe12eed284..20eac30a65e 100644 --- a/command/agent/retry_join_test.go +++ b/command/agent/retry_join_test.go @@ -14,6 +14,7 @@ import ( "github.com/hashicorp/go-netaddrs" "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/testutil" "github.com/mitchellh/cli" @@ -261,6 +262,68 @@ func TestRetryJoin_Client(t *testing.T) { require.Equal(stubAddress, output[0]) } +// MockFailDiscover implements the DiscoverInterface interface and can be used +// for tests that want to purposely fail the discovery process. +type MockFailDiscover struct { + ReceivedConfig string +} + +func (m *MockFailDiscover) Addrs(cfg string, _ *golog.Logger) ([]string, error) { + return nil, fmt.Errorf("test: failed discovery %q", cfg) +} +func (m *MockFailDiscover) Help() string { return "" } +func (m *MockFailDiscover) Names() []string { + return []string{""} +} + +func TestRetryJoin_RetryMaxAttempts(t *testing.T) { + ci.Parallel(t) + + // Create an error channel to pass to the retry joiner. When the retry + // attempts have been exhausted, this channel is closed and our only way + // to test this apart from inspecting log entries. + errCh := make(chan struct{}) + + // Create a timeout to protect against problems within the test blocking + // for arbitrary long times. + timeout, timeoutStop := helper.NewSafeTimer(2 * time.Second) + defer timeoutStop() + + var output []string + + joiner := retryJoiner{ + autoDiscover: autoDiscover{goDiscover: &MockFailDiscover{}}, + clientJoin: func(s []string) (int, error) { + output = s + return 0, nil + }, + clientEnabled: true, + logger: testlog.HCLogger(t), + errCh: errCh, + } + + // Execute the retry join function in a routine, so we can track whether + // this returns and exits without close the error channel and thus + // indicating retry failure. + doneCh := make(chan struct{}) + + go func(doneCh chan struct{}) { + joiner.RetryJoin(&ServerJoin{RetryMaxAttempts: 1, RetryJoin: []string{"provider=foo"}}) + close(doneCh) + }(doneCh) + + // The main test; ensure error channel is closed, indicating the retry + // limit has been reached. + select { + case <-errCh: + must.Len(t, 0, output) + case <-doneCh: + t.Fatal("retry join completed without closing error channel") + case <-timeout.C: + t.Fatal("timeout reached without error channel close") + } +} + func TestRetryJoin_Validate(t *testing.T) { ci.Parallel(t) type validateExpect struct {