diff --git a/changelog/fragments/1719934992-Fix-indefinite-memory-and-CPU-consumption-when-waiting-fleet-to-be-ready.yaml b/changelog/fragments/1719934992-Fix-indefinite-memory-and-CPU-consumption-when-waiting-fleet-to-be-ready.yaml new file mode 100644 index 00000000000..c97a91daaa5 --- /dev/null +++ b/changelog/fragments/1719934992-Fix-indefinite-memory-and-CPU-consumption-when-waiting-fleet-to-be-ready.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: bug-fix + +# Change summary; an 80ish-character description of the change. +summary: Fix indefinite memory and CPU consumption when waiting fleet to be ready + +# Long description; in case the summary is not enough to describe the change +# this field accommodates a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; a word indicating the component this changeset affects. +component: + +# PR URL; optional; the PR number that added the changeset. +# If not present, it is automatically filled by the tooling, which finds the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. 
+pr: https://github.com/elastic/elastic-agent/pull/5034 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +issue: https://github.com/elastic/elastic-agent/issues/5033 diff --git a/internal/pkg/agent/cmd/enroll_cmd.go b/internal/pkg/agent/cmd/enroll_cmd.go index edaff9ec3d3..7a858fca6ac 100644 --- a/internal/pkg/agent/cmd/enroll_cmd.go +++ b/internal/pkg/agent/cmd/enroll_cmd.go @@ -751,8 +751,16 @@ func waitForFleetServer(ctx context.Context, agentSubproc <-chan *os.ProcessStat msg := "" msgCount := 0 backExp := expBackoffWithContext(innerCtx, 1*time.Second, maxBackoff) + for { - backExp.Wait() + // if the timeout is reached, no response was sent on `res`, therefore + // send an error + if !backExp.Wait() { + resChan <- waitResult{err: fmt.Errorf( + "timed out waiting for Fleet Server to start after %s", + timeout)} + } + state, err := getDaemonState(innerCtx) if errors.Is(err, context.Canceled) { resChan <- waitResult{err: err} diff --git a/internal/pkg/agent/cmd/enroll_cmd_test.go b/internal/pkg/agent/cmd/enroll_cmd_test.go index 8b7a88232f8..a9597336760 100644 --- a/internal/pkg/agent/cmd/enroll_cmd_test.go +++ b/internal/pkg/agent/cmd/enroll_cmd_test.go @@ -577,6 +577,28 @@ func TestDaemonReloadWithBackoff(t *testing.T) { } } +func TestWaitForFleetServer_timeout(t *testing.T) { + log, _ := logger.NewTesting("TestWaitForFleetServer_timeout") + timeout := 5 * time.Second + testTimeout := 2 * timeout + + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + var got string + var err error + require.Eventuallyf(t, + func() bool { + got, err = waitForFleetServer(ctx, make(chan *os.ProcessState, 1), log, timeout) + return true + }, + testTimeout, + 500*time.Millisecond, + "waitForFleetServer never returned") + + assert.Empty(t, got, "waitForFleetServer should have returned and empty 
enrollmentToken") + assert.Error(t, err, "waitForFleetServer should have returned an error") +} + func withServer( m func(t *testing.T) *http.ServeMux, test func(t *testing.T, host string), diff --git a/internal/pkg/core/backoff/exponential.go b/internal/pkg/core/backoff/exponential.go index 51b5b4e0cb5..a959521c457 100644 --- a/internal/pkg/core/backoff/exponential.go +++ b/internal/pkg/core/backoff/exponential.go @@ -45,7 +45,10 @@ func (b *ExpBackoff) NextWait() time.Duration { return nextWait } -// Wait block until either the timer is completed or channel is done. +// Wait blocks until either the exponential backoff timer is completed or the +// done channel is closed. +// Wait returns true until done is closed. When done is closed, wait returns +// immediately, therefore callers should always check the return value. func (b *ExpBackoff) Wait() bool { b.duration = b.NextWait()