Skip to content

Commit

Permalink
Fix indefinite memory and CPU consumption when waiting fleet to be re…
Browse files Browse the repository at this point in the history
…ady (#5034)

* exit if timeout is reached while waiting for fleet server to start

* clarify exponential backoff behaviour

* add test

* add changelog

* fix changelog
  • Loading branch information
AndersonQ authored Jul 2, 2024
1 parent f858169 commit 8aa3477
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 2 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: bug-fix

# Change summary; an 80ish-character-long description of the change.
summary: Fix indefinite memory and CPU consumption when waiting for Fleet Server to be ready

# Long description; in case the summary is not enough to describe the change
# this field accommodates a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
#description:

# Affected component; a word indicating the component this changeset affects.
component:

# PR URL; optional; the PR number that added the changeset.
# If not present, it is automatically filled in by the tooling, which finds the PR where this changelog fragment was added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
pr: https://github.com/elastic/elastic-agent/pull/5034

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present, it is automatically filled in by the tooling with the issue linked to the PR number.
issue: https://github.com/elastic/elastic-agent/issues/5033
10 changes: 9 additions & 1 deletion internal/pkg/agent/cmd/enroll_cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -751,8 +751,16 @@ func waitForFleetServer(ctx context.Context, agentSubproc <-chan *os.ProcessStat
msg := ""
msgCount := 0
backExp := expBackoffWithContext(innerCtx, 1*time.Second, maxBackoff)

for {
backExp.Wait()
// if the timeout is reached, no response was sent on `res`, therefore
// send an error
if !backExp.Wait() {
resChan <- waitResult{err: fmt.Errorf(
"timed out waiting for Fleet Server to start after %s",
timeout)}
}

state, err := getDaemonState(innerCtx)
if errors.Is(err, context.Canceled) {
resChan <- waitResult{err: err}
Expand Down
22 changes: 22 additions & 0 deletions internal/pkg/agent/cmd/enroll_cmd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -577,6 +577,28 @@ func TestDaemonReloadWithBackoff(t *testing.T) {
}
}

// TestWaitForFleetServer_timeout checks that waitForFleetServer gives up
// instead of blocking forever: the call must return within testTimeout and
// yield an empty enrollment token together with a non-nil error.
func TestWaitForFleetServer_timeout(t *testing.T) {
	log, _ := logger.NewTesting("TestWaitForFleetServer_timeout")
	timeout := 5 * time.Second
	// Allow twice the timeout for the call to return before declaring a hang.
	testTimeout := 2 * timeout

	// NOTE(review): ctx carries the same deadline as the timeout argument, so
	// either one may be what ends the wait; the assertions below only require
	// that some error is returned.
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	var got string
	var err error
	// The condition blocks inside waitForFleetServer and always reports true,
	// so Eventuallyf acts purely as a guard: it fails only when the call has
	// not returned within testTimeout. Nothing is ever sent on the process
	// state channel passed as the second argument.
	require.Eventuallyf(t,
		func() bool {
			got, err = waitForFleetServer(ctx, make(chan *os.ProcessState, 1), log, timeout)
			return true
		},
		testTimeout,
		500*time.Millisecond,
		"waitForFleetServer never returned")

	assert.Empty(t, got, "waitForFleetServer should have returned an empty enrollmentToken")
	assert.Error(t, err, "waitForFleetServer should have returned an error")
}

func withServer(
m func(t *testing.T) *http.ServeMux,
test func(t *testing.T, host string),
Expand Down
5 changes: 4 additions & 1 deletion internal/pkg/core/backoff/exponential.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,10 @@ func (b *ExpBackoff) NextWait() time.Duration {
return nextWait
}

// Wait block until either the timer is completed or channel is done.
// Wait blocks until either the exponential backoff timer is completed or the
// done channel is closed.
// Wait returns true until done is closed. Once done is closed, Wait returns
// false immediately, therefore callers should always check the return value.
func (b *ExpBackoff) Wait() bool {
b.duration = b.NextWait()

Expand Down

0 comments on commit 8aa3477

Please sign in to comment.