From 515bf4720c342e790ef6355613447ac1929d6487 Mon Sep 17 00:00:00 2001 From: Anderson Queriroz Date: Thu, 5 Oct 2023 06:56:31 +0200 Subject: [PATCH 01/24] fix install/enroll cmd not failing when agent restart fails * surface errors that might occur during enroll * fail install command if agent cannot be restarted * do not print success message if there was an enroll error. Print an error message and the error instead * add logs to show the different enroll attempts * add more context t errors * refactor internal/pkg/agent/install/perms_unix.go and add more context to errors restore main version * ignore agent restart error on enroll tests as there is no agent to be restarted * daemonReloadWithBackoff does not retry on context deadline exceeded --- ...-Surface-errors-during-Agent's-enroll.yaml | 32 +++++++++ dev-tools/mage/godaemon.go | 2 +- internal/pkg/agent/cmd/enroll.go | 2 +- internal/pkg/agent/cmd/enroll_cmd.go | 67 ++++++++++++----- internal/pkg/agent/cmd/enroll_cmd_test.go | 72 +++++++++++++------ internal/pkg/agent/cmd/install.go | 4 +- internal/pkg/agent/install/perms_unix.go | 29 +++++--- 7 files changed, 154 insertions(+), 54 deletions(-) create mode 100644 changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml diff --git a/changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml b/changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml new file mode 100644 index 00000000000..f8361f99433 --- /dev/null +++ b/changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: bug-fix + +# Change summary; a 80ish characters long description of the change. +summary: Surface errors during Agent's enroll process, failing if any happens. + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; a word indicating the component this changeset affects. +component: install/enroll + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/elastic-agent/pull/3207 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +#issue: https://github.com/owner/repo/1234 diff --git a/dev-tools/mage/godaemon.go b/dev-tools/mage/godaemon.go index 90960bfe69f..40d5e94564b 100644 --- a/dev-tools/mage/godaemon.go +++ b/dev-tools/mage/godaemon.go @@ -21,7 +21,7 @@ var ( } ) -// BuildGoDaemon builds the go-deamon binary. +// BuildGoDaemon builds the go-daemon binary. func BuildGoDaemon() error { if GOOS != "linux" { return errors.New("go-daemon only builds for linux") diff --git a/internal/pkg/agent/cmd/enroll.go b/internal/pkg/agent/cmd/enroll.go index 1bce5f7e547..adaa278f32f 100644 --- a/internal/pkg/agent/cmd/enroll.go +++ b/internal/pkg/agent/cmd/enroll.go @@ -351,7 +351,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error { // Error: failed to fix permissions: chown /Library/Elastic/Agent/data/elastic-agent-c13f91/elastic-agent.app: operation not permitted // This is because we are fixing permissions twice, once during installation and again during the enrollment step. // When we are enrolling as part of installation on MacOS, skip the second attempt to fix permissions. - var fixPermissions bool = fromInstall + fixPermissions := fromInstall if runtime.GOOS == "darwin" { fixPermissions = false } diff --git a/internal/pkg/agent/cmd/enroll_cmd.go b/internal/pkg/agent/cmd/enroll_cmd.go index 37540ae5249..e4488fd15c8 100644 --- a/internal/pkg/agent/cmd/enroll_cmd.go +++ b/internal/pkg/agent/cmd/enroll_cmd.go @@ -174,7 +174,7 @@ func newEnrollCmd( ) } -// newEnrollCmdWithStore creates an new enrollment and accept a custom store. +// newEnrollCmdWithStore creates a new enrollment and accept a custom store. func newEnrollCmdWithStore( log *logger.Logger, options *enrollCmdOption, @@ -189,10 +189,11 @@ func newEnrollCmdWithStore( }, nil } -// Execute tries to enroll the agent into Fleet. +// Execute enrolls the agent into Fleet. func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { var err error defer c.stopAgent() // ensure its stopped no matter what + span, ctx := apm.StartSpan(ctx, "enroll", "app.internal") defer func() { apm.CaptureError(ctx, err).Send() @@ -237,7 +238,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { // Ensure that the agent does not use a proxy configuration // when connecting to the local fleet server. // Note that when running fleet-server the enroll request will be sent to :8220, - // however when the agent is running afterwards requests will be sent to :8221 + // however when the agent is running afterward requests will be sent to :8221 c.remoteConfig.Transport.Proxy.Disable = true } @@ -258,7 +259,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { err = c.enrollWithBackoff(ctx, persistentConfig) if err != nil { - return errors.New(err, "fail to enroll") + return fmt.Errorf("fail to enroll: %w", err) } if c.options.FixPermissions { @@ -269,17 +270,23 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { } defer func() { - fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.") + if err != nil { + fmt.Fprintf(streams.Err, "Something went wrong while enrolling the Elastic Agent: %v\n", err) + } else { + fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.") + } }() if c.agentProc == nil { - if err := c.daemonReload(ctx); err != nil { - c.log.Infow("Elastic Agent might not be running; unable to trigger restart", "error", err) - } else { - c.log.Info("Successfully triggered restart on running Elastic Agent.") + if err = c.daemonReloadWithBackoff(ctx); err != nil { + c.log.Errorf("Elastic Agent might not be running; unable to trigger restart: %v", err) + return fmt.Errorf("could not reload agent daemon, unable to trigger restart: %w", err) } + + c.log.Info("Successfully triggered restart on running Elastic Agent.") return nil } + c.log.Info("Elastic Agent has been enrolled; start Elastic Agent") return nil } @@ -445,24 +452,35 @@ func (c *enrollCmd) prepareFleetTLS() error { func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error { err := c.daemonReload(ctx) + if err != nil && + (errors.Is(err, context.DeadlineExceeded) || + errors.Is(err, context.Canceled)) { + return fmt.Errorf("could not reload daemon: %w", err) + } if err == nil { return nil } signal := make(chan struct{}) + defer close(signal) backExp := backoff.NewExpBackoff(signal, 10*time.Second, 1*time.Minute) - for i := 5; i >= 0; i-- { + for i := 0; i < 5; i++ { backExp.Wait() c.log.Info("Retrying to restart...") err = c.daemonReload(ctx) + if err != nil && + (errors.Is(err, context.DeadlineExceeded) || + errors.Is(err, context.Canceled)) { + return fmt.Errorf("could not reload daemon after %d retries: %w", + i+1, err) + } if err == nil { - break + return nil } } - close(signal) - return err + return fmt.Errorf("could not reload agent's daemon, all retries failed. Last error: %w", err) } func (c *enrollCmd) daemonReload(ctx context.Context) error { @@ -480,8 +498,20 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[ c.log.Infof("Starting enrollment to URL: %s", c.client.URI()) err := c.enroll(ctx, persistentConfig) + if err == nil { + return nil + } + + const deadline = 10 * time.Minute + const frequency = 60 * time.Second + + c.log.Infof("1st enrollment attempt failed, retrying for %s, every %s enrolling to URL: %s", + deadline, + frequency, + c.client.URI()) signal := make(chan struct{}) - backExp := backoff.NewExpBackoff(signal, 60*time.Second, 10*time.Minute) + defer close(signal) + backExp := backoff.NewExpBackoff(signal, frequency, deadline) for { retry := false @@ -500,7 +530,6 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[ err = c.enroll(ctx, persistentConfig) } - close(signal) return err } @@ -549,8 +578,10 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte c.options.FleetServer.ElasticsearchInsecure, ) if err != nil { - return err + return fmt.Errorf( + "failed creating fleet-server bootstrap config: %w", err) } + // no longer need bootstrap at this point serverConfig.Server.Bootstrap = false fleetConfig.Server = serverConfig.Server @@ -570,11 +601,11 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte reader, err := yamlToReader(configToStore) if err != nil { - return err + return fmt.Errorf("yamlToReader failed: %w", err) } if err := safelyStoreAgentInfo(c.configStore, reader); err != nil { - return err + return fmt.Errorf("failed to store agent config: %w", err) } // clear action store diff --git a/internal/pkg/agent/cmd/enroll_cmd_test.go b/internal/pkg/agent/cmd/enroll_cmd_test.go index 0b1e7d5d4ee..228d876743e 100644 --- a/internal/pkg/agent/cmd/enroll_cmd_test.go +++ b/internal/pkg/agent/cmd/enroll_cmd_test.go @@ -16,8 +16,11 @@ import ( "os" "runtime" "strconv" + "strings" "testing" + "time" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/elastic/elastic-agent/internal/pkg/agent/configuration" @@ -159,14 +162,23 @@ func TestEnroll(t *testing.T) { require.NoError(t, err) streams, _, _, _ := cli.NewTestingIOStreams() - err = cmd.Execute(context.Background(), streams) - require.NoError(t, err) - + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + defer cancel() + err = cmd.Execute(ctx, streams) + + // There is no agent running, therefore nothing to be restarted. + // However, this will cause the Enroll command to return an error + // which we'll ignore here. + require.ErrorContainsf(t, err, + "could not reload agent daemon, unable to trigger restart", + "enroll command returned an unexpected error") + require.ErrorContainsf(t, err, context.DeadlineExceeded.Error(), + "it should fail only due to %q", context.DeadlineExceeded) config, err := readConfig(store.Content) - require.NoError(t, err) - require.Equal(t, "my-access-api-key", config.AccessAPIKey) - require.Equal(t, host, config.Client.Host) + + assert.Equal(t, "my-access-api-key", config.AccessAPIKey) + assert.Equal(t, host, config.Client.Host) }, )) @@ -216,16 +228,24 @@ func TestEnroll(t *testing.T) { require.NoError(t, err) streams, _, _, _ := cli.NewTestingIOStreams() - err = cmd.Execute(context.Background(), streams) - require.NoError(t, err) - - require.True(t, store.Called) - + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + defer cancel() + err = cmd.Execute(ctx, streams) + if err != nil && + // There is no agent running, therefore nothing to be restarted. + // However, this will cause the Enroll command to return an error + // which we'll ignore here. + !strings.Contains(err.Error(), + "could not reload agent daemon, unable to trigger restart") { + t.Fatalf("enrrol coms returned and unexpected error: %v", err) + } + + assert.True(t, store.Called) config, err := readConfig(store.Content) - require.NoError(t, err) - require.Equal(t, "my-access-api-key", config.AccessAPIKey) - require.Equal(t, host, config.Client.Host) + assert.NoError(t, err) + assert.Equal(t, "my-access-api-key", config.AccessAPIKey) + assert.Equal(t, host, config.Client.Host) }, )) @@ -275,16 +295,24 @@ func TestEnroll(t *testing.T) { require.NoError(t, err) streams, _, _, _ := cli.NewTestingIOStreams() - err = cmd.Execute(context.Background(), streams) - require.NoError(t, err) - - require.True(t, store.Called) - + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + defer cancel() + err = cmd.Execute(ctx, streams) + + if err != nil && + // There is no agent running, therefore nothing to be restarted. + // However, this will cause the Enroll command to return an error + // which we'll ignore here. + !strings.Contains(err.Error(), + "could not reload agent daemon, unable to trigger restart") { + t.Fatalf("enrrol coms returned and unexpected error: %v", err) + } + + assert.True(t, store.Called) config, err := readConfig(store.Content) - require.NoError(t, err) - require.Equal(t, "my-access-api-key", config.AccessAPIKey) - require.Equal(t, host, config.Client.Host) + assert.Equal(t, "my-access-api-key", config.AccessAPIKey) + assert.Equal(t, host, config.Client.Host) }, )) diff --git a/internal/pkg/agent/cmd/install.go b/internal/pkg/agent/cmd/install.go index a1415d0e59b..002a8d1b98a 100644 --- a/internal/pkg/agent/cmd/install.go +++ b/internal/pkg/agent/cmd/install.go @@ -169,7 +169,7 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error { return fmt.Errorf("problem reading prompt response") } if url == "" { - fmt.Fprintf(streams.Out, "Enrollment cancelled because no URL was provided.\n") + fmt.Fprintln(streams.Out, "Enrollment cancelled because no URL was provided.") return nil } } @@ -232,6 +232,8 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error { } }() } + + fmt.Fprintln(streams.Out, "Elastic Agent successfully installed, starting enrollment.") } if enroll { diff --git a/internal/pkg/agent/install/perms_unix.go b/internal/pkg/agent/install/perms_unix.go index 578c2af0a30..c720643e729 100644 --- a/internal/pkg/agent/install/perms_unix.go +++ b/internal/pkg/agent/install/perms_unix.go @@ -8,6 +8,7 @@ package install import ( "errors" + "fmt" "io/fs" "os" "path/filepath" @@ -18,18 +19,24 @@ import ( // FixPermissions fixes the permissions so only root:root is the owner and no world read-able permissions func FixPermissions(topPath string, ownership utils.FileOwner) error { return filepath.Walk(topPath, func(name string, info fs.FileInfo, err error) error { - if err == nil { - // all files should be owned by uid:gid - // uses `os.Lchown` so the symlink is updated to have the permissions - err = os.Lchown(name, ownership.UID, ownership.GID) - if err != nil { - return err - } - // remove any world permissions from the file - err = os.Chmod(name, info.Mode().Perm()&0770) - } else if errors.Is(err, fs.ErrNotExist) { + if errors.Is(err, fs.ErrNotExist) { return nil } - return err + if err != nil { + return fmt.Errorf("walk on %q failed: %w", topPath, err) + } + + // all files should be owned by uid:gid + // uses `os.Lchown` so the symlink is updated to have the permissions + if err := os.Lchown(name, ownership.UID, ownership.GID); err != nil { + return fmt.Errorf("cannot update ownership of %q: %w", topPath, err) + } + + // remove any world permissions from the file + if err := os.Chmod(name, info.Mode().Perm()&0770); err != nil { + return fmt.Errorf("could not update permissions of %q: %w:", topPath, err) + } + + return nil }) } From dd9a5c76f6366aa33aab6551afb48a47d25cd284 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Fri, 24 Nov 2023 19:18:00 +0100 Subject: [PATCH 02/24] Do not reload the Agent daemon if enrolling from a container The enroll command would always try to restart the daemon, however when enrolling as part of the container command, there is no running daemon to reload. This commit adds a CLI flag, --skip-daemon-reload, to the enroll command to skip the reloading step, the container command now makes use of this flag. --- ...-Agent's-enroll-process,-failing-if-any-happens..yaml} | 8 ++++---- internal/pkg/agent/cmd/container.go | 1 + internal/pkg/agent/cmd/enroll.go | 7 +++++++ internal/pkg/agent/cmd/enroll_cmd.go | 3 ++- internal/pkg/agent/install/perms_unix.go | 2 +- 5 files changed, 15 insertions(+), 6 deletions(-) rename changelog/fragments/{1693403216-Surface-errors-during-Agent's-enroll.yaml => 1700851577-Surface-errors-during-Agent's-enroll-process,-failing-if-any-happens..yaml} (85%) diff --git a/changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml b/changelog/fragments/1700851577-Surface-errors-during-Agent's-enroll-process,-failing-if-any-happens..yaml similarity index 85% rename from changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml rename to changelog/fragments/1700851577-Surface-errors-during-Agent's-enroll-process,-failing-if-any-happens..yaml index f8361f99433..916920a83e1 100644 --- a/changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml +++ b/changelog/fragments/1700851577-Surface-errors-during-Agent's-enroll-process,-failing-if-any-happens..yaml @@ -18,15 +18,15 @@ summary: Surface errors during Agent's enroll process, failing if any happens. # NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. #description: -# Affected component; a word indicating the component this changeset affects. -component: install/enroll +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: elastic-agent # PR URL; optional; the PR number that added the changeset. # If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. # NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. # Please provide it if you are adding a fragment for a different PR. -pr: https://github.com/elastic/elastic-agent/pull/3207 +pr: https://github.com/elastic/elastic-agent/pull/3815/ # Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). # If not present is automatically filled by the tooling with the issue linked to the PR number. -#issue: https://github.com/owner/repo/1234 +issue: https://github.com/elastic/elastic-agent/issues/3664 diff --git a/internal/pkg/agent/cmd/container.go b/internal/pkg/agent/cmd/container.go index 321e2fd07b3..7d3fc1df022 100644 --- a/internal/pkg/agent/cmd/container.go +++ b/internal/pkg/agent/cmd/container.go @@ -397,6 +397,7 @@ func buildEnrollArgs(cfg setupConfig, token string, policyID string) ([]string, "--path.home", paths.Top(), // --path.home actually maps to paths.Top() "--path.config", paths.Config(), "--path.logs", paths.Logs(), + "--skip-daemon-reload", } if paths.Downloads() != "" { args = append(args, "--path.downloads", paths.Downloads()) diff --git a/internal/pkg/agent/cmd/enroll.go b/internal/pkg/agent/cmd/enroll.go index adaa278f32f..8bc4aa48774 100644 --- a/internal/pkg/agent/cmd/enroll.go +++ b/internal/pkg/agent/cmd/enroll.go @@ -75,6 +75,7 @@ func addEnrollFlags(cmd *cobra.Command) { cmd.Flags().BoolP("delay-enroll", "", false, "Delays enrollment to occur on first start of the Elastic Agent service") cmd.Flags().DurationP("daemon-timeout", "", 0, "Timeout waiting for Elastic Agent daemon") cmd.Flags().DurationP("fleet-server-timeout", "", 0, "Timeout waiting for Fleet Server to be ready to start enrollment") + cmd.Flags().Bool("skip-daemon-reload", false, "Skip daemon reload after enrolling") cmd.Flags().StringSliceP("tag", "", []string{}, "User set tags") } @@ -141,6 +142,7 @@ func buildEnrollmentFlags(cmd *cobra.Command, url string, token string) []string delayEnroll, _ := cmd.Flags().GetBool("delay-enroll") daemonTimeout, _ := cmd.Flags().GetDuration("daemon-timeout") fTimeout, _ := cmd.Flags().GetDuration("fleet-server-timeout") + skipDaemonReload, _ := cmd.Flags().GetBool("skip-daemon-reload") fTags, _ := cmd.Flags().GetStringSlice("tag") args := []string{} if url != "" { @@ -249,6 +251,9 @@ func buildEnrollmentFlags(cmd *cobra.Command, url string, token string) []string args = append(args, "--fleet-server-es-insecure") } + if skipDaemonReload { + args = append(args, "--skip-daemon-reload") + } for _, v := range fTags { args = append(args, "--tag", v) } @@ -338,6 +343,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error { delayEnroll, _ := cmd.Flags().GetBool("delay-enroll") daemonTimeout, _ := cmd.Flags().GetDuration("daemon-timeout") fTimeout, _ := cmd.Flags().GetDuration("fleet-server-timeout") + skipDaemonReload, _ := cmd.Flags().GetBool("skip-daemon-reload") tags, _ := cmd.Flags().GetStringSlice("tag") caStr, _ := cmd.Flags().GetString("certificate-authorities") @@ -370,6 +376,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error { ProxyHeaders: mapFromEnvList(proxyHeaders), DelayEnroll: delayEnroll, DaemonTimeout: daemonTimeout, + SkipDaemonRestart: skipDaemonReload, Tags: tags, FleetServer: enrollCmdFleetServerOption{ ConnStr: fServer, diff --git a/internal/pkg/agent/cmd/enroll_cmd.go b/internal/pkg/agent/cmd/enroll_cmd.go index e4488fd15c8..7f365b533be 100644 --- a/internal/pkg/agent/cmd/enroll_cmd.go +++ b/internal/pkg/agent/cmd/enroll_cmd.go @@ -115,6 +115,7 @@ type enrollCmdOption struct { DelayEnroll bool `yaml:"-"` FleetServer enrollCmdFleetServerOption `yaml:"-"` SkipCreateSecret bool `yaml:"-"` + SkipDaemonRestart bool `yaml:"-"` Tags []string `yaml:"omitempty"` } @@ -277,7 +278,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { } }() - if c.agentProc == nil { + if c.agentProc == nil && !c.options.SkipDaemonRestart { if err = c.daemonReloadWithBackoff(ctx); err != nil { c.log.Errorf("Elastic Agent might not be running; unable to trigger restart: %v", err) return fmt.Errorf("could not reload agent daemon, unable to trigger restart: %w", err) diff --git a/internal/pkg/agent/install/perms_unix.go b/internal/pkg/agent/install/perms_unix.go index c720643e729..8cafc79fe8b 100644 --- a/internal/pkg/agent/install/perms_unix.go +++ b/internal/pkg/agent/install/perms_unix.go @@ -34,7 +34,7 @@ func FixPermissions(topPath string, ownership utils.FileOwner) error { // remove any world permissions from the file if err := os.Chmod(name, info.Mode().Perm()&0770); err != nil { - return fmt.Errorf("could not update permissions of %q: %w:", topPath, err) + return fmt.Errorf("could not update permissions of %q: %w", topPath, err) } return nil From cea3c7a8ff1fbceecc8d4a1bb63f98ce4ee1ec18 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Mon, 27 Nov 2023 18:19:07 +0100 Subject: [PATCH 03/24] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Paolo Chilà --- internal/pkg/agent/cmd/enroll_cmd.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/internal/pkg/agent/cmd/enroll_cmd.go b/internal/pkg/agent/cmd/enroll_cmd.go index 7f365b533be..f70589b6353 100644 --- a/internal/pkg/agent/cmd/enroll_cmd.go +++ b/internal/pkg/agent/cmd/enroll_cmd.go @@ -453,14 +453,13 @@ func (c *enrollCmd) prepareFleetTLS() error { func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error { err := c.daemonReload(ctx) - if err != nil && - (errors.Is(err, context.DeadlineExceeded) || - errors.Is(err, context.Canceled)) { - return fmt.Errorf("could not reload daemon: %w", err) - } if err == nil { return nil } + if errors.Is(err, context.DeadlineExceeded) || + errors.Is(err, context.Canceled) { + return fmt.Errorf("could not reload daemon: %w", err) + } signal := make(chan struct{}) defer close(signal) From b1d54bae3fb7a14f82f76fd6b2f745862fc38c5e Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Tue, 28 Nov 2023 17:24:31 +0100 Subject: [PATCH 04/24] PR improvements --- internal/pkg/agent/cmd/enroll.go | 2 ++ internal/pkg/agent/cmd/enroll_cmd.go | 14 +++++++------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/internal/pkg/agent/cmd/enroll.go b/internal/pkg/agent/cmd/enroll.go index 8bc4aa48774..4b7285314a9 100644 --- a/internal/pkg/agent/cmd/enroll.go +++ b/internal/pkg/agent/cmd/enroll.go @@ -77,6 +77,8 @@ func addEnrollFlags(cmd *cobra.Command) { cmd.Flags().DurationP("fleet-server-timeout", "", 0, "Timeout waiting for Fleet Server to be ready to start enrollment") cmd.Flags().Bool("skip-daemon-reload", false, "Skip daemon reload after enrolling") cmd.Flags().StringSliceP("tag", "", []string{}, "User set tags") + + cmd.Flags().MarkHidden("skip-daemon-reload") } func validateEnrollFlags(cmd *cobra.Command) error { diff --git a/internal/pkg/agent/cmd/enroll_cmd.go b/internal/pkg/agent/cmd/enroll_cmd.go index f70589b6353..e7049904269 100644 --- a/internal/pkg/agent/cmd/enroll_cmd.go +++ b/internal/pkg/agent/cmd/enroll_cmd.go @@ -457,7 +457,7 @@ func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error { return nil } if errors.Is(err, context.DeadlineExceeded) || - errors.Is(err, context.Canceled) { + errors.Is(err, context.Canceled) { return fmt.Errorf("could not reload daemon: %w", err) } @@ -468,16 +468,16 @@ func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error { for i := 0; i < 5; i++ { backExp.Wait() c.log.Info("Retrying to restart...") + err = c.daemonReload(ctx) - if err != nil && - (errors.Is(err, context.DeadlineExceeded) || - errors.Is(err, context.Canceled)) { - return fmt.Errorf("could not reload daemon after %d retries: %w", - i+1, err) - } if err == nil { return nil } + if errors.Is(err, context.DeadlineExceeded) || + errors.Is(err, context.Canceled) { + return fmt.Errorf("could not reload daemon after %d retries: %w", + i+1, err) + } } return fmt.Errorf("could not reload agent's daemon, all retries failed. Last error: %w", err) From 186ad30dc07d523eea2d30a8fdae8173306577d8 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Tue, 28 Nov 2023 17:25:09 +0100 Subject: [PATCH 05/24] Add integration test --- testing/integration/container_cmd_test.go | 102 ++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 testing/integration/container_cmd_test.go diff --git a/testing/integration/container_cmd_test.go b/testing/integration/container_cmd_test.go new file mode 100644 index 00000000000..ac5ab416482 --- /dev/null +++ b/testing/integration/container_cmd_test.go @@ -0,0 +1,102 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +//go:build integration + +package integration + +import ( + "context" + "fmt" + "os" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/elastic/elastic-agent-libs/kibana" + "github.com/elastic/elastic-agent/pkg/testing/define" + "github.com/elastic/elastic-agent/pkg/testing/tools/fleettools" +) + +func TestContainerCMD(t *testing.T) { + info := define.Require(t, define.Requirements{ + Stack: &define.Stack{}, + Local: false, + Sudo: true, + // This test runs the command we use when executing inside a container + // which leaves files under /usr/share/elastic-agent. Run it isolated + // to avoid interfering with other tests and better simulate a container + // environment we run it in isolation + Isolate: true, + }) + ctx := context.Background() + + agentFixture, err := define.NewFixture(t, define.Version()) + require.NoError(t, err) + + createPolicyReq := kibana.AgentPolicy{ + Name: fmt.Sprintf("test-policy-enroll-%d", time.Now().Unix()), + Namespace: info.Namespace, + Description: "test policy for agent enrollment", + MonitoringEnabled: []kibana.MonitoringEnabledOption{ + kibana.MonitoringEnabledLogs, + kibana.MonitoringEnabledMetrics, + }, + AgentFeatures: []map[string]interface{}{ + { + "name": "test_enroll", + "enabled": true, + }, + }, + } + + // Create policy + policy, err := info.KibanaClient.CreatePolicy(ctx, createPolicyReq) + if err != nil { + t.Fatalf("could not create Agent Policy: %s", err) + } + + // Create enrollment API key + createEnrollmentAPIKeyReq := kibana.CreateEnrollmentAPIKeyRequest{ + PolicyID: policy.ID, + } + + t.Logf("Creating enrollment API key...") + enrollmentToken, err := info.KibanaClient.CreateEnrollmentAPIKey(ctx, createEnrollmentAPIKeyReq) + if err != nil { + t.Fatalf("unable to create enrolment API key: %s", err) + } + + fleetURL, err := fleettools.DefaultURL(info.KibanaClient) + if err != nil { + t.Fatalf("could not get Fleet URL: %s", err) + } + + cmd, err := agentFixture.PrepareAgentCommand(ctx, []string{"container"}) + cmd.Env = append(os.Environ(), []string{ + "FLEET_ENROLL=1", + "FLEET_URL=" + fleetURL, + "FLEET_ENROLLMENT_TOKEN=" + enrollmentToken.APIKey, + }...) + + t.Logf(">> running binary with: %v", cmd.Args) + output, err := cmd.CombinedOutput() + if err != nil { + t.Errorf("error running container cmd: %s", err) + t.Log("Container command output:") + t.Log(string(output)) + t.FailNow() + } + + require.Eventually(t, func() bool { + healthy, err := agentFixture.IsHealthy(ctx) + if err != nil { + t.Logf("error checking agent health, retrying soon. Err: %s", err) + } + return healthy + }, + 3*time.Minute, 10*time.Second, "Elastic-Agent did not report healthy", + ) +} From b8ac7220d48cfe8f9dc87d83e270a8dbdc28adfa Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Tue, 28 Nov 2023 17:52:14 +0100 Subject: [PATCH 06/24] make lint happy --- internal/pkg/agent/cmd/enroll.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/pkg/agent/cmd/enroll.go b/internal/pkg/agent/cmd/enroll.go index 4b7285314a9..95a13341a87 100644 --- a/internal/pkg/agent/cmd/enroll.go +++ b/internal/pkg/agent/cmd/enroll.go @@ -78,7 +78,7 @@ func addEnrollFlags(cmd *cobra.Command) { cmd.Flags().Bool("skip-daemon-reload", false, "Skip daemon reload after enrolling") cmd.Flags().StringSliceP("tag", "", []string{}, "User set tags") - cmd.Flags().MarkHidden("skip-daemon-reload") + cmd.Flags().MarkHidden("skip-daemon-reload") //nolint:errcheck // an error is only returned if the flag does not exist. } func validateEnrollFlags(cmd *cobra.Command) error { From 6926fa749188f5e1e27be0aaf0559cba817141af Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Tue, 5 Dec 2023 19:46:27 +0100 Subject: [PATCH 07/24] PR improvements --- internal/pkg/agent/cmd/enroll_cmd.go | 43 +++++++++------------- internal/pkg/agent/cmd/enroll_cmd_test.go | 45 ++++++++--------------- 2 files changed, 33 insertions(+), 55 deletions(-) diff --git a/internal/pkg/agent/cmd/enroll_cmd.go b/internal/pkg/agent/cmd/enroll_cmd.go index e7049904269..4ca812bf1d8 100644 --- a/internal/pkg/agent/cmd/enroll_cmd.go +++ b/internal/pkg/agent/cmd/enroll_cmd.go @@ -15,10 +15,6 @@ import ( "strings" "time" - "github.com/elastic/elastic-agent/pkg/utils" - - "github.com/elastic/elastic-agent/pkg/control/v2/client" - "go.elastic.co/apm" "gopkg.in/yaml.v2" @@ -42,8 +38,10 @@ import ( fleetclient "github.com/elastic/elastic-agent/internal/pkg/fleetapi/client" "github.com/elastic/elastic-agent/internal/pkg/release" "github.com/elastic/elastic-agent/internal/pkg/remote" + "github.com/elastic/elastic-agent/pkg/control/v2/client" "github.com/elastic/elastic-agent/pkg/core/logger" "github.com/elastic/elastic-agent/pkg/core/process" + "github.com/elastic/elastic-agent/pkg/utils" ) const ( @@ -452,35 +450,30 @@ func (c *enrollCmd) prepareFleetTLS() error { } func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error { - err := c.daemonReload(ctx) - if err == nil { - return nil - } - if errors.Is(err, context.DeadlineExceeded) || - errors.Is(err, context.Canceled) { - return fmt.Errorf("could not reload daemon: %w", err) - } - signal := make(chan struct{}) defer close(signal) - backExp := backoff.NewExpBackoff(signal, 10*time.Second, 1*time.Minute) + backExp := backoff.NewExpBackoff(signal, 1*time.Second, 1*time.Minute) + var lastErr error for i := 0; i < 5; i++ { - backExp.Wait() - c.log.Info("Retrying to restart...") + attempt := i + + c.log.Infof("Restarting agent daemon, attempt %d", attempt) + if err := c.daemonReload(ctx); err == nil { + // If the context was cancelled, return early + if errors.Is(err, context.DeadlineExceeded) || + errors.Is(err, context.Canceled) { + return fmt.Errorf("could not reload daemon after %d retries: %w", + attempt, err) + } + lastErr = err - err = c.daemonReload(ctx) - if err == nil { - return nil - } - if errors.Is(err, context.DeadlineExceeded) || - errors.Is(err, context.Canceled) { - return fmt.Errorf("could not reload daemon after %d retries: %w", - i+1, err) + c.log.Errorf("Restart attempt %d failed: '%s'. Waiting for %s", attempt, err, backExp.NextWait().String()) + backExp.Wait() } } - return fmt.Errorf("could not reload agent's daemon, all retries failed. Last error: %w", err) + return fmt.Errorf("could not reload agent's daemon, all retries failed. Last error: %w", lastErr) } func (c *enrollCmd) daemonReload(ctx context.Context) error { diff --git a/internal/pkg/agent/cmd/enroll_cmd_test.go b/internal/pkg/agent/cmd/enroll_cmd_test.go index 228d876743e..361a9d8b9ae 100644 --- a/internal/pkg/agent/cmd/enroll_cmd_test.go +++ b/internal/pkg/agent/cmd/enroll_cmd_test.go @@ -16,7 +16,6 @@ import ( "os" "runtime" "strconv" - "strings" "testing" "time" @@ -155,6 +154,7 @@ func TestEnroll(t *testing.T) { EnrollAPIKey: "my-enrollment-api-key", UserProvidedMetadata: map[string]interface{}{"custom": "customize"}, SkipCreateSecret: skipCreateSecret, + SkipDaemonRestart: true, }, "", store, @@ -164,16 +164,11 @@ func TestEnroll(t *testing.T) { streams, _, _, _ := cli.NewTestingIOStreams() ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) defer cancel() - err = cmd.Execute(ctx, streams) - // There is no agent running, therefore nothing to be restarted. - // However, this will cause the Enroll command to return an error - // which we'll ignore here. - require.ErrorContainsf(t, err, - "could not reload agent daemon, unable to trigger restart", - "enroll command returned an unexpected error") - require.ErrorContainsf(t, err, context.DeadlineExceeded.Error(), - "it should fail only due to %q", context.DeadlineExceeded) + if err := cmd.Execute(ctx, streams); err != nil { + t.Fatalf("enrrol coms returned and unexpected error: %v", err) + } + config, err := readConfig(store.Content) require.NoError(t, err) @@ -221,6 +216,7 @@ func TestEnroll(t *testing.T) { Insecure: true, UserProvidedMetadata: map[string]interface{}{"custom": "customize"}, SkipCreateSecret: skipCreateSecret, + SkipDaemonRestart: true, }, "", store, @@ -230,22 +226,18 @@ func TestEnroll(t *testing.T) { streams, _, _, _ := cli.NewTestingIOStreams() ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) defer cancel() - err = cmd.Execute(ctx, streams) - if err != nil && - // There is no agent running, therefore nothing to be restarted. - // However, this will cause the Enroll command to return an error - // which we'll ignore here. - !strings.Contains(err.Error(), - "could not reload agent daemon, unable to trigger restart") { + + if err := cmd.Execute(ctx, streams); err != nil { t.Fatalf("enrrol coms returned and unexpected error: %v", err) } assert.True(t, store.Called) - config, err := readConfig(store.Content) - assert.NoError(t, err) - assert.Equal(t, "my-access-api-key", config.AccessAPIKey) - assert.Equal(t, host, config.Client.Host) + config, err := readConfig(store.Content) + assert.Equal(t, "my-access-api-key", config.AccessAPIKey, + "The stored 'Access API Key' must be the same returned by Fleet-Server") + assert.Equal(t, host, config.Client.Host, + "The stored Fleet-Server host must match the one used during enrol") }, )) @@ -288,6 +280,7 @@ func TestEnroll(t *testing.T) { Insecure: true, UserProvidedMetadata: map[string]interface{}{"custom": "customize"}, SkipCreateSecret: skipCreateSecret, + SkipDaemonRestart: true, }, "", store, @@ -298,15 +291,7 @@ func TestEnroll(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) defer cancel() err = cmd.Execute(ctx, streams) - - if err != nil && - // There is no agent running, therefore nothing to be restarted. - // However, this will cause the Enroll command to return an error - // which we'll ignore here. - !strings.Contains(err.Error(), - "could not reload agent daemon, unable to trigger restart") { - t.Fatalf("enrrol coms returned and unexpected error: %v", err) - } + require.NoError(t, err, "enroll command should return no error") assert.True(t, store.Called) config, err := readConfig(store.Content) From 9cc120bb829c44ff0239646e3b53ab94eed5625b Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Wed, 6 Dec 2023 09:18:54 +0100 Subject: [PATCH 08/24] Fix after rebase --- testing/integration/container_cmd_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing/integration/container_cmd_test.go b/testing/integration/container_cmd_test.go index ac5ab416482..40fa73437c6 100644 --- a/testing/integration/container_cmd_test.go +++ b/testing/integration/container_cmd_test.go @@ -29,7 +29,7 @@ func TestContainerCMD(t *testing.T) { // which leaves files under /usr/share/elastic-agent. Run it isolated // to avoid interfering with other tests and better simulate a container // environment we run it in isolation - Isolate: true, + Group: "container", }) ctx := context.Background() From 2cdf80feae1246291b23acef6ac9a16a77678f8b Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Wed, 6 Dec 2023 18:37:28 +0100 Subject: [PATCH 09/24] Fix some issues --- internal/pkg/agent/cmd/enroll_cmd.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/pkg/agent/cmd/enroll_cmd.go b/internal/pkg/agent/cmd/enroll_cmd.go index 4ca812bf1d8..e0b9eeb6f68 100644 --- a/internal/pkg/agent/cmd/enroll_cmd.go +++ b/internal/pkg/agent/cmd/enroll_cmd.go @@ -459,7 +459,7 @@ func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error { attempt := i c.log.Infof("Restarting agent daemon, attempt %d", attempt) - if err := c.daemonReload(ctx); err == nil { + if err := c.daemonReload(ctx); err != nil { // If the context was cancelled, return early if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) { From a54f2672914f0158b10ff4a3972d4d97f95670d1 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Thu, 7 Dec 2023 09:31:37 +0100 Subject: [PATCH 10/24] more PR improvments --- internal/pkg/agent/cmd/enroll_cmd_test.go | 2 +- testing/integration/container_cmd_test.go | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/internal/pkg/agent/cmd/enroll_cmd_test.go b/internal/pkg/agent/cmd/enroll_cmd_test.go index 361a9d8b9ae..2d28db5a4f8 100644 --- a/internal/pkg/agent/cmd/enroll_cmd_test.go +++ b/internal/pkg/agent/cmd/enroll_cmd_test.go @@ -232,8 +232,8 @@ func TestEnroll(t *testing.T) { } assert.True(t, store.Called) - config, err := readConfig(store.Content) + require.NoError(t, err, "readConfig returned an error") assert.Equal(t, "my-access-api-key", config.AccessAPIKey, "The stored 'Access API Key' must be the same returned by Fleet-Server") assert.Equal(t, host, config.Client.Host, diff --git a/testing/integration/container_cmd_test.go b/testing/integration/container_cmd_test.go index 40fa73437c6..36bd5d7bcb3 100644 --- a/testing/integration/container_cmd_test.go +++ b/testing/integration/container_cmd_test.go @@ -75,11 +75,11 @@ func TestContainerCMD(t *testing.T) { } cmd, err := agentFixture.PrepareAgentCommand(ctx, []string{"container"}) - cmd.Env = append(os.Environ(), []string{ + cmd.Env = append(os.Environ(), "FLEET_ENROLL=1", - "FLEET_URL=" + fleetURL, - "FLEET_ENROLLMENT_TOKEN=" + enrollmentToken.APIKey, - }...) + "FLEET_URL="+fleetURL, + "FLEET_ENROLLMENT_TOKEN="+enrollmentToken.APIKey, + ) t.Logf(">> running binary with: %v", cmd.Args) output, err := cmd.CombinedOutput() From e82ad16b122807d9d4f942f46f8aaf0831b34b30 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Mon, 11 Dec 2023 15:53:46 +0100 Subject: [PATCH 11/24] Fix enroll command --- internal/pkg/agent/cmd/enroll_cmd.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/pkg/agent/cmd/enroll_cmd.go b/internal/pkg/agent/cmd/enroll_cmd.go index e0b9eeb6f68..ad124debcbf 100644 --- a/internal/pkg/agent/cmd/enroll_cmd.go +++ b/internal/pkg/agent/cmd/enroll_cmd.go @@ -276,7 +276,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { } }() - if c.agentProc == nil && !c.options.SkipDaemonRestart { + if c.agentProc != nil && !c.options.SkipDaemonRestart { if err = c.daemonReloadWithBackoff(ctx); err != nil { c.log.Errorf("Elastic Agent might not be running; unable to trigger restart: %v", err) return fmt.Errorf("could not reload agent daemon, unable to trigger restart: %w", err) From 651fdf2dd4ba514ef4b034198cb9f06856fcc527 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Wed, 13 Dec 2023 20:46:29 +0100 Subject: [PATCH 12/24] Fix TestContainterCMD --- testing/integration/container_cmd_test.go | 41 ++++++++++++++++------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/testing/integration/container_cmd_test.go b/testing/integration/container_cmd_test.go index 36bd5d7bcb3..9733d95fcfe 100644 --- a/testing/integration/container_cmd_test.go +++ b/testing/integration/container_cmd_test.go @@ -10,6 +10,7 @@ import ( "context" "fmt" "os" + "strings" "testing" "time" @@ -74,7 +75,23 @@ func TestContainerCMD(t *testing.T) { t.Fatalf("could not get Fleet URL: %s", err) } + ctx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() cmd, err := agentFixture.PrepareAgentCommand(ctx, []string{"container"}) + if err != nil { + t.Fatalf("could not prepare agent command: %s", err) + } + + t.Cleanup(func() { + t.Log(">> cleaning up: killing the Elastic-Agent process") + if err := cmd.Process.Kill(); err != nil { + t.Fatalf("could not kill Elastic-Agent process: %s", err) + } + }) + + agentOutput := strings.Builder{} + cmd.Stderr = &agentOutput + cmd.Stdout = &agentOutput cmd.Env = append(os.Environ(), "FLEET_ENROLL=1", "FLEET_URL="+fleetURL, @@ -82,21 +99,21 @@ func TestContainerCMD(t *testing.T) { ) t.Logf(">> running binary with: %v", cmd.Args) - output, err := cmd.CombinedOutput() - if err != nil { - t.Errorf("error running container cmd: %s", err) - t.Log("Container command output:") - t.Log(string(output)) - t.FailNow() + if err := cmd.Start(); err != nil { + t.Fatalf("error running container cmd: %s", err) } - require.Eventually(t, func() bool { - healthy, err := agentFixture.IsHealthy(ctx) - if err != nil { - t.Logf("error checking agent health, retrying soon. Err: %s", err) - } + require.Eventuallyf(t, func() bool { + // This will returns errors until it connects to the agent, + // they're mostly noise because until the agent starts running + // we will get connection errors. If the test fails + // the agent logs will be present in the error message + // which should help to explain why the agent was not + // healthy. + healthy, _ := agentFixture.IsHealthy(ctx) return healthy }, - 3*time.Minute, 10*time.Second, "Elastic-Agent did not report healthy", + 3*time.Minute, time.Second, + "Elastic-Agent did not report healthy. Agent logs\n%s", &agentOutput, ) } From 5ff02a9af1e87fefc6993496db98dbbb6add3ebf Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Thu, 14 Dec 2023 15:33:21 +0100 Subject: [PATCH 13/24] Fix implementation --- internal/pkg/agent/cmd/enroll_cmd.go | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/internal/pkg/agent/cmd/enroll_cmd.go b/internal/pkg/agent/cmd/enroll_cmd.go index ad124debcbf..d3de412ba44 100644 --- a/internal/pkg/agent/cmd/enroll_cmd.go +++ b/internal/pkg/agent/cmd/enroll_cmd.go @@ -276,7 +276,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { } }() - if c.agentProc != nil && !c.options.SkipDaemonRestart { + if c.agentProc == nil && !c.options.SkipDaemonRestart { if err = c.daemonReloadWithBackoff(ctx); err != nil { c.log.Errorf("Elastic Agent might not be running; unable to trigger restart: %v", err) return fmt.Errorf("could not reload agent daemon, unable to trigger restart: %w", err) @@ -459,18 +459,22 @@ func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error { attempt := i c.log.Infof("Restarting agent daemon, attempt %d", attempt) - if err := c.daemonReload(ctx); err != nil { - // If the context was cancelled, return early - if errors.Is(err, context.DeadlineExceeded) || - errors.Is(err, context.Canceled) { - return fmt.Errorf("could not reload daemon after %d retries: %w", - attempt, err) - } - lastErr = err + err := c.daemonReload(ctx) + if err == nil { + return nil + } - c.log.Errorf("Restart attempt %d failed: '%s'. Waiting for %s", attempt, err, backExp.NextWait().String()) - backExp.Wait() + // If the context was cancelled, return early + if errors.Is(err, context.DeadlineExceeded) || + errors.Is(err, context.Canceled) { + return fmt.Errorf("could not reload daemon after %d retries: %w", + attempt, err) } + lastErr = err + + c.log.Errorf("Restart attempt %d failed: '%s'. Waiting for %s", attempt, err, backExp.NextWait().String()) + backExp.Wait() + } return fmt.Errorf("could not reload agent's daemon, all retries failed. Last error: %w", lastErr) From 4596545eb14417397e16d9116fa891185139c5da Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Thu, 14 Dec 2023 15:37:27 +0100 Subject: [PATCH 14/24] Remove flaky integration test assertion Asserting there are no errors in the logs from Elastic-Agent and all Beats is flaky and does not ensure the Elastic-Agent is working correctly. The test already assert the healthy of all components, so there is no need to look in the logs. The number of exceptions this assertion for no log errors is already an example of how fragile this is. The Elastic-Agent life cycle is complex and some transient errors are expected, as the code evolves those errors or messages will change, making assertion on them flaky. --- testing/integration/logs_ingestion_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/testing/integration/logs_ingestion_test.go b/testing/integration/logs_ingestion_test.go index fa879ffafef..e8a86df4a0b 100644 --- a/testing/integration/logs_ingestion_test.go +++ b/testing/integration/logs_ingestion_test.go @@ -151,7 +151,7 @@ func testMonitoringLogsAreShipped( } require.Empty(t, docs.Hits.Hits) - // Stage 4: Make sure we have message confirming central management is running + // Stage 3: Make sure we have message confirming central management is running t.Log("Making sure we have message confirming central management is running") docs = findESDocs(t, func() (estools.Documents, error) { return estools.FindMatchingLogLines(ctx, info.ESClient, info.Namespace, @@ -159,7 +159,7 @@ func testMonitoringLogsAreShipped( }) require.NotZero(t, len(docs.Hits.Hits)) - // Stage 5: verify logs from the monitoring components are not sent to the output + // Stage 4: verify logs from the monitoring components are not sent to the output t.Log("Check monitoring logs") hostname, err := os.Hostname() if err != nil { From 02c086aa569a9da36b1382ac096284358dedefd9 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Thu, 14 Dec 2023 16:26:08 +0100 Subject: [PATCH 15/24] rename fragment --- ...ors-during-Agent_s-enroll-process-failing-if-any-happens.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename changelog/fragments/{1700851577-Surface-errors-during-Agent's-enroll-process,-failing-if-any-happens..yaml => 1700851577-Surface-errors-during-Agent_s-enroll-process-failing-if-any-happens.yaml} (100%) diff --git a/changelog/fragments/1700851577-Surface-errors-during-Agent's-enroll-process,-failing-if-any-happens..yaml b/changelog/fragments/1700851577-Surface-errors-during-Agent_s-enroll-process-failing-if-any-happens.yaml similarity index 100% rename from changelog/fragments/1700851577-Surface-errors-during-Agent's-enroll-process,-failing-if-any-happens..yaml rename to changelog/fragments/1700851577-Surface-errors-during-Agent_s-enroll-process-failing-if-any-happens.yaml From 37b9b91d3f4ac5b8e89328c152f2179a4fed6242 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Mon, 18 Dec 2023 11:53:11 +0100 Subject: [PATCH 16/24] fix tests --- testing/integration/container_cmd_test.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/testing/integration/container_cmd_test.go b/testing/integration/container_cmd_test.go index 9733d95fcfe..7c42ff47adc 100644 --- a/testing/integration/container_cmd_test.go +++ b/testing/integration/container_cmd_test.go @@ -26,6 +26,9 @@ func TestContainerCMD(t *testing.T) { Stack: &define.Stack{}, Local: false, Sudo: true, + OS: []define.OS{ + {Type: define.Linux}, + }, // This test runs the command we use when executing inside a container // which leaves files under /usr/share/elastic-agent. Run it isolated // to avoid interfering with other tests and better simulate a container From 8ed90d3eca16140c7939d585b8fd4bd702537937 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Thu, 21 Dec 2023 08:50:17 +0100 Subject: [PATCH 17/24] Fix tests after rebase --- testing/integration/container_cmd_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing/integration/container_cmd_test.go b/testing/integration/container_cmd_test.go index 7c42ff47adc..90e1b844a26 100644 --- a/testing/integration/container_cmd_test.go +++ b/testing/integration/container_cmd_test.go @@ -73,7 +73,7 @@ func TestContainerCMD(t *testing.T) { t.Fatalf("unable to create enrolment API key: %s", err) } - fleetURL, err := fleettools.DefaultURL(info.KibanaClient) + fleetURL, err := fleettools.DefaultURL(ctx, info.KibanaClient) if err != nil { t.Fatalf("could not get Fleet URL: %s", err) } From 3a5719dd4f5744567a2e7500674a1122b5cf5f35 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Thu, 21 Dec 2023 16:23:38 +0100 Subject: [PATCH 18/24] foo From 13fd58771d1014412daaf7ef3aba62442b052257 Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Thu, 21 Dec 2023 19:52:24 +0100 Subject: [PATCH 19/24] Fix a possible panic --- testing/integration/container_cmd_test.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/testing/integration/container_cmd_test.go b/testing/integration/container_cmd_test.go index 90e1b844a26..f0fd7bfd5c9 100644 --- a/testing/integration/container_cmd_test.go +++ b/testing/integration/container_cmd_test.go @@ -86,10 +86,14 @@ func TestContainerCMD(t *testing.T) { } t.Cleanup(func() { - t.Log(">> cleaning up: killing the Elastic-Agent process") - if err := cmd.Process.Kill(); err != nil { - t.Fatalf("could not kill Elastic-Agent process: %s", err) + if cmd.Process != nil { + t.Log(">> cleaning up: killing the Elastic-Agent process") + if err := cmd.Process.Kill(); err != nil { + t.Fatalf("could not kill Elastic-Agent process: %s", err) + } + return } + t.Log(">> cleaning up: no process to kill") }) agentOutput := strings.Builder{} From a7a0a5fa38a6d86e7ad9c66fec6ad04bdbe30266 Mon Sep 17 00:00:00 2001 From: Anderson Queiroz Date: Wed, 27 Dec 2023 17:15:59 +0100 Subject: [PATCH 20/24] improve error message on TestContainerCMD --- testing/integration/container_cmd_test.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/testing/integration/container_cmd_test.go b/testing/integration/container_cmd_test.go index f0fd7bfd5c9..edd71d54d1f 100644 --- a/testing/integration/container_cmd_test.go +++ b/testing/integration/container_cmd_test.go @@ -111,16 +111,18 @@ func TestContainerCMD(t *testing.T) { } require.Eventuallyf(t, func() bool { + var healthy bool // This will returns errors until it connects to the agent, // they're mostly noise because until the agent starts running // we will get connection errors. If the test fails // the agent logs will be present in the error message // which should help to explain why the agent was not // healthy. - healthy, _ := agentFixture.IsHealthy(ctx) + healthy, err = agentFixture.IsHealthy(ctx) return healthy }, 3*time.Minute, time.Second, - "Elastic-Agent did not report healthy. Agent logs\n%s", &agentOutput, + "Elastic-Agent did not report healthy. Agent status error: %v, Agent logs\n%s", + err, &agentOutput, ) } From 6f4bf64242b975f964f908a8c8c4dd455c930a28 Mon Sep 17 00:00:00 2001 From: Anderson Queiroz Date: Wed, 27 Dec 2023 18:34:35 +0100 Subject: [PATCH 21/24] increase test timeout --- testing/integration/container_cmd_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/testing/integration/container_cmd_test.go b/testing/integration/container_cmd_test.go index edd71d54d1f..d3649e529ae 100644 --- a/testing/integration/container_cmd_test.go +++ b/testing/integration/container_cmd_test.go @@ -121,8 +121,8 @@ func TestContainerCMD(t *testing.T) { healthy, err = agentFixture.IsHealthy(ctx) return healthy }, - 3*time.Minute, time.Second, - "Elastic-Agent did not report healthy. Agent status error: %v, Agent logs\n%s", + 5*time.Minute, time.Second, + "Elastic-Agent did not report healthy. Agent status error: \"%v\", Agent logs\n%s", err, &agentOutput, ) } From b633108020d437846a232250747ee452de3b48c3 Mon Sep 17 00:00:00 2001 From: Anderson Queiroz Date: Wed, 27 Dec 2023 20:23:30 +0100 Subject: [PATCH 22/24] add debug logs --- internal/pkg/agent/cmd/platformcheck_test.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/internal/pkg/agent/cmd/platformcheck_test.go b/internal/pkg/agent/cmd/platformcheck_test.go index f1abbbc0ddd..2e3c5d8e8b1 100644 --- a/internal/pkg/agent/cmd/platformcheck_test.go +++ b/internal/pkg/agent/cmd/platformcheck_test.go @@ -29,13 +29,15 @@ func TestCheckPlatformCompat(t *testing.T) { cmd.Stderr = os.Stderr cmd.Env = append(os.Environ(), "GOARCH=386") require.NoError(t, cmd.Run(), "failed to compile test helper") + t.Logf("compiled test binary %q", helper) // run test helper cmd = exec.Command(helper, "-test.v", "-test.run", "TestHelper") cmd.Env = []string{"GO_USE_HELPER=1"} + t.Logf("running %q", cmd.Args) output, err := cmd.Output() if err != nil { - t.Logf("32bit binary tester failed.\n Output: %s", output) + t.Fatalf("32bit binary tester failed.\n Output: %s", output) } } From 8424f8c3f56bb2065581630979831e658b176d78 Mon Sep 17 00:00:00 2001 From: Anderson Queiroz Date: Thu, 28 Dec 2023 09:11:17 +0100 Subject: [PATCH 23/24] restore logf and add error to message --- internal/pkg/agent/cmd/platformcheck_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/internal/pkg/agent/cmd/platformcheck_test.go b/internal/pkg/agent/cmd/platformcheck_test.go index 2e3c5d8e8b1..857a486ef19 100644 --- a/internal/pkg/agent/cmd/platformcheck_test.go +++ b/internal/pkg/agent/cmd/platformcheck_test.go @@ -37,7 +37,8 @@ func TestCheckPlatformCompat(t *testing.T) { t.Logf("running %q", cmd.Args) output, err := cmd.Output() if err != nil { - t.Fatalf("32bit binary tester failed.\n Output: %s", output) + t.Logf("32bit binary tester failed.\n Err: %v\nOutput: %s", + err, output) } } From 7ac5fe9d5b5b2bcad5e89c9ecf3b2f52476d1f4b Mon Sep 17 00:00:00 2001 From: Anderson Queiroz Date: Thu, 28 Dec 2023 10:40:46 +0100 Subject: [PATCH 24/24] add better error message to Test_K8sSecretsProvider_Fetch_Cache_Enabled --- .../kubernetessecrets/kubernetes_secrets_test.go | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/internal/pkg/composable/providers/kubernetessecrets/kubernetes_secrets_test.go b/internal/pkg/composable/providers/kubernetessecrets/kubernetes_secrets_test.go index 78d632d6437..9f468fcabfe 100644 --- a/internal/pkg/composable/providers/kubernetessecrets/kubernetes_secrets_test.go +++ b/internal/pkg/composable/providers/kubernetessecrets/kubernetes_secrets_test.go @@ -6,6 +6,8 @@ package kubernetessecrets import ( "context" + "fmt" + "strings" "testing" "time" @@ -184,10 +186,20 @@ func Test_K8sSecretsProvider_Fetch_Cache_Enabled(t *testing.T) { // wait for ttl update <-time.After(refreshInterval) + status := &strings.Builder{} + duration := refreshInterval * 3 assert.Eventuallyf(t, func() bool { val, found = fp.Fetch(key) - return found && val == newPass - }, refreshInterval*3, refreshInterval, "Failed to update the secret value after TTL update has passed.") + isNewPass := val == newPass + if found && isNewPass { + return true + } + + fmt.Fprintf(status, "found: %t, isNewPass: %t", found, isNewPass) + return false + }, duration, refreshInterval, + "Failed to update the secret value after TTL update has passed. Tried fetching for %d. Last status: %s", + duration, status) // After TTL delete, secret should no longer be found in cache since it was never // fetched during that time