diff --git a/changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml b/changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml new file mode 100644 index 00000000000..f8361f99433 --- /dev/null +++ b/changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: bug-fix + +# Change summary; a 80ish characters long description of the change. +summary: Surface errors during Agent's enroll process, failing if any happens. + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; a word indicating the component this changeset affects. +component: install/enroll + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/elastic-agent/pull/3207 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +#issue: https://github.com/owner/repo/1234 diff --git a/dev-tools/mage/godaemon.go b/dev-tools/mage/godaemon.go index 90960bfe69f..40d5e94564b 100644 --- a/dev-tools/mage/godaemon.go +++ b/dev-tools/mage/godaemon.go @@ -21,7 +21,7 @@ var ( } ) -// BuildGoDaemon builds the go-deamon binary. +// BuildGoDaemon builds the go-daemon binary. func BuildGoDaemon() error { if GOOS != "linux" { return errors.New("go-daemon only builds for linux") diff --git a/internal/pkg/agent/cmd/enroll.go b/internal/pkg/agent/cmd/enroll.go index 1bce5f7e547..adaa278f32f 100644 --- a/internal/pkg/agent/cmd/enroll.go +++ b/internal/pkg/agent/cmd/enroll.go @@ -351,7 +351,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error { // Error: failed to fix permissions: chown /Library/Elastic/Agent/data/elastic-agent-c13f91/elastic-agent.app: operation not permitted // This is because we are fixing permissions twice, once during installation and again during the enrollment step. // When we are enrolling as part of installation on MacOS, skip the second attempt to fix permissions. - var fixPermissions bool = fromInstall + fixPermissions := fromInstall if runtime.GOOS == "darwin" { fixPermissions = false } diff --git a/internal/pkg/agent/cmd/enroll_cmd.go b/internal/pkg/agent/cmd/enroll_cmd.go index 153c488e37a..f72183c9745 100644 --- a/internal/pkg/agent/cmd/enroll_cmd.go +++ b/internal/pkg/agent/cmd/enroll_cmd.go @@ -189,7 +189,7 @@ func newEnrollCmd( ) } -// newEnrollCmdWithStore creates an new enrollment and accept a custom store. +// newEnrollCmdWithStore creates a new enrollment and accept a custom store. func newEnrollCmdWithStore( log *logger.Logger, options *enrollCmdOption, @@ -204,10 +204,11 @@ func newEnrollCmdWithStore( }, nil } -// Execute tries to enroll the agent into Fleet. +// Execute enrolls the agent into Fleet. func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { var err error defer c.stopAgent() // ensure its stopped no matter what + span, ctx := apm.StartSpan(ctx, "enroll", "app.internal") defer func() { apm.CaptureError(ctx, err).Send() @@ -252,7 +253,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { // Ensure that the agent does not use a proxy configuration // when connecting to the local fleet server. // Note that when running fleet-server the enroll request will be sent to :8220, - // however when the agent is running afterwards requests will be sent to :8221 + // however when the agent is running afterward requests will be sent to :8221 c.remoteConfig.Transport.Proxy.Disable = true } @@ -273,7 +274,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { err = c.enrollWithBackoff(ctx, persistentConfig) if err != nil { - return errors.New(err, "fail to enroll") + return fmt.Errorf("fail to enroll: %w", err) } if c.options.FixPermissions { @@ -284,17 +285,23 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { } defer func() { - fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.") + if err != nil { + fmt.Fprintf(streams.Err, "Something went wrong while enrolling the Elastic Agent: %v\n", err) + } else { + fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.") + } }() if c.agentProc == nil { - if err := c.daemonReload(ctx); err != nil { - c.log.Infow("Elastic Agent might not be running; unable to trigger restart", "error", err) - } else { - c.log.Info("Successfully triggered restart on running Elastic Agent.") + if err = c.daemonReloadWithBackoff(ctx); err != nil { + c.log.Errorf("Elastic Agent might not be running; unable to trigger restart: %v", err) + return fmt.Errorf("could not reload agent daemon, unable to trigger restart: %w", err) } + + c.log.Info("Successfully triggered restart on running Elastic Agent.") return nil } + c.log.Info("Elastic Agent has been enrolled; start Elastic Agent") return nil } @@ -460,24 +467,35 @@ func (c *enrollCmd) prepareFleetTLS() error { func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error { err := c.daemonReload(ctx) + if err != nil && + (errors.Is(err, context.DeadlineExceeded) || + errors.Is(err, context.Canceled)) { + return fmt.Errorf("could not reload daemon: %w", err) + } if err == nil { return nil } signal := make(chan struct{}) + defer close(signal) backExp := backoff.NewExpBackoff(signal, 10*time.Second, 1*time.Minute) - for i := 5; i >= 0; i-- { + for i := 0; i < 5; i++ { backExp.Wait() c.log.Info("Retrying to restart...") err = c.daemonReload(ctx) + if err != nil && + (errors.Is(err, context.DeadlineExceeded) || + errors.Is(err, context.Canceled)) { + return fmt.Errorf("could not reload daemon after %d retries: %w", + i+1, err) + } if err == nil { - break + return nil } } - close(signal) - return err + return fmt.Errorf("could not reload agent's daemon, all retries failed. Last error: %w", err) } func (c *enrollCmd) daemonReload(ctx context.Context) (err error) { @@ -501,8 +519,20 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[ c.log.Infof("Starting enrollment to URL: %s", c.client.URI()) err := c.enroll(ctx, persistentConfig) + if err == nil { + return nil + } + + const deadline = 10 * time.Minute + const frequency = 60 * time.Second + + c.log.Infof("1st enrollment attempt failed, retrying for %s, every %s enrolling to URL: %s", + deadline, + frequency, + c.client.URI()) signal := make(chan struct{}) - backExp := backoff.NewExpBackoff(signal, 60*time.Second, 10*time.Minute) + defer close(signal) + backExp := backoff.NewExpBackoff(signal, frequency, deadline) for { retry := false @@ -521,7 +551,6 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[ err = c.enroll(ctx, persistentConfig) } - close(signal) return err } @@ -570,8 +599,10 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte c.options.FleetServer.ElasticsearchInsecure, ) if err != nil { - return err + return fmt.Errorf( + "failed creating fleet-server bootstrap config: %w", err) } + // no longer need bootstrap at this point serverConfig.Server.Bootstrap = false fleetConfig.Server = serverConfig.Server @@ -591,11 +622,11 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte reader, err := yamlToReader(configToStore) if err != nil { - return err + return fmt.Errorf("yamlToReader failed: %w", err) } if err := safelyStoreAgentInfo(c.configStore, reader); err != nil { - return err + return fmt.Errorf("failed to store agent config: %w", err) } // clear action store diff --git a/internal/pkg/agent/cmd/enroll_cmd_test.go b/internal/pkg/agent/cmd/enroll_cmd_test.go index d34bf200148..ca62f82024d 100644 --- a/internal/pkg/agent/cmd/enroll_cmd_test.go +++ b/internal/pkg/agent/cmd/enroll_cmd_test.go @@ -17,8 +17,11 @@ import ( "os" "runtime" "strconv" + "strings" "testing" + "time" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/elastic/elastic-agent/internal/pkg/agent/configuration" @@ -160,14 +163,23 @@ func TestEnroll(t *testing.T) { require.NoError(t, err) streams, _, _, _ := cli.NewTestingIOStreams() - err = cmd.Execute(context.Background(), streams) - require.NoError(t, err) - + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + defer cancel() + err = cmd.Execute(ctx, streams) + + // There is no agent running, therefore nothing to be restarted. + // However, this will cause the Enroll command to return an error + // which we'll ignore here. + require.ErrorContainsf(t, err, + "could not reload agent daemon, unable to trigger restart", + "enroll command returned an unexpected error") + require.ErrorContainsf(t, err, context.DeadlineExceeded.Error(), + "it should fail only due to %q", context.DeadlineExceeded) config, err := readConfig(store.Content) - require.NoError(t, err) - require.Equal(t, "my-access-api-key", config.AccessAPIKey) - require.Equal(t, host, config.Client.Host) + + assert.Equal(t, "my-access-api-key", config.AccessAPIKey) + assert.Equal(t, host, config.Client.Host) }, )) @@ -217,16 +229,24 @@ func TestEnroll(t *testing.T) { require.NoError(t, err) streams, _, _, _ := cli.NewTestingIOStreams() - err = cmd.Execute(context.Background(), streams) - require.NoError(t, err) - - require.True(t, store.Called) - + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + defer cancel() + err = cmd.Execute(ctx, streams) + if err != nil && + // There is no agent running, therefore nothing to be restarted. + // However, this will cause the Enroll command to return an error + // which we'll ignore here. + !strings.Contains(err.Error(), + "could not reload agent daemon, unable to trigger restart") { + t.Fatalf("enrrol coms returned and unexpected error: %v", err) + } + + assert.True(t, store.Called) config, err := readConfig(store.Content) - require.NoError(t, err) - require.Equal(t, "my-access-api-key", config.AccessAPIKey) - require.Equal(t, host, config.Client.Host) + assert.NoError(t, err) + assert.Equal(t, "my-access-api-key", config.AccessAPIKey) + assert.Equal(t, host, config.Client.Host) }, )) @@ -276,16 +296,24 @@ func TestEnroll(t *testing.T) { require.NoError(t, err) streams, _, _, _ := cli.NewTestingIOStreams() - err = cmd.Execute(context.Background(), streams) - require.NoError(t, err) - - require.True(t, store.Called) - + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + defer cancel() + err = cmd.Execute(ctx, streams) + + if err != nil && + // There is no agent running, therefore nothing to be restarted. + // However, this will cause the Enroll command to return an error + // which we'll ignore here. + !strings.Contains(err.Error(), + "could not reload agent daemon, unable to trigger restart") { + t.Fatalf("enrrol coms returned and unexpected error: %v", err) + } + + assert.True(t, store.Called) config, err := readConfig(store.Content) - require.NoError(t, err) - require.Equal(t, "my-access-api-key", config.AccessAPIKey) - require.Equal(t, host, config.Client.Host) + assert.Equal(t, "my-access-api-key", config.AccessAPIKey) + assert.Equal(t, host, config.Client.Host) }, )) diff --git a/internal/pkg/agent/cmd/install.go b/internal/pkg/agent/cmd/install.go index 4fbd37f40da..2cb46bd599d 100644 --- a/internal/pkg/agent/cmd/install.go +++ b/internal/pkg/agent/cmd/install.go @@ -154,7 +154,7 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error { return fmt.Errorf("problem reading prompt response") } if url == "" { - fmt.Fprintf(streams.Out, "Enrollment cancelled because no URL was provided.\n") + fmt.Fprintln(streams.Out, "Enrollment cancelled because no URL was provided.") return nil } } @@ -224,6 +224,8 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error { } }() } + + fmt.Fprintln(streams.Out, "Elastic Agent successfully installed, starting enrollment.") } if enroll { diff --git a/internal/pkg/agent/install/perms_unix.go b/internal/pkg/agent/install/perms_unix.go index e84dcd5039c..fc357fd4fde 100644 --- a/internal/pkg/agent/install/perms_unix.go +++ b/internal/pkg/agent/install/perms_unix.go @@ -8,6 +8,7 @@ package install import ( "errors" + "fmt" "io/fs" "os" "path/filepath" @@ -18,19 +19,26 @@ func fixPermissions(topPath string) error { return recursiveRootPermissions(topPath) } -func recursiveRootPermissions(path string) error { - return filepath.Walk(path, func(name string, info fs.FileInfo, err error) error { - if err == nil { - // all files should be owned by root:root - err = os.Chown(name, 0, 0) - if err != nil { - return err - } - // remove any world permissions from the file - err = os.Chmod(name, info.Mode().Perm()&0770) - } else if errors.Is(err, fs.ErrNotExist) { +func recursiveRootPermissions(root string) error { + return filepath.Walk(root, func(path string, info fs.FileInfo, err error) error { + if errors.Is(err, fs.ErrNotExist) { return nil } - return err + if err != nil { + return fmt.Errorf("walk on %q failed: %w", path, err) + } + + // all files should be owned by root:root + err = os.Chown(path, 0, 0) + if err != nil { + return fmt.Errorf("could not fix ownership of %q: %w", path, err) + } + // remove any world permissions from the file + err = os.Chmod(path, info.Mode().Perm()&0770) + if err != nil { + return fmt.Errorf("could not fix permissions of %q: %w", path, err) + } + + return nil }) }