Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert "install fails if enroll fails and surface errors" #3629

Merged
merged 1 commit into from
Oct 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

This file was deleted.

2 changes: 1 addition & 1 deletion dev-tools/mage/godaemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ var (
}
)

// BuildGoDaemon builds the go-daemon binary.
// BuildGoDaemon builds the go-deamon binary.
func BuildGoDaemon() error {
if GOOS != "linux" {
return errors.New("go-daemon only builds for linux")
Expand Down
2 changes: 1 addition & 1 deletion internal/pkg/agent/cmd/enroll.go
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error {
// Error: failed to fix permissions: chown /Library/Elastic/Agent/data/elastic-agent-c13f91/elastic-agent.app: operation not permitted
// This is because we are fixing permissions twice, once during installation and again during the enrollment step.
// When we are enrolling as part of installation on MacOS, skip the second attempt to fix permissions.
fixPermissions := fromInstall
var fixPermissions bool = fromInstall
if runtime.GOOS == "darwin" {
fixPermissions = false
}
Expand Down
67 changes: 18 additions & 49 deletions internal/pkg/agent/cmd/enroll_cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ func newEnrollCmd(
)
}

// newEnrollCmdWithStore creates a new enrollment and accept a custom store.
// newEnrollCmdWithStore creates an new enrollment and accept a custom store.
func newEnrollCmdWithStore(
log *logger.Logger,
options *enrollCmdOption,
Expand All @@ -187,11 +187,10 @@ func newEnrollCmdWithStore(
}, nil
}

// Execute enrolls the agent into Fleet.
// Execute tries to enroll the agent into Fleet.
func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
var err error
defer c.stopAgent() // ensure its stopped no matter what

span, ctx := apm.StartSpan(ctx, "enroll", "app.internal")
defer func() {
apm.CaptureError(ctx, err).Send()
Expand Down Expand Up @@ -236,7 +235,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
// Ensure that the agent does not use a proxy configuration
// when connecting to the local fleet server.
// Note that when running fleet-server the enroll request will be sent to :8220,
// however when the agent is running afterward requests will be sent to :8221
// however when the agent is running afterwards requests will be sent to :8221
c.remoteConfig.Transport.Proxy.Disable = true
}

Expand All @@ -257,7 +256,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {

err = c.enrollWithBackoff(ctx, persistentConfig)
if err != nil {
return fmt.Errorf("fail to enroll: %w", err)
return errors.New(err, "fail to enroll")
}

if c.options.FixPermissions {
Expand All @@ -268,23 +267,17 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
}

defer func() {
if err != nil {
fmt.Fprintf(streams.Err, "Something went wrong while enrolling the Elastic Agent: %v\n", err)
} else {
fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.")
}
fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.")
}()

if c.agentProc == nil {
if err = c.daemonReloadWithBackoff(ctx); err != nil {
c.log.Errorf("Elastic Agent might not be running; unable to trigger restart: %v", err)
return fmt.Errorf("could not reload agent daemon, unable to trigger restart: %w", err)
if err := c.daemonReload(ctx); err != nil {
c.log.Infow("Elastic Agent might not be running; unable to trigger restart", "error", err)
} else {
c.log.Info("Successfully triggered restart on running Elastic Agent.")
}

c.log.Info("Successfully triggered restart on running Elastic Agent.")
return nil
}

c.log.Info("Elastic Agent has been enrolled; start Elastic Agent")
return nil
}
Expand Down Expand Up @@ -450,35 +443,24 @@ func (c *enrollCmd) prepareFleetTLS() error {

func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error {
err := c.daemonReload(ctx)
if err != nil &&
(errors.Is(err, context.DeadlineExceeded) ||
errors.Is(err, context.Canceled)) {
return fmt.Errorf("could not reload daemon: %w", err)
}
if err == nil {
return nil
}

signal := make(chan struct{})
defer close(signal)
backExp := backoff.NewExpBackoff(signal, 10*time.Second, 1*time.Minute)

for i := 0; i < 5; i++ {
for i := 5; i >= 0; i-- {
backExp.Wait()
c.log.Info("Retrying to restart...")
err = c.daemonReload(ctx)
if err != nil &&
(errors.Is(err, context.DeadlineExceeded) ||
errors.Is(err, context.Canceled)) {
return fmt.Errorf("could not reload daemon after %d retries: %w",
i+1, err)
}
if err == nil {
return nil
break
}
}

return fmt.Errorf("could not reload agent's daemon, all retries failed. Last error: %w", err)
close(signal)
return err
}

func (c *enrollCmd) daemonReload(ctx context.Context) error {
Expand All @@ -496,20 +478,8 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[

c.log.Infof("Starting enrollment to URL: %s", c.client.URI())
err := c.enroll(ctx, persistentConfig)
if err == nil {
return nil
}

const deadline = 10 * time.Minute
const frequency = 60 * time.Second

c.log.Infof("1st enrollment attempt failed, retrying for %s, every %s enrolling to URL: %s",
deadline,
frequency,
c.client.URI())
signal := make(chan struct{})
defer close(signal)
backExp := backoff.NewExpBackoff(signal, frequency, deadline)
backExp := backoff.NewExpBackoff(signal, 60*time.Second, 10*time.Minute)

for {
retry := false
Expand All @@ -528,6 +498,7 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[
err = c.enroll(ctx, persistentConfig)
}

close(signal)
return err
}

Expand Down Expand Up @@ -576,10 +547,8 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte
c.options.FleetServer.ElasticsearchInsecure,
)
if err != nil {
return fmt.Errorf(
"failed creating fleet-server bootstrap config: %w", err)
return err
}

// no longer need bootstrap at this point
serverConfig.Server.Bootstrap = false
fleetConfig.Server = serverConfig.Server
Expand All @@ -599,11 +568,11 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte

reader, err := yamlToReader(configToStore)
if err != nil {
return fmt.Errorf("yamlToReader failed: %w", err)
return err
}

if err := safelyStoreAgentInfo(c.configStore, reader); err != nil {
return fmt.Errorf("failed to store agent config: %w", err)
return err
}

// clear action store
Expand Down
72 changes: 22 additions & 50 deletions internal/pkg/agent/cmd/enroll_cmd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,8 @@ import (
"os"
"runtime"
"strconv"
"strings"
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/elastic/elastic-agent/internal/pkg/agent/configuration"
Expand Down Expand Up @@ -162,23 +159,14 @@ func TestEnroll(t *testing.T) {
require.NoError(t, err)

streams, _, _, _ := cli.NewTestingIOStreams()
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
defer cancel()
err = cmd.Execute(ctx, streams)

// There is no agent running, therefore nothing to be restarted.
// However, this will cause the Enroll command to return an error
// which we'll ignore here.
require.ErrorContainsf(t, err,
"could not reload agent daemon, unable to trigger restart",
"enroll command returned an unexpected error")
require.ErrorContainsf(t, err, context.DeadlineExceeded.Error(),
"it should fail only due to %q", context.DeadlineExceeded)
config, err := readConfig(store.Content)
err = cmd.Execute(context.Background(), streams)
require.NoError(t, err)

assert.Equal(t, "my-access-api-key", config.AccessAPIKey)
assert.Equal(t, host, config.Client.Host)
config, err := readConfig(store.Content)

require.NoError(t, err)
require.Equal(t, "my-access-api-key", config.AccessAPIKey)
require.Equal(t, host, config.Client.Host)
},
))

Expand Down Expand Up @@ -228,24 +216,16 @@ func TestEnroll(t *testing.T) {
require.NoError(t, err)

streams, _, _, _ := cli.NewTestingIOStreams()
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
defer cancel()
err = cmd.Execute(ctx, streams)
if err != nil &&
// There is no agent running, therefore nothing to be restarted.
// However, this will cause the Enroll command to return an error
// which we'll ignore here.
!strings.Contains(err.Error(),
"could not reload agent daemon, unable to trigger restart") {
t.Fatalf("enrrol coms returned and unexpected error: %v", err)
}

assert.True(t, store.Called)
err = cmd.Execute(context.Background(), streams)
require.NoError(t, err)

require.True(t, store.Called)

config, err := readConfig(store.Content)

assert.NoError(t, err)
assert.Equal(t, "my-access-api-key", config.AccessAPIKey)
assert.Equal(t, host, config.Client.Host)
require.NoError(t, err)
require.Equal(t, "my-access-api-key", config.AccessAPIKey)
require.Equal(t, host, config.Client.Host)
},
))

Expand Down Expand Up @@ -295,24 +275,16 @@ func TestEnroll(t *testing.T) {
require.NoError(t, err)

streams, _, _, _ := cli.NewTestingIOStreams()
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
defer cancel()
err = cmd.Execute(ctx, streams)

if err != nil &&
// There is no agent running, therefore nothing to be restarted.
// However, this will cause the Enroll command to return an error
// which we'll ignore here.
!strings.Contains(err.Error(),
"could not reload agent daemon, unable to trigger restart") {
t.Fatalf("enrrol coms returned and unexpected error: %v", err)
}

assert.True(t, store.Called)
err = cmd.Execute(context.Background(), streams)
require.NoError(t, err)

require.True(t, store.Called)

config, err := readConfig(store.Content)

require.NoError(t, err)
assert.Equal(t, "my-access-api-key", config.AccessAPIKey)
assert.Equal(t, host, config.Client.Host)
require.Equal(t, "my-access-api-key", config.AccessAPIKey)
require.Equal(t, host, config.Client.Host)
},
))

Expand Down
4 changes: 1 addition & 3 deletions internal/pkg/agent/cmd/install.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error {
return fmt.Errorf("problem reading prompt response")
}
if url == "" {
fmt.Fprintln(streams.Out, "Enrollment cancelled because no URL was provided.")
fmt.Fprintf(streams.Out, "Enrollment cancelled because no URL was provided.\n")
return nil
}
}
Expand Down Expand Up @@ -224,8 +224,6 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error {
}
}()
}

fmt.Fprintln(streams.Out, "Elastic Agent successfully installed, starting enrollment.")
}

if enroll {
Expand Down
32 changes: 12 additions & 20 deletions internal/pkg/agent/install/perms_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ package install

import (
"errors"
"fmt"
"io/fs"
"os"
"path/filepath"
Expand All @@ -19,26 +18,19 @@ func fixPermissions(topPath string) error {
return recursiveRootPermissions(topPath)
}

func recursiveRootPermissions(root string) error {
return filepath.Walk(root, func(path string, info fs.FileInfo, err error) error {
if errors.Is(err, fs.ErrNotExist) {
func recursiveRootPermissions(path string) error {
return filepath.Walk(path, func(name string, info fs.FileInfo, err error) error {
if err == nil {
// all files should be owned by root:root
err = os.Chown(name, 0, 0)
if err != nil {
return err
}
// remove any world permissions from the file
err = os.Chmod(name, info.Mode().Perm()&0770)
} else if errors.Is(err, fs.ErrNotExist) {
return nil
}
if err != nil {
return fmt.Errorf("walk on %q failed: %w", path, err)
}

// all files should be owned by root:root
err = os.Chown(path, 0, 0)
if err != nil {
return fmt.Errorf("could not fix ownership of %q: %w", path, err)
}
// remove any world permissions from the file
err = os.Chmod(path, info.Mode().Perm()&0770)
if err != nil {
return fmt.Errorf("could not fix permissions of %q: %w", path, err)
}

return nil
return err
})
}
Loading