Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix install/enroll command not failing when the daemon restart fails #3815

Merged
merged 25 commits into from
Dec 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: bug-fix

# Change summary; a 80ish characters long description of the change.
summary: Surface errors during Agent's enroll process, failing if any happens.

# Long description; in case the summary is not enough to describe the change
# this field accommodate a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
#description:

# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
component: elastic-agent

# PR URL; optional; the PR number that added the changeset.
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
pr: https://github.com/elastic/elastic-agent/pull/3815/

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present is automatically filled by the tooling with the issue linked to the PR number.
issue: https://github.com/elastic/elastic-agent/issues/3664
2 changes: 1 addition & 1 deletion dev-tools/mage/godaemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ var (
}
)

// BuildGoDaemon builds the go-deamon binary.
// BuildGoDaemon builds the go-daemon binary.
func BuildGoDaemon() error {
if GOOS != "linux" {
return errors.New("go-daemon only builds for linux")
Expand Down
1 change: 1 addition & 0 deletions internal/pkg/agent/cmd/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,7 @@ func buildEnrollArgs(cfg setupConfig, token string, policyID string) ([]string,
"--path.home", paths.Top(), // --path.home actually maps to paths.Top()
"--path.config", paths.Config(),
"--path.logs", paths.Logs(),
"--skip-daemon-reload",
}
if paths.Downloads() != "" {
args = append(args, "--path.downloads", paths.Downloads())
Expand Down
11 changes: 10 additions & 1 deletion internal/pkg/agent/cmd/enroll.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,10 @@ func addEnrollFlags(cmd *cobra.Command) {
cmd.Flags().BoolP("delay-enroll", "", false, "Delays enrollment to occur on first start of the Elastic Agent service")
cmd.Flags().DurationP("daemon-timeout", "", 0, "Timeout waiting for Elastic Agent daemon")
cmd.Flags().DurationP("fleet-server-timeout", "", 0, "Timeout waiting for Fleet Server to be ready to start enrollment")
cmd.Flags().Bool("skip-daemon-reload", false, "Skip daemon reload after enrolling")
belimawr marked this conversation as resolved.
Show resolved Hide resolved
cmd.Flags().StringSliceP("tag", "", []string{}, "User set tags")

cmd.Flags().MarkHidden("skip-daemon-reload") //nolint:errcheck // an error is only returned if the flag does not exist.
}

func validateEnrollFlags(cmd *cobra.Command) error {
Expand Down Expand Up @@ -141,6 +144,7 @@ func buildEnrollmentFlags(cmd *cobra.Command, url string, token string) []string
delayEnroll, _ := cmd.Flags().GetBool("delay-enroll")
daemonTimeout, _ := cmd.Flags().GetDuration("daemon-timeout")
fTimeout, _ := cmd.Flags().GetDuration("fleet-server-timeout")
skipDaemonReload, _ := cmd.Flags().GetBool("skip-daemon-reload")
fTags, _ := cmd.Flags().GetStringSlice("tag")
args := []string{}
if url != "" {
Expand Down Expand Up @@ -249,6 +253,9 @@ func buildEnrollmentFlags(cmd *cobra.Command, url string, token string) []string
args = append(args, "--fleet-server-es-insecure")
}

if skipDaemonReload {
args = append(args, "--skip-daemon-reload")
}
for _, v := range fTags {
args = append(args, "--tag", v)
}
Expand Down Expand Up @@ -338,6 +345,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error {
delayEnroll, _ := cmd.Flags().GetBool("delay-enroll")
daemonTimeout, _ := cmd.Flags().GetDuration("daemon-timeout")
fTimeout, _ := cmd.Flags().GetDuration("fleet-server-timeout")
skipDaemonReload, _ := cmd.Flags().GetBool("skip-daemon-reload")
tags, _ := cmd.Flags().GetStringSlice("tag")

caStr, _ := cmd.Flags().GetString("certificate-authorities")
Expand All @@ -351,7 +359,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error {
// Error: failed to fix permissions: chown /Library/Elastic/Agent/data/elastic-agent-c13f91/elastic-agent.app: operation not permitted
// This is because we are fixing permissions twice, once during installation and again during the enrollment step.
// When we are enrolling as part of installation on MacOS, skip the second attempt to fix permissions.
var fixPermissions bool = fromInstall
fixPermissions := fromInstall
if runtime.GOOS == "darwin" {
fixPermissions = false
}
Expand All @@ -370,6 +378,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error {
ProxyHeaders: mapFromEnvList(proxyHeaders),
DelayEnroll: delayEnroll,
DaemonTimeout: daemonTimeout,
SkipDaemonRestart: skipDaemonReload,
Tags: tags,
FleetServer: enrollCmdFleetServerOption{
ConnStr: fServer,
Expand Down
92 changes: 60 additions & 32 deletions internal/pkg/agent/cmd/enroll_cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,6 @@ import (
"strings"
"time"

"github.com/elastic/elastic-agent/pkg/utils"

"github.com/elastic/elastic-agent/pkg/control/v2/client"

"go.elastic.co/apm"
"gopkg.in/yaml.v2"

Expand All @@ -42,8 +38,10 @@ import (
fleetclient "github.com/elastic/elastic-agent/internal/pkg/fleetapi/client"
"github.com/elastic/elastic-agent/internal/pkg/release"
"github.com/elastic/elastic-agent/internal/pkg/remote"
"github.com/elastic/elastic-agent/pkg/control/v2/client"
"github.com/elastic/elastic-agent/pkg/core/logger"
"github.com/elastic/elastic-agent/pkg/core/process"
"github.com/elastic/elastic-agent/pkg/utils"
)

const (
Expand Down Expand Up @@ -115,6 +113,7 @@ type enrollCmdOption struct {
DelayEnroll bool `yaml:"-"`
FleetServer enrollCmdFleetServerOption `yaml:"-"`
SkipCreateSecret bool `yaml:"-"`
SkipDaemonRestart bool `yaml:"-"`
Tags []string `yaml:"omitempty"`
}

Expand Down Expand Up @@ -174,7 +173,7 @@ func newEnrollCmd(
)
}

// newEnrollCmdWithStore creates an new enrollment and accept a custom store.
// newEnrollCmdWithStore creates a new enrollment and accept a custom store.
func newEnrollCmdWithStore(
log *logger.Logger,
options *enrollCmdOption,
Expand All @@ -189,10 +188,11 @@ func newEnrollCmdWithStore(
}, nil
}

// Execute tries to enroll the agent into Fleet.
// Execute enrolls the agent into Fleet.
func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
var err error
defer c.stopAgent() // ensure its stopped no matter what

span, ctx := apm.StartSpan(ctx, "enroll", "app.internal")
defer func() {
apm.CaptureError(ctx, err).Send()
Expand Down Expand Up @@ -237,7 +237,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
// Ensure that the agent does not use a proxy configuration
// when connecting to the local fleet server.
// Note that when running fleet-server the enroll request will be sent to :8220,
// however when the agent is running afterwards requests will be sent to :8221
// however when the agent is running afterward requests will be sent to :8221
c.remoteConfig.Transport.Proxy.Disable = true
}

Expand All @@ -258,7 +258,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {

err = c.enrollWithBackoff(ctx, persistentConfig)
if err != nil {
return errors.New(err, "fail to enroll")
return fmt.Errorf("fail to enroll: %w", err)
}

if c.options.FixPermissions {
Expand All @@ -269,17 +269,23 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
}

defer func() {
fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.")
if err != nil {
fmt.Fprintf(streams.Err, "Something went wrong while enrolling the Elastic Agent: %v\n", err)
} else {
fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.")
}
}()

if c.agentProc == nil {
if err := c.daemonReload(ctx); err != nil {
c.log.Infow("Elastic Agent might not be running; unable to trigger restart", "error", err)
} else {
c.log.Info("Successfully triggered restart on running Elastic Agent.")
if c.agentProc == nil && !c.options.SkipDaemonRestart {
if err = c.daemonReloadWithBackoff(ctx); err != nil {
c.log.Errorf("Elastic Agent might not be running; unable to trigger restart: %v", err)
return fmt.Errorf("could not reload agent daemon, unable to trigger restart: %w", err)
}

c.log.Info("Successfully triggered restart on running Elastic Agent.")
return nil
}

c.log.Info("Elastic Agent has been enrolled; start Elastic Agent")
return nil
}
Expand Down Expand Up @@ -444,25 +450,34 @@ func (c *enrollCmd) prepareFleetTLS() error {
}

func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error {
err := c.daemonReload(ctx)
if err == nil {
return nil
}

signal := make(chan struct{})
backExp := backoff.NewExpBackoff(signal, 10*time.Second, 1*time.Minute)
defer close(signal)
backExp := backoff.NewExpBackoff(signal, 1*time.Second, 1*time.Minute)

for i := 5; i >= 0; i-- {
backExp.Wait()
c.log.Info("Retrying to restart...")
err = c.daemonReload(ctx)
var lastErr error
for i := 0; i < 5; i++ {
cmacknz marked this conversation as resolved.
Show resolved Hide resolved
attempt := i

c.log.Infof("Restarting agent daemon, attempt %d", attempt)
err := c.daemonReload(ctx)
if err == nil {
break
return nil
}

cmacknz marked this conversation as resolved.
Show resolved Hide resolved
// If the context was cancelled, return early
if errors.Is(err, context.DeadlineExceeded) ||
errors.Is(err, context.Canceled) {
return fmt.Errorf("could not reload daemon after %d retries: %w",
attempt, err)
}
lastErr = err

c.log.Errorf("Restart attempt %d failed: '%s'. Waiting for %s", attempt, err, backExp.NextWait().String())
backExp.Wait()

}

close(signal)
return err
return fmt.Errorf("could not reload agent's daemon, all retries failed. Last error: %w", lastErr)
}

func (c *enrollCmd) daemonReload(ctx context.Context) error {
Expand All @@ -480,8 +495,20 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[

c.log.Infof("Starting enrollment to URL: %s", c.client.URI())
err := c.enroll(ctx, persistentConfig)
if err == nil {
return nil
}

const deadline = 10 * time.Minute
const frequency = 60 * time.Second

c.log.Infof("1st enrollment attempt failed, retrying for %s, every %s enrolling to URL: %s",
deadline,
frequency,
c.client.URI())
signal := make(chan struct{})
backExp := backoff.NewExpBackoff(signal, 60*time.Second, 10*time.Minute)
defer close(signal)
backExp := backoff.NewExpBackoff(signal, frequency, deadline)

for {
retry := false
Expand All @@ -500,7 +527,6 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[
err = c.enroll(ctx, persistentConfig)
}

close(signal)
return err
}

Expand Down Expand Up @@ -549,8 +575,10 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte
c.options.FleetServer.ElasticsearchInsecure,
)
if err != nil {
return err
return fmt.Errorf(
"failed creating fleet-server bootstrap config: %w", err)
}

// no longer need bootstrap at this point
serverConfig.Server.Bootstrap = false
fleetConfig.Server = serverConfig.Server
Expand All @@ -570,11 +598,11 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte

reader, err := yamlToReader(configToStore)
if err != nil {
return err
return fmt.Errorf("yamlToReader failed: %w", err)
}

if err := safelyStoreAgentInfo(c.configStore, reader); err != nil {
return err
return fmt.Errorf("failed to store agent config: %w", err)
}

// clear action store
Expand Down
Loading
Loading