From 91f0603193c0f7bcd703d17c8f5a6c0487c3e04c Mon Sep 17 00:00:00 2001 From: Tiago Queiroz Date: Thu, 28 Dec 2023 14:00:31 +0100 Subject: [PATCH] Fix install/enroll command not failing when the daemon restart fails (#3815) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix install/enroll cmd not failing when agent restart fails * surface errors that might occur during enroll * fail install command if agent cannot be restarted * do not print success message if there was an enroll error. Print an error message and the error instead * add logs to show the different enroll attempts * add more context t errors * refactor internal/pkg/agent/install/perms_unix.go and add more context to errors restore main version * ignore agent restart error on enroll tests as there is no agent to be restarted * daemonReloadWithBackoff does not retry on context deadline exceeded * Do not reload the Agent daemon if enrolling from a container The enroll command would always try to restart the daemon, however when enrolling as part of the container command, there is no running daemon to reload. This commit adds a CLI flag, --skip-daemon-reload, to the enroll command to skip the reloading step, the container command now makes use of this flag. * Apply suggestions from code review Co-authored-by: Paolo Chilà * PR improvements * Add integration test * make lint happy * PR improvements * Fix after rebase * Fix some issues * more PR improvments * Fix enroll command * Fix TestContainterCMD * Fix implementation * Remove flaky integration test assertion Asserting there are no errors in the logs from Elastic-Agent and all Co-authored-by: Anderson Queriroz Co-authored-by: Paolo Chilà Co-authored-by: Pierre HILBERT (cherry picked from commit 938f0b93fe9316ee59d428c6424bf275516a8ddb) # Conflicts: # internal/pkg/composable/providers/kubernetessecrets/kubernetes_secrets_test.go --- ...enroll-process-failing-if-any-happens.yaml | 32 +++ dev-tools/mage/godaemon.go | 2 +- internal/pkg/agent/cmd/container.go | 1 + internal/pkg/agent/cmd/enroll.go | 11 +- internal/pkg/agent/cmd/enroll_cmd.go | 92 +++++--- internal/pkg/agent/cmd/enroll_cmd_test.go | 51 ++-- internal/pkg/agent/cmd/install.go | 4 +- internal/pkg/agent/cmd/platformcheck_test.go | 5 +- internal/pkg/agent/install/perms_unix.go | 29 ++- .../kubernetes_secrets_test.go | 222 ++++++++++++++++++ testing/integration/container_cmd_test.go | 128 ++++++++++ testing/integration/logs_ingestion_test.go | 4 +- 12 files changed, 513 insertions(+), 68 deletions(-) create mode 100644 changelog/fragments/1700851577-Surface-errors-during-Agent_s-enroll-process-failing-if-any-happens.yaml create mode 100644 testing/integration/container_cmd_test.go diff --git a/changelog/fragments/1700851577-Surface-errors-during-Agent_s-enroll-process-failing-if-any-happens.yaml b/changelog/fragments/1700851577-Surface-errors-during-Agent_s-enroll-process-failing-if-any-happens.yaml new file mode 100644 index 00000000000..916920a83e1 --- /dev/null +++ b/changelog/fragments/1700851577-Surface-errors-during-Agent_s-enroll-process-failing-if-any-happens.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: bug-fix + +# Change summary; a 80ish characters long description of the change. +summary: Surface errors during Agent's enroll process, failing if any happens. + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: elastic-agent + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/elastic-agent/pull/3815/ + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +issue: https://github.com/elastic/elastic-agent/issues/3664 diff --git a/dev-tools/mage/godaemon.go b/dev-tools/mage/godaemon.go index 90960bfe69f..40d5e94564b 100644 --- a/dev-tools/mage/godaemon.go +++ b/dev-tools/mage/godaemon.go @@ -21,7 +21,7 @@ var ( } ) -// BuildGoDaemon builds the go-deamon binary. +// BuildGoDaemon builds the go-daemon binary. func BuildGoDaemon() error { if GOOS != "linux" { return errors.New("go-daemon only builds for linux") diff --git a/internal/pkg/agent/cmd/container.go b/internal/pkg/agent/cmd/container.go index 321e2fd07b3..7d3fc1df022 100644 --- a/internal/pkg/agent/cmd/container.go +++ b/internal/pkg/agent/cmd/container.go @@ -397,6 +397,7 @@ func buildEnrollArgs(cfg setupConfig, token string, policyID string) ([]string, "--path.home", paths.Top(), // --path.home actually maps to paths.Top() "--path.config", paths.Config(), "--path.logs", paths.Logs(), + "--skip-daemon-reload", } if paths.Downloads() != "" { args = append(args, "--path.downloads", paths.Downloads()) diff --git a/internal/pkg/agent/cmd/enroll.go b/internal/pkg/agent/cmd/enroll.go index 1bce5f7e547..95a13341a87 100644 --- a/internal/pkg/agent/cmd/enroll.go +++ b/internal/pkg/agent/cmd/enroll.go @@ -75,7 +75,10 @@ func addEnrollFlags(cmd *cobra.Command) { cmd.Flags().BoolP("delay-enroll", "", false, "Delays enrollment to occur on first start of the Elastic Agent service") cmd.Flags().DurationP("daemon-timeout", "", 0, "Timeout waiting for Elastic Agent daemon") cmd.Flags().DurationP("fleet-server-timeout", "", 0, "Timeout waiting for Fleet Server to be ready to start enrollment") + cmd.Flags().Bool("skip-daemon-reload", false, "Skip daemon reload after enrolling") cmd.Flags().StringSliceP("tag", "", []string{}, "User set tags") + + cmd.Flags().MarkHidden("skip-daemon-reload") //nolint:errcheck // an error is only returned if the flag does not exist. } func validateEnrollFlags(cmd *cobra.Command) error { @@ -141,6 +144,7 @@ func buildEnrollmentFlags(cmd *cobra.Command, url string, token string) []string delayEnroll, _ := cmd.Flags().GetBool("delay-enroll") daemonTimeout, _ := cmd.Flags().GetDuration("daemon-timeout") fTimeout, _ := cmd.Flags().GetDuration("fleet-server-timeout") + skipDaemonReload, _ := cmd.Flags().GetBool("skip-daemon-reload") fTags, _ := cmd.Flags().GetStringSlice("tag") args := []string{} if url != "" { @@ -249,6 +253,9 @@ func buildEnrollmentFlags(cmd *cobra.Command, url string, token string) []string args = append(args, "--fleet-server-es-insecure") } + if skipDaemonReload { + args = append(args, "--skip-daemon-reload") + } for _, v := range fTags { args = append(args, "--tag", v) } @@ -338,6 +345,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error { delayEnroll, _ := cmd.Flags().GetBool("delay-enroll") daemonTimeout, _ := cmd.Flags().GetDuration("daemon-timeout") fTimeout, _ := cmd.Flags().GetDuration("fleet-server-timeout") + skipDaemonReload, _ := cmd.Flags().GetBool("skip-daemon-reload") tags, _ := cmd.Flags().GetStringSlice("tag") caStr, _ := cmd.Flags().GetString("certificate-authorities") @@ -351,7 +359,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error { // Error: failed to fix permissions: chown /Library/Elastic/Agent/data/elastic-agent-c13f91/elastic-agent.app: operation not permitted // This is because we are fixing permissions twice, once during installation and again during the enrollment step. // When we are enrolling as part of installation on MacOS, skip the second attempt to fix permissions. - var fixPermissions bool = fromInstall + fixPermissions := fromInstall if runtime.GOOS == "darwin" { fixPermissions = false } @@ -370,6 +378,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error { ProxyHeaders: mapFromEnvList(proxyHeaders), DelayEnroll: delayEnroll, DaemonTimeout: daemonTimeout, + SkipDaemonRestart: skipDaemonReload, Tags: tags, FleetServer: enrollCmdFleetServerOption{ ConnStr: fServer, diff --git a/internal/pkg/agent/cmd/enroll_cmd.go b/internal/pkg/agent/cmd/enroll_cmd.go index 37540ae5249..d3de412ba44 100644 --- a/internal/pkg/agent/cmd/enroll_cmd.go +++ b/internal/pkg/agent/cmd/enroll_cmd.go @@ -15,10 +15,6 @@ import ( "strings" "time" - "github.com/elastic/elastic-agent/pkg/utils" - - "github.com/elastic/elastic-agent/pkg/control/v2/client" - "go.elastic.co/apm" "gopkg.in/yaml.v2" @@ -42,8 +38,10 @@ import ( fleetclient "github.com/elastic/elastic-agent/internal/pkg/fleetapi/client" "github.com/elastic/elastic-agent/internal/pkg/release" "github.com/elastic/elastic-agent/internal/pkg/remote" + "github.com/elastic/elastic-agent/pkg/control/v2/client" "github.com/elastic/elastic-agent/pkg/core/logger" "github.com/elastic/elastic-agent/pkg/core/process" + "github.com/elastic/elastic-agent/pkg/utils" ) const ( @@ -115,6 +113,7 @@ type enrollCmdOption struct { DelayEnroll bool `yaml:"-"` FleetServer enrollCmdFleetServerOption `yaml:"-"` SkipCreateSecret bool `yaml:"-"` + SkipDaemonRestart bool `yaml:"-"` Tags []string `yaml:"omitempty"` } @@ -174,7 +173,7 @@ func newEnrollCmd( ) } -// newEnrollCmdWithStore creates an new enrollment and accept a custom store. +// newEnrollCmdWithStore creates a new enrollment and accept a custom store. func newEnrollCmdWithStore( log *logger.Logger, options *enrollCmdOption, @@ -189,10 +188,11 @@ func newEnrollCmdWithStore( }, nil } -// Execute tries to enroll the agent into Fleet. +// Execute enrolls the agent into Fleet. func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { var err error defer c.stopAgent() // ensure its stopped no matter what + span, ctx := apm.StartSpan(ctx, "enroll", "app.internal") defer func() { apm.CaptureError(ctx, err).Send() @@ -237,7 +237,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { // Ensure that the agent does not use a proxy configuration // when connecting to the local fleet server. // Note that when running fleet-server the enroll request will be sent to :8220, - // however when the agent is running afterwards requests will be sent to :8221 + // however when the agent is running afterward requests will be sent to :8221 c.remoteConfig.Transport.Proxy.Disable = true } @@ -258,7 +258,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { err = c.enrollWithBackoff(ctx, persistentConfig) if err != nil { - return errors.New(err, "fail to enroll") + return fmt.Errorf("fail to enroll: %w", err) } if c.options.FixPermissions { @@ -269,17 +269,23 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { } defer func() { - fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.") + if err != nil { + fmt.Fprintf(streams.Err, "Something went wrong while enrolling the Elastic Agent: %v\n", err) + } else { + fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.") + } }() - if c.agentProc == nil { - if err := c.daemonReload(ctx); err != nil { - c.log.Infow("Elastic Agent might not be running; unable to trigger restart", "error", err) - } else { - c.log.Info("Successfully triggered restart on running Elastic Agent.") + if c.agentProc == nil && !c.options.SkipDaemonRestart { + if err = c.daemonReloadWithBackoff(ctx); err != nil { + c.log.Errorf("Elastic Agent might not be running; unable to trigger restart: %v", err) + return fmt.Errorf("could not reload agent daemon, unable to trigger restart: %w", err) } + + c.log.Info("Successfully triggered restart on running Elastic Agent.") return nil } + c.log.Info("Elastic Agent has been enrolled; start Elastic Agent") return nil } @@ -444,25 +450,34 @@ func (c *enrollCmd) prepareFleetTLS() error { } func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error { - err := c.daemonReload(ctx) - if err == nil { - return nil - } - signal := make(chan struct{}) - backExp := backoff.NewExpBackoff(signal, 10*time.Second, 1*time.Minute) + defer close(signal) + backExp := backoff.NewExpBackoff(signal, 1*time.Second, 1*time.Minute) - for i := 5; i >= 0; i-- { - backExp.Wait() - c.log.Info("Retrying to restart...") - err = c.daemonReload(ctx) + var lastErr error + for i := 0; i < 5; i++ { + attempt := i + + c.log.Infof("Restarting agent daemon, attempt %d", attempt) + err := c.daemonReload(ctx) if err == nil { - break + return nil } + + // If the context was cancelled, return early + if errors.Is(err, context.DeadlineExceeded) || + errors.Is(err, context.Canceled) { + return fmt.Errorf("could not reload daemon after %d retries: %w", + attempt, err) + } + lastErr = err + + c.log.Errorf("Restart attempt %d failed: '%s'. Waiting for %s", attempt, err, backExp.NextWait().String()) + backExp.Wait() + } - close(signal) - return err + return fmt.Errorf("could not reload agent's daemon, all retries failed. Last error: %w", lastErr) } func (c *enrollCmd) daemonReload(ctx context.Context) error { @@ -480,8 +495,20 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[ c.log.Infof("Starting enrollment to URL: %s", c.client.URI()) err := c.enroll(ctx, persistentConfig) + if err == nil { + return nil + } + + const deadline = 10 * time.Minute + const frequency = 60 * time.Second + + c.log.Infof("1st enrollment attempt failed, retrying for %s, every %s enrolling to URL: %s", + deadline, + frequency, + c.client.URI()) signal := make(chan struct{}) - backExp := backoff.NewExpBackoff(signal, 60*time.Second, 10*time.Minute) + defer close(signal) + backExp := backoff.NewExpBackoff(signal, frequency, deadline) for { retry := false @@ -500,7 +527,6 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[ err = c.enroll(ctx, persistentConfig) } - close(signal) return err } @@ -549,8 +575,10 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte c.options.FleetServer.ElasticsearchInsecure, ) if err != nil { - return err + return fmt.Errorf( + "failed creating fleet-server bootstrap config: %w", err) } + // no longer need bootstrap at this point serverConfig.Server.Bootstrap = false fleetConfig.Server = serverConfig.Server @@ -570,11 +598,11 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte reader, err := yamlToReader(configToStore) if err != nil { - return err + return fmt.Errorf("yamlToReader failed: %w", err) } if err := safelyStoreAgentInfo(c.configStore, reader); err != nil { - return err + return fmt.Errorf("failed to store agent config: %w", err) } // clear action store diff --git a/internal/pkg/agent/cmd/enroll_cmd_test.go b/internal/pkg/agent/cmd/enroll_cmd_test.go index 0b1e7d5d4ee..2d28db5a4f8 100644 --- a/internal/pkg/agent/cmd/enroll_cmd_test.go +++ b/internal/pkg/agent/cmd/enroll_cmd_test.go @@ -17,7 +17,9 @@ import ( "runtime" "strconv" "testing" + "time" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/elastic/elastic-agent/internal/pkg/agent/configuration" @@ -152,6 +154,7 @@ func TestEnroll(t *testing.T) { EnrollAPIKey: "my-enrollment-api-key", UserProvidedMetadata: map[string]interface{}{"custom": "customize"}, SkipCreateSecret: skipCreateSecret, + SkipDaemonRestart: true, }, "", store, @@ -159,14 +162,18 @@ func TestEnroll(t *testing.T) { require.NoError(t, err) streams, _, _, _ := cli.NewTestingIOStreams() - err = cmd.Execute(context.Background(), streams) - require.NoError(t, err) + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + defer cancel() - config, err := readConfig(store.Content) + if err := cmd.Execute(ctx, streams); err != nil { + t.Fatalf("enrrol coms returned and unexpected error: %v", err) + } + config, err := readConfig(store.Content) require.NoError(t, err) - require.Equal(t, "my-access-api-key", config.AccessAPIKey) - require.Equal(t, host, config.Client.Host) + + assert.Equal(t, "my-access-api-key", config.AccessAPIKey) + assert.Equal(t, host, config.Client.Host) }, )) @@ -209,6 +216,7 @@ func TestEnroll(t *testing.T) { Insecure: true, UserProvidedMetadata: map[string]interface{}{"custom": "customize"}, SkipCreateSecret: skipCreateSecret, + SkipDaemonRestart: true, }, "", store, @@ -216,16 +224,20 @@ func TestEnroll(t *testing.T) { require.NoError(t, err) streams, _, _, _ := cli.NewTestingIOStreams() - err = cmd.Execute(context.Background(), streams) - require.NoError(t, err) + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + defer cancel() - require.True(t, store.Called) + if err := cmd.Execute(ctx, streams); err != nil { + t.Fatalf("enrrol coms returned and unexpected error: %v", err) + } + assert.True(t, store.Called) config, err := readConfig(store.Content) - - require.NoError(t, err) - require.Equal(t, "my-access-api-key", config.AccessAPIKey) - require.Equal(t, host, config.Client.Host) + require.NoError(t, err, "readConfig returned an error") + assert.Equal(t, "my-access-api-key", config.AccessAPIKey, + "The stored 'Access API Key' must be the same returned by Fleet-Server") + assert.Equal(t, host, config.Client.Host, + "The stored Fleet-Server host must match the one used during enrol") }, )) @@ -268,6 +280,7 @@ func TestEnroll(t *testing.T) { Insecure: true, UserProvidedMetadata: map[string]interface{}{"custom": "customize"}, SkipCreateSecret: skipCreateSecret, + SkipDaemonRestart: true, }, "", store, @@ -275,16 +288,16 @@ func TestEnroll(t *testing.T) { require.NoError(t, err) streams, _, _, _ := cli.NewTestingIOStreams() - err = cmd.Execute(context.Background(), streams) - require.NoError(t, err) - - require.True(t, store.Called) + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + defer cancel() + err = cmd.Execute(ctx, streams) + require.NoError(t, err, "enroll command should return no error") + assert.True(t, store.Called) config, err := readConfig(store.Content) - require.NoError(t, err) - require.Equal(t, "my-access-api-key", config.AccessAPIKey) - require.Equal(t, host, config.Client.Host) + assert.Equal(t, "my-access-api-key", config.AccessAPIKey) + assert.Equal(t, host, config.Client.Host) }, )) diff --git a/internal/pkg/agent/cmd/install.go b/internal/pkg/agent/cmd/install.go index a1415d0e59b..002a8d1b98a 100644 --- a/internal/pkg/agent/cmd/install.go +++ b/internal/pkg/agent/cmd/install.go @@ -169,7 +169,7 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error { return fmt.Errorf("problem reading prompt response") } if url == "" { - fmt.Fprintf(streams.Out, "Enrollment cancelled because no URL was provided.\n") + fmt.Fprintln(streams.Out, "Enrollment cancelled because no URL was provided.") return nil } } @@ -232,6 +232,8 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error { } }() } + + fmt.Fprintln(streams.Out, "Elastic Agent successfully installed, starting enrollment.") } if enroll { diff --git a/internal/pkg/agent/cmd/platformcheck_test.go b/internal/pkg/agent/cmd/platformcheck_test.go index f1abbbc0ddd..857a486ef19 100644 --- a/internal/pkg/agent/cmd/platformcheck_test.go +++ b/internal/pkg/agent/cmd/platformcheck_test.go @@ -29,13 +29,16 @@ func TestCheckPlatformCompat(t *testing.T) { cmd.Stderr = os.Stderr cmd.Env = append(os.Environ(), "GOARCH=386") require.NoError(t, cmd.Run(), "failed to compile test helper") + t.Logf("compiled test binary %q", helper) // run test helper cmd = exec.Command(helper, "-test.v", "-test.run", "TestHelper") cmd.Env = []string{"GO_USE_HELPER=1"} + t.Logf("running %q", cmd.Args) output, err := cmd.Output() if err != nil { - t.Logf("32bit binary tester failed.\n Output: %s", output) + t.Logf("32bit binary tester failed.\n Err: %v\nOutput: %s", + err, output) } } diff --git a/internal/pkg/agent/install/perms_unix.go b/internal/pkg/agent/install/perms_unix.go index 578c2af0a30..8cafc79fe8b 100644 --- a/internal/pkg/agent/install/perms_unix.go +++ b/internal/pkg/agent/install/perms_unix.go @@ -8,6 +8,7 @@ package install import ( "errors" + "fmt" "io/fs" "os" "path/filepath" @@ -18,18 +19,24 @@ import ( // FixPermissions fixes the permissions so only root:root is the owner and no world read-able permissions func FixPermissions(topPath string, ownership utils.FileOwner) error { return filepath.Walk(topPath, func(name string, info fs.FileInfo, err error) error { - if err == nil { - // all files should be owned by uid:gid - // uses `os.Lchown` so the symlink is updated to have the permissions - err = os.Lchown(name, ownership.UID, ownership.GID) - if err != nil { - return err - } - // remove any world permissions from the file - err = os.Chmod(name, info.Mode().Perm()&0770) - } else if errors.Is(err, fs.ErrNotExist) { + if errors.Is(err, fs.ErrNotExist) { return nil } - return err + if err != nil { + return fmt.Errorf("walk on %q failed: %w", topPath, err) + } + + // all files should be owned by uid:gid + // uses `os.Lchown` so the symlink is updated to have the permissions + if err := os.Lchown(name, ownership.UID, ownership.GID); err != nil { + return fmt.Errorf("cannot update ownership of %q: %w", topPath, err) + } + + // remove any world permissions from the file + if err := os.Chmod(name, info.Mode().Perm()&0770); err != nil { + return fmt.Errorf("could not update permissions of %q: %w", topPath, err) + } + + return nil }) } diff --git a/internal/pkg/composable/providers/kubernetessecrets/kubernetes_secrets_test.go b/internal/pkg/composable/providers/kubernetessecrets/kubernetes_secrets_test.go index 9924c84e6bc..1d7a01e50da 100644 --- a/internal/pkg/composable/providers/kubernetessecrets/kubernetes_secrets_test.go +++ b/internal/pkg/composable/providers/kubernetessecrets/kubernetes_secrets_test.go @@ -6,6 +6,8 @@ package kubernetessecrets import ( "context" + "fmt" + "strings" "testing" "time" @@ -138,3 +140,223 @@ func Test_K8sSecretsProvider_FetchWrongSecret(t *testing.T) { assert.False(t, found) assert.EqualValues(t, val, "") } +<<<<<<< HEAD +======= + +func Test_K8sSecretsProvider_Fetch_Cache_Enabled(t *testing.T) { + client := k8sfake.NewSimpleClientset() + + ttlDelete, err := time.ParseDuration("1s") + require.NoError(t, err) + + refreshInterval, err := time.ParseDuration("100ms") + require.NoError(t, err) + + secret := &v1.Secret{ + TypeMeta: metav1.TypeMeta{ + Kind: "Secret", + APIVersion: "apps/v1beta1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "testing_secret", + Namespace: ns, + }, + Data: map[string][]byte{ + "secret_value": []byte(pass), + }, + } + _, err = client.CoreV1().Secrets(ns).Create(context.Background(), secret, metav1.CreateOptions{}) + require.NoError(t, err) + + logger := logp.NewLogger("test_k8s_secrets") + + c := map[string]interface{}{ + "cache_refresh_interval": refreshInterval, + "cache_ttl": ttlDelete, + "cache_disable": false, + } + cfg, err := config.NewConfigFrom(c) + require.NoError(t, err) + + p, err := ContextProviderBuilder(logger, cfg, true) + require.NoError(t, err) + + fp, _ := p.(*contextProviderK8sSecrets) + + getK8sClientFunc = func(kubeconfig string, opt kubernetes.KubeClientOptions) (k8sclient.Interface, error) { + return client, nil + } + require.NoError(t, err) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + comm := ctesting.NewContextComm(ctx) + + go func() { + _ = fp.Run(ctx, comm) + }() + + for { + fp.clientMx.Lock() + client := fp.client + fp.clientMx.Unlock() + if client != nil { + break + } + <-time.After(10 * time.Millisecond) + } + + // Secret cache should be empty at start + fp.secretsCacheMx.Lock() + assert.Equal(t, 0, len(fp.secretsCache)) + fp.secretsCacheMx.Unlock() + + key := "kubernetes_secrets.test_namespace.testing_secret.secret_value" + + // Secret should be in the cache after this call + val, found := fp.Fetch(key) + assert.True(t, found) + assert.Equal(t, val, pass) + fp.secretsCacheMx.RLock() + assert.Equal(t, len(fp.secretsCache), 1) + assert.NotNil(t, fp.secretsCache[key]) + assert.NotZero(t, fp.secretsCache[key].lastAccess) + fp.secretsCacheMx.RUnlock() + + // Update the secret and check after TTL time, the secret value is correct + newPass := "new-pass" + secret = &v1.Secret{ + TypeMeta: metav1.TypeMeta{ + Kind: "Secret", + APIVersion: "apps/v1beta1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "testing_secret", + Namespace: ns, + }, + Data: map[string][]byte{ + "secret_value": []byte(newPass), + }, + } + _, err = client.CoreV1().Secrets(ns).Update(context.Background(), secret, metav1.UpdateOptions{}) + require.NoError(t, err) + + // wait for ttl update + <-time.After(refreshInterval) + status := &strings.Builder{} + duration := refreshInterval * 3 + assert.Eventuallyf(t, func() bool { + val, found = fp.Fetch(key) + isNewPass := val == newPass + if found && isNewPass { + return true + } + + fmt.Fprintf(status, "found: %t, isNewPass: %t", found, isNewPass) + return false + }, duration, refreshInterval, + "Failed to update the secret value after TTL update has passed. Tried fetching for %d. Last status: %s", + duration, status) + + // After TTL delete, secret should no longer be found in cache since it was never + // fetched during that time + <-time.After(ttlDelete) + assert.Eventuallyf(t, func() bool { + fp.secretsCacheMx.RLock() + size := len(fp.secretsCache) + fp.secretsCacheMx.RUnlock() + return size == 0 + }, ttlDelete*3, ttlDelete, "Failed to delete the secret after TTL delete has passed.") + +} + +func Test_K8sSecretsProvider_Fetch_Cache_Disabled(t *testing.T) { + client := k8sfake.NewSimpleClientset() + + secret := &v1.Secret{ + TypeMeta: metav1.TypeMeta{ + Kind: "Secret", + APIVersion: "apps/v1beta1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "testing_secret", + Namespace: ns, + }, + Data: map[string][]byte{ + "secret_value": []byte(pass), + }, + } + _, err := client.CoreV1().Secrets(ns).Create(context.Background(), secret, metav1.CreateOptions{}) + require.NoError(t, err) + + logger := logp.NewLogger("test_k8s_secrets") + + c := map[string]interface{}{ + "cache_disable": true, + } + cfg, err := config.NewConfigFrom(c) + require.NoError(t, err) + + p, err := ContextProviderBuilder(logger, cfg, true) + require.NoError(t, err) + + fp, _ := p.(*contextProviderK8sSecrets) + + getK8sClientFunc = func(kubeconfig string, opt kubernetes.KubeClientOptions) (k8sclient.Interface, error) { + return client, nil + } + require.NoError(t, err) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + comm := ctesting.NewContextComm(ctx) + + go func() { + _ = fp.Run(ctx, comm) + }() + + for { + fp.clientMx.Lock() + client := fp.client + fp.clientMx.Unlock() + if client != nil { + break + } + <-time.After(10 * time.Millisecond) + } + + key := "kubernetes_secrets.test_namespace.testing_secret.secret_value" + + // Secret should be in the cache after this call + val, found := fp.Fetch(key) + assert.True(t, found) + assert.Equal(t, val, pass) + + // Update the secret and check the result + newPass := "new-pass" + secret = &v1.Secret{ + TypeMeta: metav1.TypeMeta{ + Kind: "Secret", + APIVersion: "apps/v1beta1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "testing_secret", + Namespace: ns, + }, + Data: map[string][]byte{ + "secret_value": []byte(newPass), + }, + } + _, err = client.CoreV1().Secrets(ns).Update(context.Background(), secret, metav1.UpdateOptions{}) + require.NoError(t, err) + + val, found = fp.Fetch(key) + assert.True(t, found) + assert.Equal(t, val, newPass) + + // Check key that does not exist + val, found = fp.Fetch(key + "doesnotexist") + assert.False(t, found) + assert.Equal(t, "", val) +} +>>>>>>> 938f0b93fe (Fix install/enroll command not failing when the daemon restart fails (#3815)) diff --git a/testing/integration/container_cmd_test.go b/testing/integration/container_cmd_test.go new file mode 100644 index 00000000000..d3649e529ae --- /dev/null +++ b/testing/integration/container_cmd_test.go @@ -0,0 +1,128 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +//go:build integration + +package integration + +import ( + "context" + "fmt" + "os" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/elastic/elastic-agent-libs/kibana" + "github.com/elastic/elastic-agent/pkg/testing/define" + "github.com/elastic/elastic-agent/pkg/testing/tools/fleettools" +) + +func TestContainerCMD(t *testing.T) { + info := define.Require(t, define.Requirements{ + Stack: &define.Stack{}, + Local: false, + Sudo: true, + OS: []define.OS{ + {Type: define.Linux}, + }, + // This test runs the command we use when executing inside a container + // which leaves files under /usr/share/elastic-agent. Run it isolated + // to avoid interfering with other tests and better simulate a container + // environment we run it in isolation + Group: "container", + }) + ctx := context.Background() + + agentFixture, err := define.NewFixture(t, define.Version()) + require.NoError(t, err) + + createPolicyReq := kibana.AgentPolicy{ + Name: fmt.Sprintf("test-policy-enroll-%d", time.Now().Unix()), + Namespace: info.Namespace, + Description: "test policy for agent enrollment", + MonitoringEnabled: []kibana.MonitoringEnabledOption{ + kibana.MonitoringEnabledLogs, + kibana.MonitoringEnabledMetrics, + }, + AgentFeatures: []map[string]interface{}{ + { + "name": "test_enroll", + "enabled": true, + }, + }, + } + + // Create policy + policy, err := info.KibanaClient.CreatePolicy(ctx, createPolicyReq) + if err != nil { + t.Fatalf("could not create Agent Policy: %s", err) + } + + // Create enrollment API key + createEnrollmentAPIKeyReq := kibana.CreateEnrollmentAPIKeyRequest{ + PolicyID: policy.ID, + } + + t.Logf("Creating enrollment API key...") + enrollmentToken, err := info.KibanaClient.CreateEnrollmentAPIKey(ctx, createEnrollmentAPIKeyReq) + if err != nil { + t.Fatalf("unable to create enrolment API key: %s", err) + } + + fleetURL, err := fleettools.DefaultURL(ctx, info.KibanaClient) + if err != nil { + t.Fatalf("could not get Fleet URL: %s", err) + } + + ctx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + cmd, err := agentFixture.PrepareAgentCommand(ctx, []string{"container"}) + if err != nil { + t.Fatalf("could not prepare agent command: %s", err) + } + + t.Cleanup(func() { + if cmd.Process != nil { + t.Log(">> cleaning up: killing the Elastic-Agent process") + if err := cmd.Process.Kill(); err != nil { + t.Fatalf("could not kill Elastic-Agent process: %s", err) + } + return + } + t.Log(">> cleaning up: no process to kill") + }) + + agentOutput := strings.Builder{} + cmd.Stderr = &agentOutput + cmd.Stdout = &agentOutput + cmd.Env = append(os.Environ(), + "FLEET_ENROLL=1", + "FLEET_URL="+fleetURL, + "FLEET_ENROLLMENT_TOKEN="+enrollmentToken.APIKey, + ) + + t.Logf(">> running binary with: %v", cmd.Args) + if err := cmd.Start(); err != nil { + t.Fatalf("error running container cmd: %s", err) + } + + require.Eventuallyf(t, func() bool { + var healthy bool + // This will returns errors until it connects to the agent, + // they're mostly noise because until the agent starts running + // we will get connection errors. If the test fails + // the agent logs will be present in the error message + // which should help to explain why the agent was not + // healthy. + healthy, err = agentFixture.IsHealthy(ctx) + return healthy + }, + 5*time.Minute, time.Second, + "Elastic-Agent did not report healthy. Agent status error: \"%v\", Agent logs\n%s", + err, &agentOutput, + ) +} diff --git a/testing/integration/logs_ingestion_test.go b/testing/integration/logs_ingestion_test.go index fa879ffafef..e8a86df4a0b 100644 --- a/testing/integration/logs_ingestion_test.go +++ b/testing/integration/logs_ingestion_test.go @@ -151,7 +151,7 @@ func testMonitoringLogsAreShipped( } require.Empty(t, docs.Hits.Hits) - // Stage 4: Make sure we have message confirming central management is running + // Stage 3: Make sure we have message confirming central management is running t.Log("Making sure we have message confirming central management is running") docs = findESDocs(t, func() (estools.Documents, error) { return estools.FindMatchingLogLines(ctx, info.ESClient, info.Namespace, @@ -159,7 +159,7 @@ func testMonitoringLogsAreShipped( }) require.NotZero(t, len(docs.Hits.Hits)) - // Stage 5: verify logs from the monitoring components are not sent to the output + // Stage 4: verify logs from the monitoring components are not sent to the output t.Log("Check monitoring logs") hostname, err := os.Hostname() if err != nil {