From 2201b6d5499ac84a35b271a9bc5898fb4da3d1df Mon Sep 17 00:00:00 2001
From: Anderson Queiroz <anderson.queiroz@elastic.co>
Date: Thu, 5 Oct 2023 15:35:44 +0200
Subject: [PATCH 1/5] refactor testing/integration and pkg/testing (#3378)

* pkg/testing/tools
  - split the 'tools' into smaller packages with well defined scope: 'estools', 'fleettools' and 'check'
  - adjust method names for consistency

* testing/integration:
  - make diagnostics to require to be run on a remote host
  - add more context to errors
  - reorder functions on endpoint_security_test so tests are the first functions and when the test fails because the 'Endpoint' directory isn't removed, it now prints the content of the directory
---
 pkg/component/runtime/manager_test.go         |  91 ++++++-----
 pkg/testing/fixture_install.go                |  14 +-
 pkg/testing/tools/artifacts_api.go            |   6 +-
 pkg/testing/tools/check/check.go              |  29 +++-
 pkg/testing/tools/cmd.go                      |  32 ----
 .../tools/{ => estools}/elasticsearch.go      |   2 +-
 .../tools/{agents.go => fleettools/fleet.go}  |  47 ++----
 pkg/testing/tools/tools.go                    |  69 ++++----
 testing/integration/diagnostics_test.go       |   4 +-
 testing/integration/endpoint_security_test.go | 152 ++++++++++--------
 testing/integration/fqdn_test.go              |  17 +-
 testing/integration/monitoring_logs_test.go   |  24 +--
 testing/integration/proxy_url_test.go         |  12 +-
 testing/integration/upgrade_fleet_test.go     |  15 +-
 14 files changed, 263 insertions(+), 251 deletions(-)
 delete mode 100644 pkg/testing/tools/cmd.go
 rename pkg/testing/tools/{ => estools}/elasticsearch.go (99%)
 rename pkg/testing/tools/{agents.go => fleettools/fleet.go} (82%)

diff --git a/pkg/component/runtime/manager_test.go b/pkg/component/runtime/manager_test.go
index d00df10db64..cbce15b13c5 100644
--- a/pkg/component/runtime/manager_test.go
+++ b/pkg/component/runtime/manager_test.go
@@ -1371,52 +1371,59 @@ func TestManager_FakeInput_GoodUnitToBad(t *testing.T) {
 				t.Logf("component state changed: %+v", state)
 				if state.State == client.UnitStateFailed {
 					subErrCh <- fmt.Errorf("component failed: %s", state.Message)
+					return
+				}
+
+				unit, ok := state.Units[ComponentUnitKey{UnitType: client.UnitTypeInput, UnitID: "good-input"}]
+				if !ok {
+					subErrCh <- errors.New("unit missing: good-input")
+					return
+				}
+				if unitGood {
+					switch unit.State {
+					case client.UnitStateFailed:
+						subErrCh <- fmt.Errorf("unit failed: %s", unit.Message)
+						return
+					case client.UnitStateHealthy:
+						// good unit it; now make it bad
+						t.Logf("marking good-input as having a hard-error for config")
+						updatedComp := comp
+						updatedComp.Units = make([]component.Unit, len(comp.Units))
+						copy(updatedComp.Units, comp.Units)
+						updatedComp.Units[1] = component.Unit{
+							ID:   "good-input",
+							Type: client.UnitTypeInput,
+							Err:  errors.New("hard-error for config"),
+						}
+						unitGood = false
+						err := m.Update(component.Model{Components: []component.Component{updatedComp}})
+						if err != nil {
+							subErrCh <- err
+						}
+					case client.UnitStateStarting:
+						// acceptable
+					default:
+						// unknown state that should not have occurred
+						subErrCh <- fmt.Errorf("unit reported unexpected state: %v", unit.State)
+					}
 				} else {
-					unit, ok := state.Units[ComponentUnitKey{UnitType: client.UnitTypeInput, UnitID: "good-input"}]
-					if ok {
-						if unitGood {
-							if unit.State == client.UnitStateFailed {
-								subErrCh <- fmt.Errorf("unit failed: %s", unit.Message)
-							} else if unit.State == client.UnitStateHealthy {
-								// good unit it; now make it bad
-								t.Logf("marking good-input as having a hard-error for config")
-								updatedComp := comp
-								updatedComp.Units = make([]component.Unit, len(comp.Units))
-								copy(updatedComp.Units, comp.Units)
-								updatedComp.Units[1] = component.Unit{
-									ID:   "good-input",
-									Type: client.UnitTypeInput,
-									Err:  errors.New("hard-error for config"),
-								}
-								unitGood = false
-								err := m.Update(component.Model{Components: []component.Component{updatedComp}})
-								if err != nil {
-									subErrCh <- err
-								}
-							} else if unit.State == client.UnitStateStarting {
-								// acceptable
-							} else {
-								// unknown state that should not have occurred
-								subErrCh <- fmt.Errorf("unit reported unexpected state: %v", unit.State)
-							}
-						} else {
-							if unit.State == client.UnitStateFailed {
-								// went to failed; stop whole component
-								err := m.Update(component.Model{Components: []component.Component{}})
-								if err != nil {
-									subErrCh <- err
-								}
-							} else if unit.State == client.UnitStateStopped {
-								// unit was stopped
-								subErrCh <- nil
-							} else {
-								subErrCh <- errors.New("good-input unit should be either failed or stopped")
-							}
+					switch unit.State {
+					case client.UnitStateFailed:
+						// went to failed; stop whole component
+						err := m.Update(component.Model{Components: []component.Component{}})
+						if err != nil {
+							subErrCh <- err
 						}
-					} else {
-						subErrCh <- errors.New("unit missing: good-input")
+					case client.UnitStateStopped:
+						// unit was stopped
+						subErrCh <- nil
+					default:
+						subErrCh <- fmt.Errorf(
+							"good-input unit should be either failed or stopped, but it's '%s'",
+							unit.State)
 					}
 				}
+
 			}
 		}
 	}()
diff --git a/pkg/testing/fixture_install.go b/pkg/testing/fixture_install.go
index 2e515213533..9ec684a811b 100644
--- a/pkg/testing/fixture_install.go
+++ b/pkg/testing/fixture_install.go
@@ -84,8 +84,12 @@ func (i InstallOpts) toCmdArgs() []string {
 	return args
 }
 
-// Install installs the prepared Elastic Agent binary and returns:
-//   - the combined output of stdout and stderr
+// Install installs the prepared Elastic Agent binary and registers a t.Cleanup
+// function to uninstall the agent if it hasn't been uninstalled. It also takes
+// care of collecting a diagnostics when AGENT_COLLECT_DIAG=true or the test
+// has failed.
+// It returns:
+//   - the combined output of Install command stdout and stderr
 //   - an error if any.
 func (f *Fixture) Install(ctx context.Context, installOpts *InstallOpts, opts ...process.CmdOption) ([]byte, error) {
 	f.t.Logf("[test %s] Inside fixture install function", f.t.Name())
@@ -134,12 +138,12 @@ func (f *Fixture) Install(ctx context.Context, installOpts *InstallOpts, opts ..
 
 		// environment variable AGENT_KEEP_INSTALLED=true will skip the uninstall
 		// useful to debug the issue with the Elastic Agent
-		if f.t.Failed() && keepInstalled() {
+		if f.t.Failed() && keepInstalledFlag() {
 			f.t.Logf("skipping uninstall; test failed and AGENT_KEEP_INSTALLED=true")
 			return
 		}
 
-		if keepInstalled() {
+		if keepInstalledFlag() {
 			f.t.Logf("ignoring AGENT_KEEP_INSTALLED=true as test succeeded, " +
 				"keeping the agent installed will jeopardise other tests")
 		}
@@ -327,7 +331,7 @@ func collectDiagFlag() bool {
 	return v
 }
 
-func keepInstalled() bool {
+func keepInstalledFlag() bool {
 	// failure reports false (ignore error)
 	v, _ := strconv.ParseBool(os.Getenv("AGENT_KEEP_INSTALLED"))
 	return v
diff --git a/pkg/testing/tools/artifacts_api.go b/pkg/testing/tools/artifacts_api.go
index 84fdf226df1..0e91d212af3 100644
--- a/pkg/testing/tools/artifacts_api.go
+++ b/pkg/testing/tools/artifacts_api.go
@@ -135,7 +135,7 @@ type ArtifactAPIClient struct {
 	url string
 }
 
-// Creates a new Artifact API client
+// NewArtifactAPIClient creates a new Artifact API client
 func NewArtifactAPIClient(opts ...ArtifactAPIClientOpt) *ArtifactAPIClient {
 	c := &ArtifactAPIClient{
 		url: defaultArtifactAPIURL,
@@ -204,7 +204,7 @@ func (aac ArtifactAPIClient) GetBuildDetails(ctx context.Context, version string
 	return checkResponseAndUnmarshal[BuildDetails](resp)
 }
 
-func (aac *ArtifactAPIClient) composeURL(relativePath string) (string, error) {
+func (aac ArtifactAPIClient) composeURL(relativePath string) (string, error) {
 	joinedURL, err := url.JoinPath(aac.url, relativePath)
 	if err != nil {
 		return "", fmt.Errorf("composing URL with %q %q: %w", aac.url, relativePath, err)
@@ -213,7 +213,7 @@ func (aac *ArtifactAPIClient) composeURL(relativePath string) (string, error) {
 	return joinedURL, nil
 }
 
-func (aac *ArtifactAPIClient) createAndPerformRequest(ctx context.Context, URL string) (*http.Response, error) {
+func (aac ArtifactAPIClient) createAndPerformRequest(ctx context.Context, URL string) (*http.Response, error) {
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, URL, nil)
 	if err != nil {
 		err = fmt.Errorf("composing request: %w", err)
diff --git a/pkg/testing/tools/check/check.go b/pkg/testing/tools/check/check.go
index b5815fce255..03f95b3e17e 100644
--- a/pkg/testing/tools/check/check.go
+++ b/pkg/testing/tools/check/check.go
@@ -11,14 +11,16 @@ import (
 
 	"github.com/stretchr/testify/assert"
 
+	"github.com/elastic/elastic-agent-libs/kibana"
 	"github.com/elastic/elastic-agent/pkg/control/v2/cproto"
 	integrationtest "github.com/elastic/elastic-agent/pkg/testing"
+	"github.com/elastic/elastic-agent/pkg/testing/tools/fleettools"
 )
 
 // ConnectedToFleet checks if the agent defined in the fixture is connected to
 // Fleet Server. It uses assert.Eventually and if it fails the last error will
 // be printed. It returns if the agent is connected to Fleet Server or not.
-func ConnectedToFleet(t *testing.T, fixture *integrationtest.Fixture) bool {
+func ConnectedToFleet(t *testing.T, fixture *integrationtest.Fixture, timeout time.Duration) bool {
 	t.Helper()
 
 	var err error
@@ -28,7 +30,7 @@ func ConnectedToFleet(t *testing.T, fixture *integrationtest.Fixture) bool {
 		return agentStatus.FleetState == int(cproto.State_HEALTHY)
 	}
 
-	connected := assert.Eventually(t, assertFn, 5*time.Minute, 5*time.Second,
+	connected := assert.Eventually(t, assertFn, timeout, 5*time.Second,
 		"want fleet state %s, got %s. agent status: %v",
 		cproto.State_HEALTHY, cproto.State(agentStatus.FleetState), agentStatus)
 
@@ -39,3 +41,26 @@ func ConnectedToFleet(t *testing.T, fixture *integrationtest.Fixture) bool {
 
 	return connected
 }
+
+// FleetAgentStatus returns a niladic function that returns true if the agent
+// has reached expectedStatus; false otherwise. The returned function is intended
+// for use with assert.Eventually or require.Eventually.
+func FleetAgentStatus(t *testing.T,
+	client *kibana.Client,
+	policyID,
+	expectedStatus string) func() bool {
+	return func() bool {
+		currentStatus, err := fleettools.GetAgentStatus(client, policyID)
+		if err != nil {
+			t.Errorf("unable to determine agent status: %s", err.Error())
+			return false
+		}
+
+		if currentStatus == expectedStatus {
+			return true
+		}
+
+		t.Logf("Agent fleet status: %s", currentStatus)
+		return false
+	}
+}
diff --git a/pkg/testing/tools/cmd.go b/pkg/testing/tools/cmd.go
deleted file mode 100644
index 9277c3ea5c6..00000000000
--- a/pkg/testing/tools/cmd.go
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
-// or more contributor license agreements. Licensed under the Elastic License;
-// you may not use this file except in compliance with the Elastic License.
-
-package tools
-
-import (
-	"context"
-	"time"
-
-	atesting "github.com/elastic/elastic-agent/pkg/testing"
-)
-
-// InstallAgent force install the Elastic Agent through agentFixture.
-func InstallAgent(ctx context.Context, installOpts atesting.InstallOpts, agentFixture *atesting.Fixture) ([]byte, error) {
-	// 2 minute timeout, to ensure that it at least doesn't get stuck.
-	ctx, cancel := context.WithTimeout(ctx, 2*time.Minute)
-	defer cancel()
-	return agentFixture.Install(ctx, &installOpts)
-}
-
-// InstallStandaloneAgent force install the Elastic Agent through agentFixture.
-func InstallStandaloneAgent(ctx context.Context, agentFixture *atesting.Fixture) ([]byte, error) {
-	// 2 minute timeout, to ensure that it at least doesn't get stuck.
-	ctx, cancel := context.WithTimeout(ctx, 2*time.Minute)
-	defer cancel()
-	installOpts := atesting.InstallOpts{
-		NonInteractive: true,
-		Force:          true,
-	}
-	return agentFixture.Install(ctx, &installOpts)
-}
diff --git a/pkg/testing/tools/elasticsearch.go b/pkg/testing/tools/estools/elasticsearch.go
similarity index 99%
rename from pkg/testing/tools/elasticsearch.go
rename to pkg/testing/tools/estools/elasticsearch.go
index 0d0b040d4dd..8cd6e126597 100644
--- a/pkg/testing/tools/elasticsearch.go
+++ b/pkg/testing/tools/estools/elasticsearch.go
@@ -2,7 +2,7 @@
 // or more contributor license agreements. Licensed under the Elastic License;
 // you may not use this file except in compliance with the Elastic License.
 
-package tools
+package estools
 
 import (
 	"bytes"
diff --git a/pkg/testing/tools/agents.go b/pkg/testing/tools/fleettools/fleet.go
similarity index 82%
rename from pkg/testing/tools/agents.go
rename to pkg/testing/tools/fleettools/fleet.go
index e440a927846..baa1ca659c0 100644
--- a/pkg/testing/tools/agents.go
+++ b/pkg/testing/tools/fleettools/fleet.go
@@ -2,21 +2,15 @@
 // or more contributor license agreements. Licensed under the Elastic License;
 // you may not use this file except in compliance with the Elastic License.
 
-package tools
+package fleettools
 
 import (
 	"context"
 	"errors"
 	"fmt"
 	"os"
-	"testing"
-	"time"
-
-	"github.com/stretchr/testify/require"
 
 	"github.com/elastic/elastic-agent-libs/kibana"
-	"github.com/elastic/elastic-agent/pkg/control/v2/client"
-	"github.com/elastic/elastic-agent/pkg/control/v2/cproto"
 )
 
 // GetAgentByPolicyIDAndHostnameFromList get an agent by the local_metadata.host.name property, reading from the agents list
@@ -47,6 +41,14 @@ func GetAgentByPolicyIDAndHostnameFromList(client *kibana.Client, policyID, host
 	return hostnameAgents[0], nil
 }
 
+func GetAgentIDByHostname(client *kibana.Client, policyID, hostname string) (string, error) {
+	agent, err := GetAgentByPolicyIDAndHostnameFromList(client, policyID, hostname)
+	if err != nil {
+		return "", err
+	}
+	return agent.Agent.ID, nil
+}
+
 func GetAgentStatus(client *kibana.Client, policyID string) (string, error) {
 	hostname, err := os.Hostname()
 	if err != nil {
@@ -97,15 +99,8 @@ func UnEnrollAgent(client *kibana.Client, policyID string) error {
 	return nil
 }
 
-func GetAgentIDByHostname(client *kibana.Client, policyID, hostname string) (string, error) {
-	agent, err := GetAgentByPolicyIDAndHostnameFromList(client, policyID, hostname)
-	if err != nil {
-		return "", err
-	}
-	return agent.Agent.ID, nil
-}
-
 func UpgradeAgent(client *kibana.Client, policyID, version string, force bool) error {
+	// TODO: fix me: this does not work if FQDN is enabled
 	hostname, err := os.Hostname()
 	if err != nil {
 		return err
@@ -128,7 +123,7 @@ func UpgradeAgent(client *kibana.Client, policyID, version string, force bool) e
 	return nil
 }
 
-func GetDefaultFleetServerURL(client *kibana.Client) (string, error) {
+func DefaultURL(client *kibana.Client) (string, error) {
 	req := kibana.ListFleetServerHostsRequest{}
 	resp, err := client.ListFleetServerHosts(context.Background(), req)
 	if err != nil {
@@ -144,23 +139,5 @@ func GetDefaultFleetServerURL(client *kibana.Client) (string, error) {
 		}
 	}
 
-	return "", errors.New("unable to determine default fleet server host")
-}
-
-func WaitForAgent(ctx context.Context, t *testing.T, c client.Client) {
-	require.Eventually(t, func() bool {
-		err := c.Connect(ctx)
-		if err != nil {
-			t.Logf("connecting client to agent: %v", err)
-			return false
-		}
-		defer c.Disconnect()
-		state, err := c.State(ctx)
-		if err != nil {
-			t.Logf("error getting the agent state: %v", err)
-			return false
-		}
-		t.Logf("agent state: %+v", state)
-		return state.State == cproto.State_HEALTHY
-	}, 2*time.Minute, 10*time.Second, "Agent never became healthy")
+	return "", errors.New("unable to determine default fleet server URL")
 }
diff --git a/pkg/testing/tools/tools.go b/pkg/testing/tools/tools.go
index 6972d5228f6..666657a5795 100644
--- a/pkg/testing/tools/tools.go
+++ b/pkg/testing/tools/tools.go
@@ -10,50 +10,39 @@ import (
 	"testing"
 	"time"
 
+	"github.com/stretchr/testify/assert"
+
 	"github.com/elastic/elastic-agent-libs/kibana"
 	atesting "github.com/elastic/elastic-agent/pkg/testing"
-
-	"github.com/stretchr/testify/require"
+	"github.com/elastic/elastic-agent/pkg/testing/tools/check"
+	"github.com/elastic/elastic-agent/pkg/testing/tools/fleettools"
 )
 
-// WaitForAgentStatus returns a niladic function that returns true if the agent
-// has reached expectedStatus; false otherwise. The returned function is intended
-// for use with assert.Eventually or require.Eventually.
-func WaitForAgentStatus(t *testing.T, client *kibana.Client, policyID string, expectedStatus string) func() bool {
-	return func() bool {
-		currentStatus, err := GetAgentStatus(client, policyID)
-		if err != nil {
-			t.Errorf("unable to determine agent status: %s", err.Error())
-			return false
-		}
-
-		if currentStatus == expectedStatus {
-			return true
-		}
-
-		t.Logf("Agent status: %s", currentStatus)
-		return false
-	}
-}
-
-// WaitForPolicyRevision returns a niladic function that returns true if the
+// IsPolicyRevision returns a niladic function that returns true if the
 // given agent's policy revision has reached the given policy revision; false
 // otherwise. The returned function is intended
 // for use with assert.Eventually or require.Eventually.
-func WaitForPolicyRevision(t *testing.T, client *kibana.Client, agentID string, expectedPolicyRevision int) func() bool {
+func IsPolicyRevision(t *testing.T, client *kibana.Client, agentID string, policyRevision int) func() bool {
 	return func() bool {
 		getAgentReq := kibana.GetAgentRequest{ID: agentID}
 		updatedPolicyAgent, err := client.GetAgent(context.Background(), getAgentReq)
-		require.NoError(t, err)
+		if err != nil {
+			t.Logf("failed to get agent document to check policy revision: %v", err)
+			return false
+		}
 
-		return updatedPolicyAgent.PolicyRevision == expectedPolicyRevision
+		return updatedPolicyAgent.PolicyRevision == policyRevision
 	}
 }
 
 // InstallAgentWithPolicy creates the given policy, enrolls the given agent
 // fixture in Fleet using the default Fleet Server, waits for the agent to be
 // online, and returns the created policy.
-func InstallAgentWithPolicy(ctx context.Context, t *testing.T, installOpts atesting.InstallOpts, agentFixture *atesting.Fixture, kibClient *kibana.Client, createPolicyReq kibana.AgentPolicy) (kibana.PolicyResponse, error) {
+func InstallAgentWithPolicy(ctx context.Context, t *testing.T,
+	installOpts atesting.InstallOpts,
+	agentFixture *atesting.Fixture,
+	kibClient *kibana.Client,
+	createPolicyReq kibana.AgentPolicy) (kibana.PolicyResponse, error) {
 	t.Helper()
 
 	// Create policy
@@ -110,7 +99,7 @@ func InstallAgentForPolicy(ctx context.Context, t *testing.T,
 	}
 
 	// Get default Fleet Server URL
-	fleetServerURL, err := GetDefaultFleetServerURL(kibClient)
+	fleetServerURL, err := fleettools.DefaultURL(kibClient)
 	if err != nil {
 		return fmt.Errorf("unable to get default Fleet Server URL: %w", err)
 	}
@@ -121,7 +110,8 @@ func InstallAgentForPolicy(ctx context.Context, t *testing.T,
 		URL:             fleetServerURL,
 		EnrollmentToken: enrollmentToken.APIKey,
 	}
-	output, err := InstallAgent(ctx, installOpts, agentFixture)
+
+	output, err := agentFixture.Install(ctx, &installOpts)
 	if err != nil {
 		t.Log(string(output))
 		return fmt.Errorf("unable to enroll Elastic Agent: %w", err)
@@ -132,10 +122,18 @@ func InstallAgentForPolicy(ctx context.Context, t *testing.T,
 	if deadline, ok := ctx.Deadline(); ok {
 		timeout = time.Until(deadline)
 	}
+
+	assert.Eventually(
+		t,
+		check.FleetAgentStatus(t, kibClient, policyID, "online"),
+		timeout,
+		10*time.Second,
+		"Elastic Agent status is not online",
+	)
 	// Wait for Agent to be healthy
-	require.Eventually(
+	assert.Eventually(
 		t,
-		WaitForAgentStatus(t, kibClient, policyID, "online"),
+		check.FleetAgentStatus(t, kibClient, policyID, "online"),
 		timeout,
 		10*time.Second,
 		"Elastic Agent status is not online",
@@ -143,3 +141,12 @@ func InstallAgentForPolicy(ctx context.Context, t *testing.T,
 
 	return nil
 }
+
+// InstallStandaloneAgent force install the Elastic Agent through agentFixture.
+func InstallStandaloneAgent(agentFixture *atesting.Fixture) ([]byte, error) {
+	installOpts := atesting.InstallOpts{
+		NonInteractive: true,
+		Force:          true,
+	}
+	return agentFixture.Install(context.Background(), &installOpts)
+}
diff --git a/testing/integration/diagnostics_test.go b/testing/integration/diagnostics_test.go
index 9b181c0075d..00d8d97aed0 100644
--- a/testing/integration/diagnostics_test.go
+++ b/testing/integration/diagnostics_test.go
@@ -88,7 +88,7 @@ type componentAndUnitNames struct {
 
 func TestDiagnosticsOptionalValues(t *testing.T) {
 	define.Require(t, define.Requirements{
-		Local: true,
+		Local: false,
 	})
 
 	fixture, err := define.NewFixture(t, define.Version())
@@ -113,7 +113,7 @@ func TestDiagnosticsOptionalValues(t *testing.T) {
 
 func TestDiagnosticsCommand(t *testing.T) {
 	define.Require(t, define.Requirements{
-		Local: true,
+		Local: false,
 	})
 
 	f, err := define.NewFixture(t, define.Version())
diff --git a/testing/integration/endpoint_security_test.go b/testing/integration/endpoint_security_test.go
index 04ff8fbedb4..f3943604c1d 100644
--- a/testing/integration/endpoint_security_test.go
+++ b/testing/integration/endpoint_security_test.go
@@ -19,11 +19,10 @@ import (
 	"text/template"
 	"time"
 
+	"github.com/google/uuid"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 
-	"github.com/google/uuid"
-
 	"github.com/elastic/elastic-agent-libs/kibana"
 	"github.com/elastic/elastic-agent/internal/pkg/agent/application/paths"
 	"github.com/elastic/elastic-agent/pkg/control/v2/client"
@@ -31,6 +30,7 @@ import (
 	atesting "github.com/elastic/elastic-agent/pkg/testing"
 	"github.com/elastic/elastic-agent/pkg/testing/define"
 	"github.com/elastic/elastic-agent/pkg/testing/tools"
+	"github.com/elastic/elastic-agent/pkg/testing/tools/fleettools"
 	"github.com/elastic/elastic-agent/pkg/testing/tools/testcontext"
 )
 
@@ -90,6 +90,60 @@ func TestInstallAndCLIUninstallWithEndpointSecurity(t *testing.T) {
 	}
 }
 
+// Tests that the agent can install and uninstall the endpoint-security service while remaining
+// healthy. In this case endpoint-security is uninstalled because the agent was unenrolled, which
+// triggers the creation of an empty agent policy removing all inputs (only when not force
+// unenrolling). The empty agent policy triggers the uninstall of endpoint because endpoint was
+// removed from the policy.
+//
+// Like the CLI uninstall test, the agent is uninstalled from the command line at the end of the test
+// but at this point endpoint is already uninstalled.
+func TestInstallAndUnenrollWithEndpointSecurity(t *testing.T) {
+	info := define.Require(t, define.Requirements{
+		Stack:   &define.Stack{},
+		Local:   false, // requires Agent installation
+		Isolate: false,
+		Sudo:    true, // requires Agent installation
+		OS: []define.OS{
+			{Type: define.Linux},
+		},
+	})
+
+	for _, tc := range protectionTests {
+		t.Run(tc.name, func(t *testing.T) {
+			testInstallAndUnenrollWithEndpointSecurity(t, info, tc.protected)
+		})
+	}
+}
+
+// Tests that the agent can install and uninstall the endpoint-security service
+// after the Elastic Defend integration was removed from the policy
+// while remaining healthy.
+//
+// Installing endpoint-security requires a Fleet managed agent with the Elastic Defend integration
+// installed. The endpoint-security service is uninstalled the Elastic Defend integration was removed from the policy.
+//
+// Like the CLI uninstall test, the agent is uninstalled from the command line at the end of the test
+// but at this point endpoint should be already uninstalled.
+
+func TestInstallWithEndpointSecurityAndRemoveEndpointIntegration(t *testing.T) {
+	info := define.Require(t, define.Requirements{
+		Stack:   &define.Stack{},
+		Local:   false, // requires Agent installation
+		Isolate: false,
+		Sudo:    true, // requires Agent installation
+		OS: []define.OS{
+			{Type: define.Linux},
+		},
+	})
+
+	for _, tc := range protectionTests {
+		t.Run(tc.name, func(t *testing.T) {
+			testInstallWithEndpointSecurityAndRemoveEndpointIntegration(t, info, tc.protected)
+		})
+	}
+}
+
 // buildPolicyWithTamperProtection helper function to build the policy request with or without tamper protection
 func buildPolicyWithTamperProtection(policy kibana.AgentPolicy, protected bool) kibana.AgentPolicy {
 	if protected {
@@ -109,7 +163,7 @@ func testInstallAndCLIUninstallWithEndpointSecurity(t *testing.T, info *define.I
 
 	// Get path to agent executable.
 	fixture, err := define.NewFixture(t, define.Version())
-	require.NoError(t, err)
+	require.NoError(t, err, "could not create agent fixture")
 
 	t.Log("Enrolling the agent in Fleet")
 	policyUUID := uuid.New().String()
@@ -138,7 +192,7 @@ func testInstallAndCLIUninstallWithEndpointSecurity(t *testing.T, info *define.I
 
 	t.Cleanup(func() {
 		t.Log("Un-enrolling Elastic Agent...")
-		assert.NoError(t, tools.UnEnrollAgent(info.KibanaClient, policy.ID))
+		assert.NoError(t, fleettools.UnEnrollAgent(info.KibanaClient, policy.ID))
 	})
 
 	t.Log("Installing Elastic Defend")
@@ -151,7 +205,7 @@ func testInstallAndCLIUninstallWithEndpointSecurity(t *testing.T, info *define.I
 
 	agentClient := fixture.Client()
 	err = agentClient.Connect(ctx)
-	require.NoError(t, err)
+	require.NoError(t, err, "could not connect to local agent")
 
 	require.Eventually(t,
 		func() bool { return agentAndEndpointAreHealthy(t, ctx, agentClient) },
@@ -162,32 +216,6 @@ func testInstallAndCLIUninstallWithEndpointSecurity(t *testing.T, info *define.I
 	t.Log("Verified endpoint component and units are healthy")
 }
 
-// Tests that the agent can install and uninstall the endpoint-security service while remaining
-// healthy. In this case endpoint-security is uninstalled because the agent was unenrolled, which
-// triggers the creation of an empty agent policy removing all inputs (only when not force
-// unenrolling). The empty agent policy triggers the uninstall of endpoint because endpoint was
-// removed from the policy.
-//
-// Like the CLI uninstall test, the agent is uninstalled from the command line at the end of the test
-// but at this point endpoint is already uninstalled.
-func TestInstallAndUnenrollWithEndpointSecurity(t *testing.T) {
-	info := define.Require(t, define.Requirements{
-		Stack:   &define.Stack{},
-		Local:   false, // requires Agent installation
-		Isolate: false,
-		Sudo:    true, // requires Agent installation
-		OS: []define.OS{
-			{Type: define.Linux},
-		},
-	})
-
-	for _, tc := range protectionTests {
-		t.Run(tc.name, func(t *testing.T) {
-			testInstallAndUnenrollWithEndpointSecurity(t, info, tc.protected)
-		})
-	}
-}
-
 func testInstallAndUnenrollWithEndpointSecurity(t *testing.T, info *define.Info, protected bool) {
 	// Get path to agent executable.
 	fixture, err := define.NewFixture(t, define.Version())
@@ -244,7 +272,7 @@ func testInstallAndUnenrollWithEndpointSecurity(t *testing.T, info *define.Info,
 	hostname, err := os.Hostname()
 	require.NoError(t, err)
 
-	agentID, err := tools.GetAgentIDByHostname(info.KibanaClient, policy.ID, hostname)
+	agentID, err := fleettools.GetAgentIDByHostname(info.KibanaClient, policy.ID, hostname)
 	require.NoError(t, err)
 
 	_, err = info.KibanaClient.UnEnrollAgent(ctx, kibana.UnEnrollAgentRequest{ID: agentID})
@@ -299,34 +327,6 @@ func testInstallAndUnenrollWithEndpointSecurity(t *testing.T, info *define.Info,
 	}
 }
 
-// Tests that the agent can install and uninstall the endpoint-security service
-// after the Elastic Defend integration was removed from the policy
-// while remaining healthy.
-//
-// Installing endpoint-security requires a Fleet managed agent with the Elastic Defend integration
-// installed. The endpoint-security service is uninstalled the Elastic Defend integration was removed from the policy.
-//
-// Like the CLI uninstall test, the agent is uninstalled from the command line at the end of the test
-// but at this point endpoint should be already uninstalled.
-
-func TestInstallWithEndpointSecurityAndRemoveEndpointIntegration(t *testing.T) {
-	info := define.Require(t, define.Requirements{
-		Stack:   &define.Stack{},
-		Local:   false, // requires Agent installation
-		Isolate: false,
-		Sudo:    true, // requires Agent installation
-		OS: []define.OS{
-			{Type: define.Linux},
-		},
-	})
-
-	for _, tc := range protectionTests {
-		t.Run(tc.name, func(t *testing.T) {
-			testInstallWithEndpointSecurityAndRemoveEndpointIntegration(t, info, tc.protected)
-		})
-	}
-}
-
 func testInstallWithEndpointSecurityAndRemoveEndpointIntegration(t *testing.T, info *define.Info, protected bool) {
 	// Get path to agent executable.
 	fixture, err := define.NewFixture(t, define.Version())
@@ -404,7 +404,29 @@ func testInstallWithEndpointSecurityAndRemoveEndpointIntegration(t *testing.T, i
 		}
 
 		t.Log("Found directory", f.Name())
-		require.False(t, strings.Contains(f.Name(), "Endpoint"), "Endpoint directory was not removed")
+		// If Endpoint was not currently removed, let's see what was left
+		if strings.Contains(f.Name(), "Endpoint") {
+			info, err := f.Info()
+			if err != nil {
+				t.Logf("could not get file info for %q to check what was left"+
+					"behind: %v", f.Name(), err)
+			}
+			ls, err := os.ReadDir(info.Name())
+			if err != nil {
+				t.Logf("could not list fileson for %q to check what was left"+
+					"behind: %v", f.Name(), err)
+			}
+			var dirEntries []string
+			for _, de := range ls {
+				dirEntries = append(dirEntries, de.Name())
+			}
+
+			if len(dirEntries) == 0 {
+				t.Fatalf("Endpoint directory was not removed, but it's empty")
+			}
+			t.Fatalf("Endpoint directory was not removed, the directory content is: %s",
+				strings.Join(dirEntries, ", "))
+		}
 	}
 }
 
@@ -545,7 +567,7 @@ func agentAndEndpointAreHealthy(t *testing.T, ctx context.Context, agentClient c
 	}
 
 	if state.State != client.Healthy {
-		t.Logf("Agent is not Healthy\n%+v", state)
+		t.Logf("local Agent is not Healthy: current state: %+v", state)
 		return false
 	}
 
@@ -554,7 +576,7 @@ func agentAndEndpointAreHealthy(t *testing.T, ctx context.Context, agentClient c
 	for _, comp := range state.Components {
 		isEndpointComponent := strings.Contains(comp.Name, "endpoint")
 		if comp.State != client.Healthy {
-			t.Logf("Component is not Healthy\n%+v", comp)
+			t.Logf("endpoint component is not Healthy: current state: %+v", comp)
 			return false
 		}
 
@@ -569,7 +591,7 @@ func agentAndEndpointAreHealthy(t *testing.T, ctx context.Context, agentClient c
 			}
 
 			if unit.State != client.Healthy {
-				t.Logf("Unit is not Healthy\n%+v", unit)
+				t.Logf("unit %q is not Healthy\n%+v", unit.UnitID, unit)
 				return false
 			}
 		}
@@ -577,7 +599,7 @@ func agentAndEndpointAreHealthy(t *testing.T, ctx context.Context, agentClient c
 
 	// Ensure both the endpoint input and output units were found and healthy.
 	if !foundEndpointInputUnit || !foundEndpointOutputUnit {
-		t.Logf("State did not contain endpoint units!\n%+v", state)
+		t.Logf("State did not contain endpoint units. state: %+v", state)
 		return false
 	}
 
diff --git a/testing/integration/fqdn_test.go b/testing/integration/fqdn_test.go
index d1c5142acf5..9bb1b34de89 100644
--- a/testing/integration/fqdn_test.go
+++ b/testing/integration/fqdn_test.go
@@ -19,16 +19,15 @@ import (
 	"testing"
 	"time"
 
-	"github.com/elastic/go-elasticsearch/v8"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 
 	"github.com/elastic/elastic-agent-libs/kibana"
-
 	atesting "github.com/elastic/elastic-agent/pkg/testing"
 	"github.com/elastic/elastic-agent/pkg/testing/define"
 	"github.com/elastic/elastic-agent/pkg/testing/tools"
-
-	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
+	"github.com/elastic/elastic-agent/pkg/testing/tools/fleettools"
+	"github.com/elastic/go-elasticsearch/v8"
 )
 
 func TestFQDN(t *testing.T) {
@@ -90,7 +89,7 @@ func TestFQDN(t *testing.T) {
 
 	t.Cleanup(func() {
 		t.Log("Un-enrolling Elastic Agent...")
-		assert.NoError(t, tools.UnEnrollAgent(info.KibanaClient, policy.ID))
+		assert.NoError(t, fleettools.UnEnrollAgent(info.KibanaClient, policy.ID))
 
 		t.Log("Restoring hostname...")
 		err := setHostname(context.Background(), origHostname, t.Log)
@@ -127,7 +126,7 @@ func TestFQDN(t *testing.T) {
 	expectedAgentPolicyRevision := agent.PolicyRevision + 1
 	require.Eventually(
 		t,
-		tools.WaitForPolicyRevision(t, kibClient, agent.ID, expectedAgentPolicyRevision),
+		tools.IsPolicyRevision(t, kibClient, agent.ID, expectedAgentPolicyRevision),
 		2*time.Minute,
 		1*time.Second,
 	)
@@ -158,7 +157,7 @@ func TestFQDN(t *testing.T) {
 	expectedAgentPolicyRevision++
 	require.Eventually(
 		t,
-		tools.WaitForPolicyRevision(t, kibClient, agent.ID, expectedAgentPolicyRevision),
+		tools.IsPolicyRevision(t, kibClient, agent.ID, expectedAgentPolicyRevision),
 		2*time.Minute,
 		1*time.Second,
 	)
@@ -182,7 +181,7 @@ func verifyAgentName(t *testing.T, policyID, hostname string, kibClient *kibana.
 	require.Eventually(
 		t,
 		func() bool {
-			agent, err = tools.GetAgentByPolicyIDAndHostnameFromList(kibClient, policyID, hostname)
+			agent, err = fleettools.GetAgentByPolicyIDAndHostnameFromList(kibClient, policyID, hostname)
 			return err == nil && agent != nil
 		},
 		5*time.Minute,
diff --git a/testing/integration/monitoring_logs_test.go b/testing/integration/monitoring_logs_test.go
index 5451b57cb61..97836c7ff3f 100644
--- a/testing/integration/monitoring_logs_test.go
+++ b/testing/integration/monitoring_logs_test.go
@@ -24,6 +24,8 @@ import (
 	"github.com/elastic/elastic-agent/pkg/testing/define"
 	"github.com/elastic/elastic-agent/pkg/testing/tools"
 	"github.com/elastic/elastic-agent/pkg/testing/tools/check"
+	"github.com/elastic/elastic-agent/pkg/testing/tools/estools"
+	"github.com/elastic/elastic-agent/pkg/testing/tools/fleettools"
 )
 
 func TestMonitoringLogsShipped(t *testing.T) {
@@ -69,11 +71,11 @@ func TestMonitoringLogsShipped(t *testing.T) {
 	require.NoError(t, err)
 	t.Logf("created policy: %s", policy.ID)
 
-	check.ConnectedToFleet(t, agentFixture)
+	check.ConnectedToFleet(t, agentFixture, 5*time.Minute)
 
 	// Stage 2: check indices
 	// This is mostly for debugging
-	resp, err := tools.GetAllindicies(info.ESClient)
+	resp, err := estools.GetAllindicies(info.ESClient)
 	require.NoError(t, err)
 	for _, run := range resp {
 		t.Logf("%s: %d/%d deleted: %d\n",
@@ -82,8 +84,8 @@ func TestMonitoringLogsShipped(t *testing.T) {
 
 	// Stage 3: Make sure metricbeat logs are populated
 	t.Log("Making sure metricbeat logs are populated")
-	docs := findESDocs(t, func() (tools.Documents, error) {
-		return tools.GetLogsForDatastream(info.ESClient, "elastic_agent.metricbeat")
+	docs := findESDocs(t, func() (estools.Documents, error) {
+		return estools.GetLogsForDatastream(info.ESClient, "elastic_agent.metricbeat")
 	})
 	require.NotZero(t, len(docs.Hits.Hits))
 	t.Logf("metricbeat: Got %d documents", len(docs.Hits.Hits))
@@ -101,8 +103,8 @@ func TestMonitoringLogsShipped(t *testing.T) {
 
 	// Stage 5: Make sure we have message confirming central management is running
 	t.Log("Making sure we have message confirming central management is running")
-	docs = findESDocs(t, func() (tools.Documents, error) {
-		return tools.FindMatchingLogLines(info.ESClient, info.Namespace,
+	docs = findESDocs(t, func() (estools.Documents, error) {
+		return estools.FindMatchingLogLines(info.ESClient, info.Namespace,
 			"Parsed configuration and determined agent is managed by Fleet")
 	})
 	require.NotZero(t, len(docs.Hits.Hits))
@@ -114,7 +116,7 @@ func TestMonitoringLogsShipped(t *testing.T) {
 		t.Fatalf("could not get hostname to filter Agent: %s", err)
 	}
 
-	agentID, err := tools.GetAgentIDByHostname(info.KibanaClient, policy.ID, hostname)
+	agentID, err := fleettools.GetAgentIDByHostname(info.KibanaClient, policy.ID, hostname)
 	require.NoError(t, err, "could not get Agent ID by hostname")
 	t.Logf("Agent ID: %q", agentID)
 
@@ -122,8 +124,8 @@ func TestMonitoringLogsShipped(t *testing.T) {
 	// this field is not mapped. There is an issue for that:
 	// https://github.com/elastic/integrations/issues/6545
 
-	docs = findESDocs(t, func() (tools.Documents, error) {
-		return tools.GetLogsForAgentID(info.ESClient, agentID)
+	docs = findESDocs(t, func() (estools.Documents, error) {
+		return estools.GetLogsForAgentID(info.ESClient, agentID)
 	})
 	require.NoError(t, err, "could not get logs from Agent ID: %q, err: %s",
 		agentID, err)
@@ -151,8 +153,8 @@ func TestMonitoringLogsShipped(t *testing.T) {
 	}
 }
 
-func findESDocs(t *testing.T, findFn func() (tools.Documents, error)) tools.Documents {
-	var docs tools.Documents
+func findESDocs(t *testing.T, findFn func() (estools.Documents, error)) estools.Documents {
+	var docs estools.Documents
 
 	require.Eventually(
 		t,
diff --git a/testing/integration/proxy_url_test.go b/testing/integration/proxy_url_test.go
index f37d3db945b..e95469405be 100644
--- a/testing/integration/proxy_url_test.go
+++ b/testing/integration/proxy_url_test.go
@@ -130,7 +130,7 @@ func TestProxyURL_EnrollProxyAndNoProxyInThePolicy(t *testing.T) {
 		require.NoError(t, err, "failed to install agent")
 	}
 
-	check.ConnectedToFleet(t, p.fixture)
+	check.ConnectedToFleet(t, p.fixture, 5*time.Minute)
 }
 
 func TestProxyURL_EnrollProxyAndEmptyProxyInThePolicy(t *testing.T) {
@@ -175,7 +175,7 @@ func TestProxyURL_EnrollProxyAndEmptyProxyInThePolicy(t *testing.T) {
 		require.NoError(t, err, "failed to install agent")
 	}
 
-	check.ConnectedToFleet(t, p.fixture)
+	check.ConnectedToFleet(t, p.fixture, 5*time.Minute)
 }
 
 func TestProxyURL_ProxyInThePolicyTakesPrecedence(t *testing.T) {
@@ -220,7 +220,7 @@ func TestProxyURL_ProxyInThePolicyTakesPrecedence(t *testing.T) {
 		require.NoError(t, err, "failed to install agent")
 	}
 
-	check.ConnectedToFleet(t, p.fixture)
+	check.ConnectedToFleet(t, p.fixture, 5*time.Minute)
 
 	// ensure the agent is communicating through the proxy set in the policy
 	want := fleetservertest.NewPathCheckin(p.policyData.AgentID)
@@ -282,7 +282,7 @@ func TestProxyURL_NoEnrollProxyAndProxyInThePolicy(t *testing.T) {
 		require.NoError(t, err, "failed to install agent")
 	}
 
-	check.ConnectedToFleet(t, p.fixture)
+	check.ConnectedToFleet(t, p.fixture, 5*time.Minute)
 
 	// ensure the agent is communicating through the new proxy
 	if !assert.Eventually(t, func() bool {
@@ -343,7 +343,7 @@ func TestProxyURL_RemoveProxyFromThePolicy(t *testing.T) {
 	}
 
 	// assert the agent is actually connected to fleet.
-	check.ConnectedToFleet(t, p.fixture)
+	check.ConnectedToFleet(t, p.fixture, 5*time.Minute)
 
 	// ensure the agent is communicating through the proxy set in the policy
 	if !assert.Eventually(t, func() bool {
@@ -389,7 +389,7 @@ func TestProxyURL_RemoveProxyFromThePolicy(t *testing.T) {
 	assert.Equal(t, inspect.Fleet.ProxyURL, want)
 
 	// assert, again, the agent is actually connected to fleet.
-	check.ConnectedToFleet(t, p.fixture)
+	check.ConnectedToFleet(t, p.fixture, 5*time.Minute)
 }
 
 func (p *ProxyURL) setupFleet(t *testing.T, fleetHost string) {
diff --git a/testing/integration/upgrade_fleet_test.go b/testing/integration/upgrade_fleet_test.go
index b638f4570d5..8ced575fc63 100644
--- a/testing/integration/upgrade_fleet_test.go
+++ b/testing/integration/upgrade_fleet_test.go
@@ -17,10 +17,11 @@ import (
 	"github.com/stretchr/testify/require"
 
 	"github.com/elastic/elastic-agent-libs/kibana"
+	"github.com/elastic/elastic-agent/pkg/testing/tools/check"
+	"github.com/elastic/elastic-agent/pkg/testing/tools/fleettools"
 
 	atesting "github.com/elastic/elastic-agent/pkg/testing"
 	"github.com/elastic/elastic-agent/pkg/testing/define"
-	"github.com/elastic/elastic-agent/pkg/testing/tools"
 	"github.com/elastic/elastic-agent/pkg/version"
 	"github.com/elastic/elastic-agent/testing/upgradetest"
 )
@@ -105,7 +106,7 @@ func testUpgradeFleetManagedElasticAgent(ctx context.Context, t *testing.T, info
 	require.NoError(t, err)
 
 	t.Log("Getting default Fleet Server URL...")
-	fleetServerURL, err := tools.GetDefaultFleetServerURL(kibClient)
+	fleetServerURL, err := fleettools.DefaultURL(kibClient)
 	require.NoError(t, err)
 
 	t.Log("Enrolling Elastic Agent...")
@@ -125,7 +126,7 @@ func testUpgradeFleetManagedElasticAgent(ctx context.Context, t *testing.T, info
 	require.NoError(t, err, "failed to install start agent [output: %s]", string(output))
 	t.Cleanup(func() {
 		t.Log("Un-enrolling Elastic Agent...")
-		assert.NoError(t, tools.UnEnrollAgent(info.KibanaClient, policy.ID))
+		assert.NoError(t, fleettools.UnEnrollAgent(info.KibanaClient, policy.ID))
 	})
 
 	t.Log("Waiting for Agent to be correct version and healthy...")
@@ -133,10 +134,10 @@ func testUpgradeFleetManagedElasticAgent(ctx context.Context, t *testing.T, info
 	require.NoError(t, err)
 
 	t.Log("Waiting for enrolled Agent status to be online...")
-	require.Eventually(t, tools.WaitForAgentStatus(t, kibClient, policy.ID, "online"), 2*time.Minute, 10*time.Second, "Agent status is not online")
+	require.Eventually(t, check.FleetAgentStatus(t, kibClient, policy.ID, "online"), 2*time.Minute, 10*time.Second, "Agent status is not online")
 
 	t.Logf("Upgrading from version %q to version %q...", startParsedVersion, endVersionInfo.Binary.String())
-	err = tools.UpgradeAgent(kibClient, policy.ID, endVersionInfo.Binary.String(), true)
+	err = fleettools.UpgradeAgent(kibClient, policy.ID, endVersionInfo.Binary.String(), true)
 	require.NoError(t, err)
 
 	// wait for the watcher to show up
@@ -150,12 +151,12 @@ func testUpgradeFleetManagedElasticAgent(ctx context.Context, t *testing.T, info
 	require.NoError(t, err)
 
 	t.Log("Waiting for enrolled Agent status to be online...")
-	require.Eventually(t, tools.WaitForAgentStatus(t, kibClient, policy.ID, "online"), 10*time.Minute, 15*time.Second, "Agent status is not online")
+	require.Eventually(t, check.FleetAgentStatus(t, kibClient, policy.ID, "online"), 10*time.Minute, 15*time.Second, "Agent status is not online")
 
 	// wait for version
 	require.Eventually(t, func() bool {
 		t.Log("Getting Agent version...")
-		newVersion, err := tools.GetAgentVersion(kibClient, policy.ID)
+		newVersion, err := fleettools.GetAgentVersion(kibClient, policy.ID)
 		if err != nil {
 			t.Logf("error getting agent version: %v", err)
 			return false

From 160b034eacb718bfbbf441d6c6f6d00a29ec4afc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Paolo=20Chil=C3=A0?= <paolo.chila@elastic.co>
Date: Thu, 5 Oct 2023 17:00:35 +0200
Subject: [PATCH 2/5] Move integration test to ess staging (#3535)

* move default ESS env to staging for integration tests

* Remove region override from integration test buildkite step

* change api key for staging

* fixup! change api key for staging

* fixup! fixup! change api key for staging
---
 .buildkite/hooks/pre-command                  | 5 +++--
 .buildkite/scripts/steps/integration_tests.sh | 2 +-
 magefile.go                                   | 4 ++--
 pkg/testing/ess/client.go                     | 4 ++++
 pkg/testing/ess/config.go                     | 2 +-
 5 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/.buildkite/hooks/pre-command b/.buildkite/hooks/pre-command
index 0fe9b7e3ae6..bb4db5f0948 100755
--- a/.buildkite/hooks/pre-command
+++ b/.buildkite/hooks/pre-command
@@ -17,7 +17,8 @@ DOCKER_REGISTRY="docker.elastic.co"
 DOCKER_REGISTRY_SECRET_PATH="kv/ci-shared/platform-ingest/docker_registry_prod"
 CI_DRA_ROLE_PATH="kv/ci-shared/release/dra-role"
 CI_GCP_OBS_PATH="kv/ci-shared/observability-ingest/cloud/gcp"
-CI_AGENT_QA_OBS_PATH="kv/ci-shared/observability-ingest/elastic-agent-ess-qa"
+# CI_AGENT_QA_OBS_PATH="kv/ci-shared/observability-ingest/elastic-agent-ess-qa"
+CI_ESS_STAGING_PATH="kv/ci-shared/platform-ingest/platform-ingest-ec-staging"
 CI_DRA_ROLE_PATH="kv/ci-shared/release/dra-role"
 
 
@@ -54,7 +55,7 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == "elastic-agent" && "$BUILDKITE_STEP_KEY" ==
   export TEST_INTEG_AUTH_GCP_SERVICE_TOKEN_FILE=$(realpath ./gcp.json)
 
   # ESS credentials
-  export API_KEY_TOKEN=$(vault kv get -field api_key ${CI_AGENT_QA_OBS_PATH})
+  export API_KEY_TOKEN=$(vault kv get -field apiKey ${CI_ESS_STAGING_PATH})
   echo ${API_KEY_TOKEN} > ./apiKey
   export TEST_INTEG_AUTH_ESS_APIKEY_FILE=$(realpath ./apiKey)
 fi
diff --git a/.buildkite/scripts/steps/integration_tests.sh b/.buildkite/scripts/steps/integration_tests.sh
index 9c469841e87..8b1dec76449 100755
--- a/.buildkite/scripts/steps/integration_tests.sh
+++ b/.buildkite/scripts/steps/integration_tests.sh
@@ -18,7 +18,7 @@ AGENT_PACKAGE_VERSION="${OVERRIDE_AGENT_PACKAGE_VERSION}" DEV=true EXTERNAL=true
 
 # Run integration tests
 set +e
-AGENT_VERSION="${OVERRIDE_TEST_AGENT_VERSION}" TEST_INTEG_AUTH_ESS_REGION=azure-eastus2 TEST_INTEG_CLEAN_ON_EXIT=true SNAPSHOT=true mage integration:test
+AGENT_VERSION="${OVERRIDE_TEST_AGENT_VERSION}" TEST_INTEG_CLEAN_ON_EXIT=true SNAPSHOT=true mage integration:test
 TESTS_EXIT_STATUS=$?
 set -e
 
diff --git a/magefile.go b/magefile.go
index 06016bd48ff..183889b3e69 100644
--- a/magefile.go
+++ b/magefile.go
@@ -2094,8 +2094,8 @@ func authESS(ctx context.Context) error {
 
 		fmt.Fprintln(os.Stderr, "❌  ESS authentication unsuccessful. Retrying...")
 
-		prompt := "Please provide a ESS (QA) API key. To get your API key, " +
-			"visit https://console.qa.cld.elstc.co/deployment-features/keys:"
+		prompt := fmt.Sprintf("Please provide a ESS API key for %s. To get your API key, "+
+			"visit %s/deployment-features/keys:", client.BaseURL(), strings.TrimRight(client.BaseURL(), "/api/v1"))
 		essAPIKey, err = stringPrompt(prompt)
 		if err != nil {
 			return fmt.Errorf("unable to read ESS API key from prompt: %w", err)
diff --git a/pkg/testing/ess/client.go b/pkg/testing/ess/client.go
index 42587c6f275..0b618ff586e 100644
--- a/pkg/testing/ess/client.go
+++ b/pkg/testing/ess/client.go
@@ -60,3 +60,7 @@ func (c *Client) doPost(ctx context.Context, relativeUrl, contentType string, bo
 
 	return c.client.Do(req)
 }
+
+func (c *Client) BaseURL() string {
+	return c.config.BaseUrl
+}
diff --git a/pkg/testing/ess/config.go b/pkg/testing/ess/config.go
index 89298dbd341..c90be94caa5 100644
--- a/pkg/testing/ess/config.go
+++ b/pkg/testing/ess/config.go
@@ -18,7 +18,7 @@ type Config struct {
 
 func defaultConfig() *Config {
 	return &Config{
-		BaseUrl: `https://console.qa.cld.elstc.co/api/v1`,
+		BaseUrl: `https://staging.found.no/api/v1`,
 	}
 }
 

From ac2ef57b335ebca28216240508bd773d53009c7a Mon Sep 17 00:00:00 2001
From: Lee E Hinman <57081003+leehinman@users.noreply.github.com>
Date: Thu, 5 Oct 2023 10:01:40 -0500
Subject: [PATCH 3/5] make delayed progress tests more resilient (#3526)

allow progress tests to take more time than expected for CI
---
 internal/pkg/agent/install/progress_test.go | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/internal/pkg/agent/install/progress_test.go b/internal/pkg/agent/install/progress_test.go
index e74f7ff5666..2bb9bb6b1d3 100644
--- a/internal/pkg/agent/install/progress_test.go
+++ b/internal/pkg/agent/install/progress_test.go
@@ -56,7 +56,7 @@ func TestProgress(t *testing.T) {
 
 		rs.Failed()
 
-		require.Regexp(t, regexp.MustCompile(`step 1 starting\.{3}\.+ FAILED\n`), string(w.buf))
+		require.Regexp(t, regexp.MustCompile(`step 1 starting\.{3,}\.+ FAILED\n`), string(w.buf))
 	})
 
 	t.Run("multi_step_immediate_success", func(t *testing.T) {
@@ -93,7 +93,7 @@ func TestProgress(t *testing.T) {
 
 		rs.Succeeded()
 
-		require.Regexp(t, regexp.MustCompile(`step 1 starting\.{3}\.+ DONE\nstep 2 starting\.{3}\.+ DONE`), string(w.buf))
+		require.Regexp(t, regexp.MustCompile(`step 1 starting\.{3,}\.+ DONE\nstep 2 starting\.{3,}\.+ DONE`), string(w.buf))
 	})
 
 	t.Run("single_step_delay_after_failed", func(t *testing.T) {
@@ -110,7 +110,7 @@ func TestProgress(t *testing.T) {
 
 		rs.Failed()
 
-		require.Equal(t, "step 1 starting... FAILED\n", string(w.buf))
+		require.Regexp(t, regexp.MustCompile(`step 1 starting.{3,} FAILED\n`), string(w.buf))
 
 	})
 
@@ -133,6 +133,6 @@ func TestProgress(t *testing.T) {
 
 		rs.Succeeded()
 
-		require.Regexp(t, regexp.MustCompile(`step starting\.{3}\n   substep 1 starting\.{3}\.+ DONE\n   substep 2 starting\.{3}\.+ DONE\n   DONE\n`), string(w.buf))
+		require.Regexp(t, regexp.MustCompile(`step starting\.{3,}\n   substep 1 starting\.{3,}\.+ DONE\n   substep 2 starting\.{3,}\.+ DONE\n   DONE\n`), string(w.buf))
 	})
 }

From eaf4e9c7995258d424ab4608c6ad2f5b972a9696 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Paolo=20Chil=C3=A0?= <paolo.chila@elastic.co>
Date: Fri, 6 Oct 2023 07:51:29 +0200
Subject: [PATCH 4/5] fix integration test docs (#3545)

---
 docs/test-framework-dev-guide.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/test-framework-dev-guide.md b/docs/test-framework-dev-guide.md
index 0821f1a5996..3e7ef9383b8 100644
--- a/docs/test-framework-dev-guide.md
+++ b/docs/test-framework-dev-guide.md
@@ -10,7 +10,7 @@ Go version should be at least the same than the one in [.go-version](https://git
 
 ### Configuration
 
-ESS (QA) API Key to create on https://console.qa.cld.elstc.co/deployment-features/keys
+ESS (staging) API Key to create on https://staging.found.no/account/keys
 
 Warning: if you never created a deployment on it, you won't have permission to get this key, so you will need to create one first.
 

From 33c6934057e80f3a4a3d6f1060345262980e334d Mon Sep 17 00:00:00 2001
From: Anderson Queiroz <anderson.queiroz@elastic.co>
Date: Fri, 6 Oct 2023 10:42:39 +0200
Subject: [PATCH 5/5] fix install/enroll cmd to fail when agent restart fails
 (#3207)

* surface errors that might occur during enroll
* fail install command if agent cannot be restarted
* do not print success message if there was an enroll error. Print an error message and the error instead
* add logs to show the different enroll attempts
* add more context t errors
* refactor internal/pkg/agent/install/perms_unix.go and add more context to errors
restore main version
* ignore agent restart error on enroll tests as there is no agent to be restarted
---
 ...-Surface-errors-during-Agent's-enroll.yaml | 32 +++++++++
 internal/pkg/agent/cmd/enroll.go              |  2 +-
 internal/pkg/agent/cmd/enroll_cmd.go          | 62 +++++++++++-----
 internal/pkg/agent/cmd/enroll_cmd_test.go     | 71 +++++++++++++------
 internal/pkg/agent/cmd/install.go             |  4 +-
 internal/pkg/agent/install/perms_unix.go      | 32 +++++----
 6 files changed, 151 insertions(+), 52 deletions(-)
 create mode 100644 changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml

diff --git a/changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml b/changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml
new file mode 100644
index 00000000000..f8361f99433
--- /dev/null
+++ b/changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml
@@ -0,0 +1,32 @@
+# Kind can be one of:
+# - breaking-change: a change to previously-documented behavior
+# - deprecation: functionality that is being removed in a later release
+# - bug-fix: fixes a problem in a previous version
+# - enhancement: extends functionality but does not break or fix existing behavior
+# - feature: new functionality
+# - known-issue: problems that we are aware of in a given version
+# - security: impacts on the security of a product or a user’s deployment.
+# - upgrade: important information for someone upgrading from a prior version
+# - other: does not fit into any of the other categories
+kind: bug-fix
+
+# Change summary; a 80ish characters long description of the change.
+summary: Surface errors during Agent's enroll process, failing if any happens.
+
+# Long description; in case the summary is not enough to describe the change
+# this field accommodate a description without length limits.
+# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
+#description:
+
+# Affected component; a word indicating the component this changeset affects.
+component: install/enroll
+
+# PR URL; optional; the PR number that added the changeset.
+# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
+# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
+# Please provide it if you are adding a fragment for a different PR.
+pr: https://github.com/elastic/elastic-agent/pull/3207
+
+# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
+# If not present is automatically filled by the tooling with the issue linked to the PR number.
+#issue: https://github.com/owner/repo/1234
diff --git a/internal/pkg/agent/cmd/enroll.go b/internal/pkg/agent/cmd/enroll.go
index 1bce5f7e547..adaa278f32f 100644
--- a/internal/pkg/agent/cmd/enroll.go
+++ b/internal/pkg/agent/cmd/enroll.go
@@ -351,7 +351,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error {
 	//  Error: failed to fix permissions: chown /Library/Elastic/Agent/data/elastic-agent-c13f91/elastic-agent.app: operation not permitted
 	// This is because we are fixing permissions twice, once during installation and again during the enrollment step.
 	// When we are enrolling as part of installation on MacOS, skip the second attempt to fix permissions.
-	var fixPermissions bool = fromInstall
+	fixPermissions := fromInstall
 	if runtime.GOOS == "darwin" {
 		fixPermissions = false
 	}
diff --git a/internal/pkg/agent/cmd/enroll_cmd.go b/internal/pkg/agent/cmd/enroll_cmd.go
index b5992f10188..f43e1483a8a 100644
--- a/internal/pkg/agent/cmd/enroll_cmd.go
+++ b/internal/pkg/agent/cmd/enroll_cmd.go
@@ -172,7 +172,7 @@ func newEnrollCmd(
 	)
 }
 
-// newEnrollCmdWithStore creates an new enrollment and accept a custom store.
+// newEnrollCmdWithStore creates a new enrollment and accept a custom store.
 func newEnrollCmdWithStore(
 	log *logger.Logger,
 	options *enrollCmdOption,
@@ -187,10 +187,11 @@ func newEnrollCmdWithStore(
 	}, nil
 }
 
-// Execute tries to enroll the agent into Fleet.
+// Execute enrolls the agent into Fleet.
 func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
 	var err error
 	defer c.stopAgent() // ensure its stopped no matter what
+
 	span, ctx := apm.StartSpan(ctx, "enroll", "app.internal")
 	defer func() {
 		apm.CaptureError(ctx, err).Send()
@@ -235,7 +236,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
 		// Ensure that the agent does not use a proxy configuration
 		// when connecting to the local fleet server.
 		// Note that when running fleet-server the enroll request will be sent to :8220,
-		// however when the agent is running afterwards requests will be sent to :8221
+		// however when the agent is running afterward requests will be sent to :8221
 		c.remoteConfig.Transport.Proxy.Disable = true
 	}
 
@@ -256,7 +257,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
 
 	err = c.enrollWithBackoff(ctx, persistentConfig)
 	if err != nil {
-		return errors.New(err, "fail to enroll")
+		return fmt.Errorf("fail to enroll: %w", err)
 	}
 
 	if c.options.FixPermissions {
@@ -267,17 +268,23 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
 	}
 
 	defer func() {
-		fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.")
+		if err != nil {
+			fmt.Fprintf(streams.Err, "Something went wrong while enrolling the Elastic Agent: %v\n", err)
+		} else {
+			fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.")
+		}
 	}()
 
 	if c.agentProc == nil {
-		if err := c.daemonReload(ctx); err != nil {
-			c.log.Infow("Elastic Agent might not be running; unable to trigger restart", "error", err)
-		} else {
-			c.log.Info("Successfully triggered restart on running Elastic Agent.")
+		if err = c.daemonReloadWithBackoff(ctx); err != nil {
+			c.log.Errorf("Elastic Agent might not be running; unable to trigger restart: %v", err)
+			return fmt.Errorf("could not reload agent daemon, unable to trigger restart: %w", err)
 		}
+
+		c.log.Info("Successfully triggered restart on running Elastic Agent.")
 		return nil
 	}
+
 	c.log.Info("Elastic Agent has been enrolled; start Elastic Agent")
 	return nil
 }
@@ -443,6 +450,11 @@ func (c *enrollCmd) prepareFleetTLS() error {
 
 func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error {
 	err := c.daemonReload(ctx)
+	if err != nil &&
+		(errors.Is(err, context.DeadlineExceeded) ||
+			errors.Is(err, context.Canceled)) {
+		return fmt.Errorf("could not reload deamon: %w", err)
+	}
 	if err == nil {
 		return nil
 	}
@@ -450,17 +462,20 @@ func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error {
 	signal := make(chan struct{})
 	backExp := backoff.NewExpBackoff(signal, 10*time.Second, 1*time.Minute)
 
-	for i := 5; i >= 0; i-- {
+	var i int
+	for ; i < 5; i++ {
 		backExp.Wait()
 		c.log.Info("Retrying to restart...")
 		err = c.daemonReload(ctx)
-		if err == nil {
+		if err == nil ||
+			errors.Is(err, context.DeadlineExceeded) ||
+			errors.Is(err, context.Canceled) {
 			break
 		}
 	}
 
 	close(signal)
-	return err
+	return fmt.Errorf("could not reload deamon after %d retries: %w", i+1, err)
 }
 
 func (c *enrollCmd) daemonReload(ctx context.Context) error {
@@ -478,8 +493,20 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[
 
 	c.log.Infof("Starting enrollment to URL: %s", c.client.URI())
 	err := c.enroll(ctx, persistentConfig)
+	if err == nil {
+		return nil
+	}
+
+	const deadline = 10 * time.Minute
+	const frequency = 60 * time.Second
+
+	c.log.Infof("1st enrollment attempt failed, retrying for %s, every %s enrolling to URL: %s",
+		deadline,
+		frequency,
+		c.client.URI())
 	signal := make(chan struct{})
-	backExp := backoff.NewExpBackoff(signal, 60*time.Second, 10*time.Minute)
+	defer close(signal)
+	backExp := backoff.NewExpBackoff(signal, frequency, deadline)
 
 	for {
 		retry := false
@@ -498,7 +525,6 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[
 		err = c.enroll(ctx, persistentConfig)
 	}
 
-	close(signal)
 	return err
 }
 
@@ -547,8 +573,10 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte
 			c.options.FleetServer.ElasticsearchInsecure,
 		)
 		if err != nil {
-			return err
+			return fmt.Errorf(
+				"failed creating fleet-server bootstrap config: %w", err)
 		}
+
 		// no longer need bootstrap at this point
 		serverConfig.Server.Bootstrap = false
 		fleetConfig.Server = serverConfig.Server
@@ -568,11 +596,11 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte
 
 	reader, err := yamlToReader(configToStore)
 	if err != nil {
-		return err
+		return fmt.Errorf("yamlToReader failed: %w", err)
 	}
 
 	if err := safelyStoreAgentInfo(c.configStore, reader); err != nil {
-		return err
+		return fmt.Errorf("failed to store agent config: %w", err)
 	}
 
 	// clear action store
diff --git a/internal/pkg/agent/cmd/enroll_cmd_test.go b/internal/pkg/agent/cmd/enroll_cmd_test.go
index 189ad7b6563..426924273d1 100644
--- a/internal/pkg/agent/cmd/enroll_cmd_test.go
+++ b/internal/pkg/agent/cmd/enroll_cmd_test.go
@@ -16,8 +16,11 @@ import (
 	"os"
 	"runtime"
 	"strconv"
+	"strings"
 	"testing"
+	"time"
 
+	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 
 	"github.com/elastic/elastic-agent/internal/pkg/agent/configuration"
@@ -159,14 +162,24 @@ func TestEnroll(t *testing.T) {
 			require.NoError(t, err)
 
 			streams, _, _, _ := cli.NewTestingIOStreams()
-			err = cmd.Execute(context.Background(), streams)
-			require.NoError(t, err)
+			ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
+			defer cancel()
+			err = cmd.Execute(ctx, streams)
+
+			if err != nil &&
+				// There is no agent running, therefore nothing to be restarted.
+				// However, this will cause the Enroll command to return an error
+				// which we'll ignore here.
+				!strings.Contains(err.Error(),
+					"could not reload agent daemon, unable to trigger restart") {
+				t.Fatalf("enrrol coms returned and unexpected error: %v", err)
+			}
 
 			config, err := readConfig(store.Content)
-
 			require.NoError(t, err)
-			require.Equal(t, "my-access-api-key", config.AccessAPIKey)
-			require.Equal(t, host, config.Client.Host)
+
+			assert.Equal(t, "my-access-api-key", config.AccessAPIKey)
+			assert.Equal(t, host, config.Client.Host)
 		},
 	))
 
@@ -216,16 +229,24 @@ func TestEnroll(t *testing.T) {
 			require.NoError(t, err)
 
 			streams, _, _, _ := cli.NewTestingIOStreams()
-			err = cmd.Execute(context.Background(), streams)
-			require.NoError(t, err)
-
-			require.True(t, store.Called)
-
+			ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
+			defer cancel()
+			err = cmd.Execute(ctx, streams)
+			if err != nil &&
+				// There is no agent running, therefore nothing to be restarted.
+				// However, this will cause the Enroll command to return an error
+				// which we'll ignore here.
+				!strings.Contains(err.Error(),
+					"could not reload agent daemon, unable to trigger restart") {
+				t.Fatalf("enrrol coms returned and unexpected error: %v", err)
+			}
+
+			assert.True(t, store.Called)
 			config, err := readConfig(store.Content)
 
-			require.NoError(t, err)
-			require.Equal(t, "my-access-api-key", config.AccessAPIKey)
-			require.Equal(t, host, config.Client.Host)
+			assert.NoError(t, err)
+			assert.Equal(t, "my-access-api-key", config.AccessAPIKey)
+			assert.Equal(t, host, config.Client.Host)
 		},
 	))
 
@@ -275,16 +296,24 @@ func TestEnroll(t *testing.T) {
 			require.NoError(t, err)
 
 			streams, _, _, _ := cli.NewTestingIOStreams()
-			err = cmd.Execute(context.Background(), streams)
-			require.NoError(t, err)
-
-			require.True(t, store.Called)
-
+			ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
+			defer cancel()
+			err = cmd.Execute(ctx, streams)
+
+			if err != nil &&
+				// There is no agent running, therefore nothing to be restarted.
+				// However, this will cause the Enroll command to return an error
+				// which we'll ignore here.
+				!strings.Contains(err.Error(),
+					"could not reload agent daemon, unable to trigger restart") {
+				t.Fatalf("enrrol coms returned and unexpected error: %v", err)
+			}
+
+			assert.True(t, store.Called)
 			config, err := readConfig(store.Content)
-
 			require.NoError(t, err)
-			require.Equal(t, "my-access-api-key", config.AccessAPIKey)
-			require.Equal(t, host, config.Client.Host)
+			assert.Equal(t, "my-access-api-key", config.AccessAPIKey)
+			assert.Equal(t, host, config.Client.Host)
 		},
 	))
 
diff --git a/internal/pkg/agent/cmd/install.go b/internal/pkg/agent/cmd/install.go
index 4fbd37f40da..2cb46bd599d 100644
--- a/internal/pkg/agent/cmd/install.go
+++ b/internal/pkg/agent/cmd/install.go
@@ -154,7 +154,7 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error {
 				return fmt.Errorf("problem reading prompt response")
 			}
 			if url == "" {
-				fmt.Fprintf(streams.Out, "Enrollment cancelled because no URL was provided.\n")
+				fmt.Fprintln(streams.Out, "Enrollment cancelled because no URL was provided.")
 				return nil
 			}
 		}
@@ -224,6 +224,8 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error {
 				}
 			}()
 		}
+
+		fmt.Fprintln(streams.Out, "Elastic Agent successfully installed, starting enrollment.")
 	}
 
 	if enroll {
diff --git a/internal/pkg/agent/install/perms_unix.go b/internal/pkg/agent/install/perms_unix.go
index e84dcd5039c..fc357fd4fde 100644
--- a/internal/pkg/agent/install/perms_unix.go
+++ b/internal/pkg/agent/install/perms_unix.go
@@ -8,6 +8,7 @@ package install
 
 import (
 	"errors"
+	"fmt"
 	"io/fs"
 	"os"
 	"path/filepath"
@@ -18,19 +19,26 @@ func fixPermissions(topPath string) error {
 	return recursiveRootPermissions(topPath)
 }
 
-func recursiveRootPermissions(path string) error {
-	return filepath.Walk(path, func(name string, info fs.FileInfo, err error) error {
-		if err == nil {
-			// all files should be owned by root:root
-			err = os.Chown(name, 0, 0)
-			if err != nil {
-				return err
-			}
-			// remove any world permissions from the file
-			err = os.Chmod(name, info.Mode().Perm()&0770)
-		} else if errors.Is(err, fs.ErrNotExist) {
+func recursiveRootPermissions(root string) error {
+	return filepath.Walk(root, func(path string, info fs.FileInfo, err error) error {
+		if errors.Is(err, fs.ErrNotExist) {
 			return nil
 		}
-		return err
+		if err != nil {
+			return fmt.Errorf("walk on %q failed: %w", path, err)
+		}
+
+		// all files should be owned by root:root
+		err = os.Chown(path, 0, 0)
+		if err != nil {
+			return fmt.Errorf("could not fix ownership of %q: %w", path, err)
+		}
+		// remove any world permissions from the file
+		err = os.Chmod(path, info.Mode().Perm()&0770)
+		if err != nil {
+			return fmt.Errorf("could not fix permissions of %q: %w", path, err)
+		}
+
+		return nil
 	})
 }