From 2201b6d5499ac84a35b271a9bc5898fb4da3d1df Mon Sep 17 00:00:00 2001 From: Anderson Queiroz Date: Thu, 5 Oct 2023 15:35:44 +0200 Subject: [PATCH 1/5] refactor testing/integration and pkg/testing (#3378) * pkg/testing/tools - split the 'tools' into smaller packages with well defined scope: 'estools', 'fleettools' and 'check' - adjust method names for consistency * testing/integration: - make diagnostics to require to be run on a remote host - add more context to errors - reorder functions on endpoint_security_test so tests are the first functions and when the test fails because the 'Endpoint' directory isn't removed, it now prints the content of the directory --- pkg/component/runtime/manager_test.go | 91 ++++++----- pkg/testing/fixture_install.go | 14 +- pkg/testing/tools/artifacts_api.go | 6 +- pkg/testing/tools/check/check.go | 29 +++- pkg/testing/tools/cmd.go | 32 ---- .../tools/{ => estools}/elasticsearch.go | 2 +- .../tools/{agents.go => fleettools/fleet.go} | 47 ++---- pkg/testing/tools/tools.go | 69 ++++---- testing/integration/diagnostics_test.go | 4 +- testing/integration/endpoint_security_test.go | 152 ++++++++++-------- testing/integration/fqdn_test.go | 17 +- testing/integration/monitoring_logs_test.go | 24 +-- testing/integration/proxy_url_test.go | 12 +- testing/integration/upgrade_fleet_test.go | 15 +- 14 files changed, 263 insertions(+), 251 deletions(-) delete mode 100644 pkg/testing/tools/cmd.go rename pkg/testing/tools/{ => estools}/elasticsearch.go (99%) rename pkg/testing/tools/{agents.go => fleettools/fleet.go} (82%) diff --git a/pkg/component/runtime/manager_test.go b/pkg/component/runtime/manager_test.go index d00df10db64..cbce15b13c5 100644 --- a/pkg/component/runtime/manager_test.go +++ b/pkg/component/runtime/manager_test.go @@ -1371,52 +1371,59 @@ func TestManager_FakeInput_GoodUnitToBad(t *testing.T) { t.Logf("component state changed: %+v", state) if state.State == client.UnitStateFailed { subErrCh <- fmt.Errorf("component failed: %s", state.Message) + return + } + + unit, ok := state.Units[ComponentUnitKey{UnitType: client.UnitTypeInput, UnitID: "good-input"}] + if !ok { + subErrCh <- errors.New("unit missing: good-input") + return + } + if unitGood { + switch unit.State { + case client.UnitStateFailed: + subErrCh <- fmt.Errorf("unit failed: %s", unit.Message) + return + case client.UnitStateHealthy: + // good unit it; now make it bad + t.Logf("marking good-input as having a hard-error for config") + updatedComp := comp + updatedComp.Units = make([]component.Unit, len(comp.Units)) + copy(updatedComp.Units, comp.Units) + updatedComp.Units[1] = component.Unit{ + ID: "good-input", + Type: client.UnitTypeInput, + Err: errors.New("hard-error for config"), + } + unitGood = false + err := m.Update(component.Model{Components: []component.Component{updatedComp}}) + if err != nil { + subErrCh <- err + } + case client.UnitStateStarting: + // acceptable + default: + // unknown state that should not have occurred + subErrCh <- fmt.Errorf("unit reported unexpected state: %v", unit.State) + } } else { - unit, ok := state.Units[ComponentUnitKey{UnitType: client.UnitTypeInput, UnitID: "good-input"}] - if ok { - if unitGood { - if unit.State == client.UnitStateFailed { - subErrCh <- fmt.Errorf("unit failed: %s", unit.Message) - } else if unit.State == client.UnitStateHealthy { - // good unit it; now make it bad - t.Logf("marking good-input as having a hard-error for config") - updatedComp := comp - updatedComp.Units = make([]component.Unit, len(comp.Units)) - copy(updatedComp.Units, comp.Units) - updatedComp.Units[1] = component.Unit{ - ID: "good-input", - Type: client.UnitTypeInput, - Err: errors.New("hard-error for config"), - } - unitGood = false - err := m.Update(component.Model{Components: []component.Component{updatedComp}}) - if err != nil { - subErrCh <- err - } - } else if unit.State == client.UnitStateStarting { - // acceptable - } else { - // unknown state that should not have occurred - subErrCh <- fmt.Errorf("unit reported unexpected state: %v", unit.State) - } - } else { - if unit.State == client.UnitStateFailed { - // went to failed; stop whole component - err := m.Update(component.Model{Components: []component.Component{}}) - if err != nil { - subErrCh <- err - } - } else if unit.State == client.UnitStateStopped { - // unit was stopped - subErrCh <- nil - } else { - subErrCh <- errors.New("good-input unit should be either failed or stopped") - } + switch unit.State { + case client.UnitStateFailed: + // went to failed; stop whole component + err := m.Update(component.Model{Components: []component.Component{}}) + if err != nil { + subErrCh <- err } - } else { - subErrCh <- errors.New("unit missing: good-input") + case client.UnitStateStopped: + // unit was stopped + subErrCh <- nil + default: + subErrCh <- fmt.Errorf( + "good-input unit should be either failed or stopped, but it's '%s'", + unit.State) } } + } } }() diff --git a/pkg/testing/fixture_install.go b/pkg/testing/fixture_install.go index 2e515213533..9ec684a811b 100644 --- a/pkg/testing/fixture_install.go +++ b/pkg/testing/fixture_install.go @@ -84,8 +84,12 @@ func (i InstallOpts) toCmdArgs() []string { return args } -// Install installs the prepared Elastic Agent binary and returns: -// - the combined output of stdout and stderr +// Install installs the prepared Elastic Agent binary and registers a t.Cleanup +// function to uninstall the agent if it hasn't been uninstalled. It also takes +// care of collecting a diagnostics when AGENT_COLLECT_DIAG=true or the test +// has failed. +// It returns: +// - the combined output of Install command stdout and stderr // - an error if any. func (f *Fixture) Install(ctx context.Context, installOpts *InstallOpts, opts ...process.CmdOption) ([]byte, error) { f.t.Logf("[test %s] Inside fixture install function", f.t.Name()) @@ -134,12 +138,12 @@ func (f *Fixture) Install(ctx context.Context, installOpts *InstallOpts, opts .. // environment variable AGENT_KEEP_INSTALLED=true will skip the uninstall // useful to debug the issue with the Elastic Agent - if f.t.Failed() && keepInstalled() { + if f.t.Failed() && keepInstalledFlag() { f.t.Logf("skipping uninstall; test failed and AGENT_KEEP_INSTALLED=true") return } - if keepInstalled() { + if keepInstalledFlag() { f.t.Logf("ignoring AGENT_KEEP_INSTALLED=true as test succeeded, " + "keeping the agent installed will jeopardise other tests") } @@ -327,7 +331,7 @@ func collectDiagFlag() bool { return v } -func keepInstalled() bool { +func keepInstalledFlag() bool { // failure reports false (ignore error) v, _ := strconv.ParseBool(os.Getenv("AGENT_KEEP_INSTALLED")) return v diff --git a/pkg/testing/tools/artifacts_api.go b/pkg/testing/tools/artifacts_api.go index 84fdf226df1..0e91d212af3 100644 --- a/pkg/testing/tools/artifacts_api.go +++ b/pkg/testing/tools/artifacts_api.go @@ -135,7 +135,7 @@ type ArtifactAPIClient struct { url string } -// Creates a new Artifact API client +// NewArtifactAPIClient creates a new Artifact API client func NewArtifactAPIClient(opts ...ArtifactAPIClientOpt) *ArtifactAPIClient { c := &ArtifactAPIClient{ url: defaultArtifactAPIURL, @@ -204,7 +204,7 @@ func (aac ArtifactAPIClient) GetBuildDetails(ctx context.Context, version string return checkResponseAndUnmarshal[BuildDetails](resp) } -func (aac *ArtifactAPIClient) composeURL(relativePath string) (string, error) { +func (aac ArtifactAPIClient) composeURL(relativePath string) (string, error) { joinedURL, err := url.JoinPath(aac.url, relativePath) if err != nil { return "", fmt.Errorf("composing URL with %q %q: %w", aac.url, relativePath, err) @@ -213,7 +213,7 @@ func (aac *ArtifactAPIClient) composeURL(relativePath string) (string, error) { return joinedURL, nil } -func (aac *ArtifactAPIClient) createAndPerformRequest(ctx context.Context, URL string) (*http.Response, error) { +func (aac ArtifactAPIClient) createAndPerformRequest(ctx context.Context, URL string) (*http.Response, error) { req, err := http.NewRequestWithContext(ctx, http.MethodGet, URL, nil) if err != nil { err = fmt.Errorf("composing request: %w", err) diff --git a/pkg/testing/tools/check/check.go b/pkg/testing/tools/check/check.go index b5815fce255..03f95b3e17e 100644 --- a/pkg/testing/tools/check/check.go +++ b/pkg/testing/tools/check/check.go @@ -11,14 +11,16 @@ import ( "github.com/stretchr/testify/assert" + "github.com/elastic/elastic-agent-libs/kibana" "github.com/elastic/elastic-agent/pkg/control/v2/cproto" integrationtest "github.com/elastic/elastic-agent/pkg/testing" + "github.com/elastic/elastic-agent/pkg/testing/tools/fleettools" ) // ConnectedToFleet checks if the agent defined in the fixture is connected to // Fleet Server. It uses assert.Eventually and if it fails the last error will // be printed. It returns if the agent is connected to Fleet Server or not. -func ConnectedToFleet(t *testing.T, fixture *integrationtest.Fixture) bool { +func ConnectedToFleet(t *testing.T, fixture *integrationtest.Fixture, timeout time.Duration) bool { t.Helper() var err error @@ -28,7 +30,7 @@ func ConnectedToFleet(t *testing.T, fixture *integrationtest.Fixture) bool { return agentStatus.FleetState == int(cproto.State_HEALTHY) } - connected := assert.Eventually(t, assertFn, 5*time.Minute, 5*time.Second, + connected := assert.Eventually(t, assertFn, timeout, 5*time.Second, "want fleet state %s, got %s. agent status: %v", cproto.State_HEALTHY, cproto.State(agentStatus.FleetState), agentStatus) @@ -39,3 +41,26 @@ func ConnectedToFleet(t *testing.T, fixture *integrationtest.Fixture) bool { return connected } + +// FleetAgentStatus returns a niladic function that returns true if the agent +// has reached expectedStatus; false otherwise. The returned function is intended +// for use with assert.Eventually or require.Eventually. +func FleetAgentStatus(t *testing.T, + client *kibana.Client, + policyID, + expectedStatus string) func() bool { + return func() bool { + currentStatus, err := fleettools.GetAgentStatus(client, policyID) + if err != nil { + t.Errorf("unable to determine agent status: %s", err.Error()) + return false + } + + if currentStatus == expectedStatus { + return true + } + + t.Logf("Agent fleet status: %s", currentStatus) + return false + } +} diff --git a/pkg/testing/tools/cmd.go b/pkg/testing/tools/cmd.go deleted file mode 100644 index 9277c3ea5c6..00000000000 --- a/pkg/testing/tools/cmd.go +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one -// or more contributor license agreements. Licensed under the Elastic License; -// you may not use this file except in compliance with the Elastic License. - -package tools - -import ( - "context" - "time" - - atesting "github.com/elastic/elastic-agent/pkg/testing" -) - -// InstallAgent force install the Elastic Agent through agentFixture. -func InstallAgent(ctx context.Context, installOpts atesting.InstallOpts, agentFixture *atesting.Fixture) ([]byte, error) { - // 2 minute timeout, to ensure that it at least doesn't get stuck. - ctx, cancel := context.WithTimeout(ctx, 2*time.Minute) - defer cancel() - return agentFixture.Install(ctx, &installOpts) -} - -// InstallStandaloneAgent force install the Elastic Agent through agentFixture. -func InstallStandaloneAgent(ctx context.Context, agentFixture *atesting.Fixture) ([]byte, error) { - // 2 minute timeout, to ensure that it at least doesn't get stuck. - ctx, cancel := context.WithTimeout(ctx, 2*time.Minute) - defer cancel() - installOpts := atesting.InstallOpts{ - NonInteractive: true, - Force: true, - } - return agentFixture.Install(ctx, &installOpts) -} diff --git a/pkg/testing/tools/elasticsearch.go b/pkg/testing/tools/estools/elasticsearch.go similarity index 99% rename from pkg/testing/tools/elasticsearch.go rename to pkg/testing/tools/estools/elasticsearch.go index 0d0b040d4dd..8cd6e126597 100644 --- a/pkg/testing/tools/elasticsearch.go +++ b/pkg/testing/tools/estools/elasticsearch.go @@ -2,7 +2,7 @@ // or more contributor license agreements. Licensed under the Elastic License; // you may not use this file except in compliance with the Elastic License. -package tools +package estools import ( "bytes" diff --git a/pkg/testing/tools/agents.go b/pkg/testing/tools/fleettools/fleet.go similarity index 82% rename from pkg/testing/tools/agents.go rename to pkg/testing/tools/fleettools/fleet.go index e440a927846..baa1ca659c0 100644 --- a/pkg/testing/tools/agents.go +++ b/pkg/testing/tools/fleettools/fleet.go @@ -2,21 +2,15 @@ // or more contributor license agreements. Licensed under the Elastic License; // you may not use this file except in compliance with the Elastic License. -package tools +package fleettools import ( "context" "errors" "fmt" "os" - "testing" - "time" - - "github.com/stretchr/testify/require" "github.com/elastic/elastic-agent-libs/kibana" - "github.com/elastic/elastic-agent/pkg/control/v2/client" - "github.com/elastic/elastic-agent/pkg/control/v2/cproto" ) // GetAgentByPolicyIDAndHostnameFromList get an agent by the local_metadata.host.name property, reading from the agents list @@ -47,6 +41,14 @@ func GetAgentByPolicyIDAndHostnameFromList(client *kibana.Client, policyID, host return hostnameAgents[0], nil } +func GetAgentIDByHostname(client *kibana.Client, policyID, hostname string) (string, error) { + agent, err := GetAgentByPolicyIDAndHostnameFromList(client, policyID, hostname) + if err != nil { + return "", err + } + return agent.Agent.ID, nil +} + func GetAgentStatus(client *kibana.Client, policyID string) (string, error) { hostname, err := os.Hostname() if err != nil { @@ -97,15 +99,8 @@ func UnEnrollAgent(client *kibana.Client, policyID string) error { return nil } -func GetAgentIDByHostname(client *kibana.Client, policyID, hostname string) (string, error) { - agent, err := GetAgentByPolicyIDAndHostnameFromList(client, policyID, hostname) - if err != nil { - return "", err - } - return agent.Agent.ID, nil -} - func UpgradeAgent(client *kibana.Client, policyID, version string, force bool) error { + // TODO: fix me: this does not work if FQDN is enabled hostname, err := os.Hostname() if err != nil { return err @@ -128,7 +123,7 @@ func UpgradeAgent(client *kibana.Client, policyID, version string, force bool) e return nil } -func GetDefaultFleetServerURL(client *kibana.Client) (string, error) { +func DefaultURL(client *kibana.Client) (string, error) { req := kibana.ListFleetServerHostsRequest{} resp, err := client.ListFleetServerHosts(context.Background(), req) if err != nil { @@ -144,23 +139,5 @@ func GetDefaultFleetServerURL(client *kibana.Client) (string, error) { } } - return "", errors.New("unable to determine default fleet server host") -} - -func WaitForAgent(ctx context.Context, t *testing.T, c client.Client) { - require.Eventually(t, func() bool { - err := c.Connect(ctx) - if err != nil { - t.Logf("connecting client to agent: %v", err) - return false - } - defer c.Disconnect() - state, err := c.State(ctx) - if err != nil { - t.Logf("error getting the agent state: %v", err) - return false - } - t.Logf("agent state: %+v", state) - return state.State == cproto.State_HEALTHY - }, 2*time.Minute, 10*time.Second, "Agent never became healthy") + return "", errors.New("unable to determine default fleet server URL") } diff --git a/pkg/testing/tools/tools.go b/pkg/testing/tools/tools.go index 6972d5228f6..666657a5795 100644 --- a/pkg/testing/tools/tools.go +++ b/pkg/testing/tools/tools.go @@ -10,50 +10,39 @@ import ( "testing" "time" + "github.com/stretchr/testify/assert" + "github.com/elastic/elastic-agent-libs/kibana" atesting "github.com/elastic/elastic-agent/pkg/testing" - - "github.com/stretchr/testify/require" + "github.com/elastic/elastic-agent/pkg/testing/tools/check" + "github.com/elastic/elastic-agent/pkg/testing/tools/fleettools" ) -// WaitForAgentStatus returns a niladic function that returns true if the agent -// has reached expectedStatus; false otherwise. The returned function is intended -// for use with assert.Eventually or require.Eventually. -func WaitForAgentStatus(t *testing.T, client *kibana.Client, policyID string, expectedStatus string) func() bool { - return func() bool { - currentStatus, err := GetAgentStatus(client, policyID) - if err != nil { - t.Errorf("unable to determine agent status: %s", err.Error()) - return false - } - - if currentStatus == expectedStatus { - return true - } - - t.Logf("Agent status: %s", currentStatus) - return false - } -} - -// WaitForPolicyRevision returns a niladic function that returns true if the +// IsPolicyRevision returns a niladic function that returns true if the // given agent's policy revision has reached the given policy revision; false // otherwise. The returned function is intended // for use with assert.Eventually or require.Eventually. -func WaitForPolicyRevision(t *testing.T, client *kibana.Client, agentID string, expectedPolicyRevision int) func() bool { +func IsPolicyRevision(t *testing.T, client *kibana.Client, agentID string, policyRevision int) func() bool { return func() bool { getAgentReq := kibana.GetAgentRequest{ID: agentID} updatedPolicyAgent, err := client.GetAgent(context.Background(), getAgentReq) - require.NoError(t, err) + if err != nil { + t.Logf("failed to get agent document to check policy revision: %v", err) + return false + } - return updatedPolicyAgent.PolicyRevision == expectedPolicyRevision + return updatedPolicyAgent.PolicyRevision == policyRevision } } // InstallAgentWithPolicy creates the given policy, enrolls the given agent // fixture in Fleet using the default Fleet Server, waits for the agent to be // online, and returns the created policy. -func InstallAgentWithPolicy(ctx context.Context, t *testing.T, installOpts atesting.InstallOpts, agentFixture *atesting.Fixture, kibClient *kibana.Client, createPolicyReq kibana.AgentPolicy) (kibana.PolicyResponse, error) { +func InstallAgentWithPolicy(ctx context.Context, t *testing.T, + installOpts atesting.InstallOpts, + agentFixture *atesting.Fixture, + kibClient *kibana.Client, + createPolicyReq kibana.AgentPolicy) (kibana.PolicyResponse, error) { t.Helper() // Create policy @@ -110,7 +99,7 @@ func InstallAgentForPolicy(ctx context.Context, t *testing.T, } // Get default Fleet Server URL - fleetServerURL, err := GetDefaultFleetServerURL(kibClient) + fleetServerURL, err := fleettools.DefaultURL(kibClient) if err != nil { return fmt.Errorf("unable to get default Fleet Server URL: %w", err) } @@ -121,7 +110,8 @@ func InstallAgentForPolicy(ctx context.Context, t *testing.T, URL: fleetServerURL, EnrollmentToken: enrollmentToken.APIKey, } - output, err := InstallAgent(ctx, installOpts, agentFixture) + + output, err := agentFixture.Install(ctx, &installOpts) if err != nil { t.Log(string(output)) return fmt.Errorf("unable to enroll Elastic Agent: %w", err) @@ -132,10 +122,18 @@ func InstallAgentForPolicy(ctx context.Context, t *testing.T, if deadline, ok := ctx.Deadline(); ok { timeout = time.Until(deadline) } + + assert.Eventually( + t, + check.FleetAgentStatus(t, kibClient, policyID, "online"), + timeout, + 10*time.Second, + "Elastic Agent status is not online", + ) // Wait for Agent to be healthy - require.Eventually( + assert.Eventually( t, - WaitForAgentStatus(t, kibClient, policyID, "online"), + check.FleetAgentStatus(t, kibClient, policyID, "online"), timeout, 10*time.Second, "Elastic Agent status is not online", @@ -143,3 +141,12 @@ func InstallAgentForPolicy(ctx context.Context, t *testing.T, return nil } + +// InstallStandaloneAgent force install the Elastic Agent through agentFixture. +func InstallStandaloneAgent(agentFixture *atesting.Fixture) ([]byte, error) { + installOpts := atesting.InstallOpts{ + NonInteractive: true, + Force: true, + } + return agentFixture.Install(context.Background(), &installOpts) +} diff --git a/testing/integration/diagnostics_test.go b/testing/integration/diagnostics_test.go index 9b181c0075d..00d8d97aed0 100644 --- a/testing/integration/diagnostics_test.go +++ b/testing/integration/diagnostics_test.go @@ -88,7 +88,7 @@ type componentAndUnitNames struct { func TestDiagnosticsOptionalValues(t *testing.T) { define.Require(t, define.Requirements{ - Local: true, + Local: false, }) fixture, err := define.NewFixture(t, define.Version()) @@ -113,7 +113,7 @@ func TestDiagnosticsOptionalValues(t *testing.T) { func TestDiagnosticsCommand(t *testing.T) { define.Require(t, define.Requirements{ - Local: true, + Local: false, }) f, err := define.NewFixture(t, define.Version()) diff --git a/testing/integration/endpoint_security_test.go b/testing/integration/endpoint_security_test.go index 04ff8fbedb4..f3943604c1d 100644 --- a/testing/integration/endpoint_security_test.go +++ b/testing/integration/endpoint_security_test.go @@ -19,11 +19,10 @@ import ( "text/template" "time" + "github.com/google/uuid" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/google/uuid" - "github.com/elastic/elastic-agent-libs/kibana" "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" "github.com/elastic/elastic-agent/pkg/control/v2/client" @@ -31,6 +30,7 @@ import ( atesting "github.com/elastic/elastic-agent/pkg/testing" "github.com/elastic/elastic-agent/pkg/testing/define" "github.com/elastic/elastic-agent/pkg/testing/tools" + "github.com/elastic/elastic-agent/pkg/testing/tools/fleettools" "github.com/elastic/elastic-agent/pkg/testing/tools/testcontext" ) @@ -90,6 +90,60 @@ func TestInstallAndCLIUninstallWithEndpointSecurity(t *testing.T) { } } +// Tests that the agent can install and uninstall the endpoint-security service while remaining +// healthy. In this case endpoint-security is uninstalled because the agent was unenrolled, which +// triggers the creation of an empty agent policy removing all inputs (only when not force +// unenrolling). The empty agent policy triggers the uninstall of endpoint because endpoint was +// removed from the policy. +// +// Like the CLI uninstall test, the agent is uninstalled from the command line at the end of the test +// but at this point endpoint is already uninstalled. +func TestInstallAndUnenrollWithEndpointSecurity(t *testing.T) { + info := define.Require(t, define.Requirements{ + Stack: &define.Stack{}, + Local: false, // requires Agent installation + Isolate: false, + Sudo: true, // requires Agent installation + OS: []define.OS{ + {Type: define.Linux}, + }, + }) + + for _, tc := range protectionTests { + t.Run(tc.name, func(t *testing.T) { + testInstallAndUnenrollWithEndpointSecurity(t, info, tc.protected) + }) + } +} + +// Tests that the agent can install and uninstall the endpoint-security service +// after the Elastic Defend integration was removed from the policy +// while remaining healthy. +// +// Installing endpoint-security requires a Fleet managed agent with the Elastic Defend integration +// installed. The endpoint-security service is uninstalled the Elastic Defend integration was removed from the policy. +// +// Like the CLI uninstall test, the agent is uninstalled from the command line at the end of the test +// but at this point endpoint should be already uninstalled. + +func TestInstallWithEndpointSecurityAndRemoveEndpointIntegration(t *testing.T) { + info := define.Require(t, define.Requirements{ + Stack: &define.Stack{}, + Local: false, // requires Agent installation + Isolate: false, + Sudo: true, // requires Agent installation + OS: []define.OS{ + {Type: define.Linux}, + }, + }) + + for _, tc := range protectionTests { + t.Run(tc.name, func(t *testing.T) { + testInstallWithEndpointSecurityAndRemoveEndpointIntegration(t, info, tc.protected) + }) + } +} + // buildPolicyWithTamperProtection helper function to build the policy request with or without tamper protection func buildPolicyWithTamperProtection(policy kibana.AgentPolicy, protected bool) kibana.AgentPolicy { if protected { @@ -109,7 +163,7 @@ func testInstallAndCLIUninstallWithEndpointSecurity(t *testing.T, info *define.I // Get path to agent executable. fixture, err := define.NewFixture(t, define.Version()) - require.NoError(t, err) + require.NoError(t, err, "could not create agent fixture") t.Log("Enrolling the agent in Fleet") policyUUID := uuid.New().String() @@ -138,7 +192,7 @@ func testInstallAndCLIUninstallWithEndpointSecurity(t *testing.T, info *define.I t.Cleanup(func() { t.Log("Un-enrolling Elastic Agent...") - assert.NoError(t, tools.UnEnrollAgent(info.KibanaClient, policy.ID)) + assert.NoError(t, fleettools.UnEnrollAgent(info.KibanaClient, policy.ID)) }) t.Log("Installing Elastic Defend") @@ -151,7 +205,7 @@ func testInstallAndCLIUninstallWithEndpointSecurity(t *testing.T, info *define.I agentClient := fixture.Client() err = agentClient.Connect(ctx) - require.NoError(t, err) + require.NoError(t, err, "could not connect to local agent") require.Eventually(t, func() bool { return agentAndEndpointAreHealthy(t, ctx, agentClient) }, @@ -162,32 +216,6 @@ func testInstallAndCLIUninstallWithEndpointSecurity(t *testing.T, info *define.I t.Log("Verified endpoint component and units are healthy") } -// Tests that the agent can install and uninstall the endpoint-security service while remaining -// healthy. In this case endpoint-security is uninstalled because the agent was unenrolled, which -// triggers the creation of an empty agent policy removing all inputs (only when not force -// unenrolling). The empty agent policy triggers the uninstall of endpoint because endpoint was -// removed from the policy. -// -// Like the CLI uninstall test, the agent is uninstalled from the command line at the end of the test -// but at this point endpoint is already uninstalled. -func TestInstallAndUnenrollWithEndpointSecurity(t *testing.T) { - info := define.Require(t, define.Requirements{ - Stack: &define.Stack{}, - Local: false, // requires Agent installation - Isolate: false, - Sudo: true, // requires Agent installation - OS: []define.OS{ - {Type: define.Linux}, - }, - }) - - for _, tc := range protectionTests { - t.Run(tc.name, func(t *testing.T) { - testInstallAndUnenrollWithEndpointSecurity(t, info, tc.protected) - }) - } -} - func testInstallAndUnenrollWithEndpointSecurity(t *testing.T, info *define.Info, protected bool) { // Get path to agent executable. fixture, err := define.NewFixture(t, define.Version()) @@ -244,7 +272,7 @@ func testInstallAndUnenrollWithEndpointSecurity(t *testing.T, info *define.Info, hostname, err := os.Hostname() require.NoError(t, err) - agentID, err := tools.GetAgentIDByHostname(info.KibanaClient, policy.ID, hostname) + agentID, err := fleettools.GetAgentIDByHostname(info.KibanaClient, policy.ID, hostname) require.NoError(t, err) _, err = info.KibanaClient.UnEnrollAgent(ctx, kibana.UnEnrollAgentRequest{ID: agentID}) @@ -299,34 +327,6 @@ func testInstallAndUnenrollWithEndpointSecurity(t *testing.T, info *define.Info, } } -// Tests that the agent can install and uninstall the endpoint-security service -// after the Elastic Defend integration was removed from the policy -// while remaining healthy. -// -// Installing endpoint-security requires a Fleet managed agent with the Elastic Defend integration -// installed. The endpoint-security service is uninstalled the Elastic Defend integration was removed from the policy. -// -// Like the CLI uninstall test, the agent is uninstalled from the command line at the end of the test -// but at this point endpoint should be already uninstalled. - -func TestInstallWithEndpointSecurityAndRemoveEndpointIntegration(t *testing.T) { - info := define.Require(t, define.Requirements{ - Stack: &define.Stack{}, - Local: false, // requires Agent installation - Isolate: false, - Sudo: true, // requires Agent installation - OS: []define.OS{ - {Type: define.Linux}, - }, - }) - - for _, tc := range protectionTests { - t.Run(tc.name, func(t *testing.T) { - testInstallWithEndpointSecurityAndRemoveEndpointIntegration(t, info, tc.protected) - }) - } -} - func testInstallWithEndpointSecurityAndRemoveEndpointIntegration(t *testing.T, info *define.Info, protected bool) { // Get path to agent executable. fixture, err := define.NewFixture(t, define.Version()) @@ -404,7 +404,29 @@ func testInstallWithEndpointSecurityAndRemoveEndpointIntegration(t *testing.T, i } t.Log("Found directory", f.Name()) - require.False(t, strings.Contains(f.Name(), "Endpoint"), "Endpoint directory was not removed") + // If Endpoint was not currently removed, let's see what was left + if strings.Contains(f.Name(), "Endpoint") { + info, err := f.Info() + if err != nil { + t.Logf("could not get file info for %q to check what was left"+ + "behind: %v", f.Name(), err) + } + ls, err := os.ReadDir(info.Name()) + if err != nil { + t.Logf("could not list fileson for %q to check what was left"+ + "behind: %v", f.Name(), err) + } + var dirEntries []string + for _, de := range ls { + dirEntries = append(dirEntries, de.Name()) + } + + if len(dirEntries) == 0 { + t.Fatalf("Endpoint directory was not removed, but it's empty") + } + t.Fatalf("Endpoint directory was not removed, the directory content is: %s", + strings.Join(dirEntries, ", ")) + } } } @@ -545,7 +567,7 @@ func agentAndEndpointAreHealthy(t *testing.T, ctx context.Context, agentClient c } if state.State != client.Healthy { - t.Logf("Agent is not Healthy\n%+v", state) + t.Logf("local Agent is not Healthy: current state: %+v", state) return false } @@ -554,7 +576,7 @@ func agentAndEndpointAreHealthy(t *testing.T, ctx context.Context, agentClient c for _, comp := range state.Components { isEndpointComponent := strings.Contains(comp.Name, "endpoint") if comp.State != client.Healthy { - t.Logf("Component is not Healthy\n%+v", comp) + t.Logf("endpoint component is not Healthy: current state: %+v", comp) return false } @@ -569,7 +591,7 @@ func agentAndEndpointAreHealthy(t *testing.T, ctx context.Context, agentClient c } if unit.State != client.Healthy { - t.Logf("Unit is not Healthy\n%+v", unit) + t.Logf("unit %q is not Healthy\n%+v", unit.UnitID, unit) return false } } @@ -577,7 +599,7 @@ func agentAndEndpointAreHealthy(t *testing.T, ctx context.Context, agentClient c // Ensure both the endpoint input and output units were found and healthy. if !foundEndpointInputUnit || !foundEndpointOutputUnit { - t.Logf("State did not contain endpoint units!\n%+v", state) + t.Logf("State did not contain endpoint units. state: %+v", state) return false } diff --git a/testing/integration/fqdn_test.go b/testing/integration/fqdn_test.go index d1c5142acf5..9bb1b34de89 100644 --- a/testing/integration/fqdn_test.go +++ b/testing/integration/fqdn_test.go @@ -19,16 +19,15 @@ import ( "testing" "time" - "github.com/elastic/go-elasticsearch/v8" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "github.com/elastic/elastic-agent-libs/kibana" - atesting "github.com/elastic/elastic-agent/pkg/testing" "github.com/elastic/elastic-agent/pkg/testing/define" "github.com/elastic/elastic-agent/pkg/testing/tools" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" + "github.com/elastic/elastic-agent/pkg/testing/tools/fleettools" + "github.com/elastic/go-elasticsearch/v8" ) func TestFQDN(t *testing.T) { @@ -90,7 +89,7 @@ func TestFQDN(t *testing.T) { t.Cleanup(func() { t.Log("Un-enrolling Elastic Agent...") - assert.NoError(t, tools.UnEnrollAgent(info.KibanaClient, policy.ID)) + assert.NoError(t, fleettools.UnEnrollAgent(info.KibanaClient, policy.ID)) t.Log("Restoring hostname...") err := setHostname(context.Background(), origHostname, t.Log) @@ -127,7 +126,7 @@ func TestFQDN(t *testing.T) { expectedAgentPolicyRevision := agent.PolicyRevision + 1 require.Eventually( t, - tools.WaitForPolicyRevision(t, kibClient, agent.ID, expectedAgentPolicyRevision), + tools.IsPolicyRevision(t, kibClient, agent.ID, expectedAgentPolicyRevision), 2*time.Minute, 1*time.Second, ) @@ -158,7 +157,7 @@ func TestFQDN(t *testing.T) { expectedAgentPolicyRevision++ require.Eventually( t, - tools.WaitForPolicyRevision(t, kibClient, agent.ID, expectedAgentPolicyRevision), + tools.IsPolicyRevision(t, kibClient, agent.ID, expectedAgentPolicyRevision), 2*time.Minute, 1*time.Second, ) @@ -182,7 +181,7 @@ func verifyAgentName(t *testing.T, policyID, hostname string, kibClient *kibana. require.Eventually( t, func() bool { - agent, err = tools.GetAgentByPolicyIDAndHostnameFromList(kibClient, policyID, hostname) + agent, err = fleettools.GetAgentByPolicyIDAndHostnameFromList(kibClient, policyID, hostname) return err == nil && agent != nil }, 5*time.Minute, diff --git a/testing/integration/monitoring_logs_test.go b/testing/integration/monitoring_logs_test.go index 5451b57cb61..97836c7ff3f 100644 --- a/testing/integration/monitoring_logs_test.go +++ b/testing/integration/monitoring_logs_test.go @@ -24,6 +24,8 @@ import ( "github.com/elastic/elastic-agent/pkg/testing/define" "github.com/elastic/elastic-agent/pkg/testing/tools" "github.com/elastic/elastic-agent/pkg/testing/tools/check" + "github.com/elastic/elastic-agent/pkg/testing/tools/estools" + "github.com/elastic/elastic-agent/pkg/testing/tools/fleettools" ) func TestMonitoringLogsShipped(t *testing.T) { @@ -69,11 +71,11 @@ func TestMonitoringLogsShipped(t *testing.T) { require.NoError(t, err) t.Logf("created policy: %s", policy.ID) - check.ConnectedToFleet(t, agentFixture) + check.ConnectedToFleet(t, agentFixture, 5*time.Minute) // Stage 2: check indices // This is mostly for debugging - resp, err := tools.GetAllindicies(info.ESClient) + resp, err := estools.GetAllindicies(info.ESClient) require.NoError(t, err) for _, run := range resp { t.Logf("%s: %d/%d deleted: %d\n", @@ -82,8 +84,8 @@ func TestMonitoringLogsShipped(t *testing.T) { // Stage 3: Make sure metricbeat logs are populated t.Log("Making sure metricbeat logs are populated") - docs := findESDocs(t, func() (tools.Documents, error) { - return tools.GetLogsForDatastream(info.ESClient, "elastic_agent.metricbeat") + docs := findESDocs(t, func() (estools.Documents, error) { + return estools.GetLogsForDatastream(info.ESClient, "elastic_agent.metricbeat") }) require.NotZero(t, len(docs.Hits.Hits)) t.Logf("metricbeat: Got %d documents", len(docs.Hits.Hits)) @@ -101,8 +103,8 @@ func TestMonitoringLogsShipped(t *testing.T) { // Stage 5: Make sure we have message confirming central management is running t.Log("Making sure we have message confirming central management is running") - docs = findESDocs(t, func() (tools.Documents, error) { - return tools.FindMatchingLogLines(info.ESClient, info.Namespace, + docs = findESDocs(t, func() (estools.Documents, error) { + return estools.FindMatchingLogLines(info.ESClient, info.Namespace, "Parsed configuration and determined agent is managed by Fleet") }) require.NotZero(t, len(docs.Hits.Hits)) @@ -114,7 +116,7 @@ func TestMonitoringLogsShipped(t *testing.T) { t.Fatalf("could not get hostname to filter Agent: %s", err) } - agentID, err := tools.GetAgentIDByHostname(info.KibanaClient, policy.ID, hostname) + agentID, err := fleettools.GetAgentIDByHostname(info.KibanaClient, policy.ID, hostname) require.NoError(t, err, "could not get Agent ID by hostname") t.Logf("Agent ID: %q", agentID) @@ -122,8 +124,8 @@ func TestMonitoringLogsShipped(t *testing.T) { // this field is not mapped. There is an issue for that: // https://github.com/elastic/integrations/issues/6545 - docs = findESDocs(t, func() (tools.Documents, error) { - return tools.GetLogsForAgentID(info.ESClient, agentID) + docs = findESDocs(t, func() (estools.Documents, error) { + return estools.GetLogsForAgentID(info.ESClient, agentID) }) require.NoError(t, err, "could not get logs from Agent ID: %q, err: %s", agentID, err) @@ -151,8 +153,8 @@ func TestMonitoringLogsShipped(t *testing.T) { } } -func findESDocs(t *testing.T, findFn func() (tools.Documents, error)) tools.Documents { - var docs tools.Documents +func findESDocs(t *testing.T, findFn func() (estools.Documents, error)) estools.Documents { + var docs estools.Documents require.Eventually( t, diff --git a/testing/integration/proxy_url_test.go b/testing/integration/proxy_url_test.go index f37d3db945b..e95469405be 100644 --- a/testing/integration/proxy_url_test.go +++ b/testing/integration/proxy_url_test.go @@ -130,7 +130,7 @@ func TestProxyURL_EnrollProxyAndNoProxyInThePolicy(t *testing.T) { require.NoError(t, err, "failed to install agent") } - check.ConnectedToFleet(t, p.fixture) + check.ConnectedToFleet(t, p.fixture, 5*time.Minute) } func TestProxyURL_EnrollProxyAndEmptyProxyInThePolicy(t *testing.T) { @@ -175,7 +175,7 @@ func TestProxyURL_EnrollProxyAndEmptyProxyInThePolicy(t *testing.T) { require.NoError(t, err, "failed to install agent") } - check.ConnectedToFleet(t, p.fixture) + check.ConnectedToFleet(t, p.fixture, 5*time.Minute) } func TestProxyURL_ProxyInThePolicyTakesPrecedence(t *testing.T) { @@ -220,7 +220,7 @@ func TestProxyURL_ProxyInThePolicyTakesPrecedence(t *testing.T) { require.NoError(t, err, "failed to install agent") } - check.ConnectedToFleet(t, p.fixture) + check.ConnectedToFleet(t, p.fixture, 5*time.Minute) // ensure the agent is communicating through the proxy set in the policy want := fleetservertest.NewPathCheckin(p.policyData.AgentID) @@ -282,7 +282,7 @@ func TestProxyURL_NoEnrollProxyAndProxyInThePolicy(t *testing.T) { require.NoError(t, err, "failed to install agent") } - check.ConnectedToFleet(t, p.fixture) + check.ConnectedToFleet(t, p.fixture, 5*time.Minute) // ensure the agent is communicating through the new proxy if !assert.Eventually(t, func() bool { @@ -343,7 +343,7 @@ func TestProxyURL_RemoveProxyFromThePolicy(t *testing.T) { } // assert the agent is actually connected to fleet. - check.ConnectedToFleet(t, p.fixture) + check.ConnectedToFleet(t, p.fixture, 5*time.Minute) // ensure the agent is communicating through the proxy set in the policy if !assert.Eventually(t, func() bool { @@ -389,7 +389,7 @@ func TestProxyURL_RemoveProxyFromThePolicy(t *testing.T) { assert.Equal(t, inspect.Fleet.ProxyURL, want) // assert, again, the agent is actually connected to fleet. - check.ConnectedToFleet(t, p.fixture) + check.ConnectedToFleet(t, p.fixture, 5*time.Minute) } func (p *ProxyURL) setupFleet(t *testing.T, fleetHost string) { diff --git a/testing/integration/upgrade_fleet_test.go b/testing/integration/upgrade_fleet_test.go index b638f4570d5..8ced575fc63 100644 --- a/testing/integration/upgrade_fleet_test.go +++ b/testing/integration/upgrade_fleet_test.go @@ -17,10 +17,11 @@ import ( "github.com/stretchr/testify/require" "github.com/elastic/elastic-agent-libs/kibana" + "github.com/elastic/elastic-agent/pkg/testing/tools/check" + "github.com/elastic/elastic-agent/pkg/testing/tools/fleettools" atesting "github.com/elastic/elastic-agent/pkg/testing" "github.com/elastic/elastic-agent/pkg/testing/define" - "github.com/elastic/elastic-agent/pkg/testing/tools" "github.com/elastic/elastic-agent/pkg/version" "github.com/elastic/elastic-agent/testing/upgradetest" ) @@ -105,7 +106,7 @@ func testUpgradeFleetManagedElasticAgent(ctx context.Context, t *testing.T, info require.NoError(t, err) t.Log("Getting default Fleet Server URL...") - fleetServerURL, err := tools.GetDefaultFleetServerURL(kibClient) + fleetServerURL, err := fleettools.DefaultURL(kibClient) require.NoError(t, err) t.Log("Enrolling Elastic Agent...") @@ -125,7 +126,7 @@ func testUpgradeFleetManagedElasticAgent(ctx context.Context, t *testing.T, info require.NoError(t, err, "failed to install start agent [output: %s]", string(output)) t.Cleanup(func() { t.Log("Un-enrolling Elastic Agent...") - assert.NoError(t, tools.UnEnrollAgent(info.KibanaClient, policy.ID)) + assert.NoError(t, fleettools.UnEnrollAgent(info.KibanaClient, policy.ID)) }) t.Log("Waiting for Agent to be correct version and healthy...") @@ -133,10 +134,10 @@ func testUpgradeFleetManagedElasticAgent(ctx context.Context, t *testing.T, info require.NoError(t, err) t.Log("Waiting for enrolled Agent status to be online...") - require.Eventually(t, tools.WaitForAgentStatus(t, kibClient, policy.ID, "online"), 2*time.Minute, 10*time.Second, "Agent status is not online") + require.Eventually(t, check.FleetAgentStatus(t, kibClient, policy.ID, "online"), 2*time.Minute, 10*time.Second, "Agent status is not online") t.Logf("Upgrading from version %q to version %q...", startParsedVersion, endVersionInfo.Binary.String()) - err = tools.UpgradeAgent(kibClient, policy.ID, endVersionInfo.Binary.String(), true) + err = fleettools.UpgradeAgent(kibClient, policy.ID, endVersionInfo.Binary.String(), true) require.NoError(t, err) // wait for the watcher to show up @@ -150,12 +151,12 @@ func testUpgradeFleetManagedElasticAgent(ctx context.Context, t *testing.T, info require.NoError(t, err) t.Log("Waiting for enrolled Agent status to be online...") - require.Eventually(t, tools.WaitForAgentStatus(t, kibClient, policy.ID, "online"), 10*time.Minute, 15*time.Second, "Agent status is not online") + require.Eventually(t, check.FleetAgentStatus(t, kibClient, policy.ID, "online"), 10*time.Minute, 15*time.Second, "Agent status is not online") // wait for version require.Eventually(t, func() bool { t.Log("Getting Agent version...") - newVersion, err := tools.GetAgentVersion(kibClient, policy.ID) + newVersion, err := fleettools.GetAgentVersion(kibClient, policy.ID) if err != nil { t.Logf("error getting agent version: %v", err) return false From 160b034eacb718bfbbf441d6c6f6d00a29ec4afc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Paolo=20Chil=C3=A0?= Date: Thu, 5 Oct 2023 17:00:35 +0200 Subject: [PATCH 2/5] Move integration test to ess staging (#3535) * move default ESS env to staging for integration tests * Remove region override from integration test buildkite step * change api key for staging * fixup! change api key for staging * fixup! fixup! change api key for staging --- .buildkite/hooks/pre-command | 5 +++-- .buildkite/scripts/steps/integration_tests.sh | 2 +- magefile.go | 4 ++-- pkg/testing/ess/client.go | 4 ++++ pkg/testing/ess/config.go | 2 +- 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.buildkite/hooks/pre-command b/.buildkite/hooks/pre-command index 0fe9b7e3ae6..bb4db5f0948 100755 --- a/.buildkite/hooks/pre-command +++ b/.buildkite/hooks/pre-command @@ -17,7 +17,8 @@ DOCKER_REGISTRY="docker.elastic.co" DOCKER_REGISTRY_SECRET_PATH="kv/ci-shared/platform-ingest/docker_registry_prod" CI_DRA_ROLE_PATH="kv/ci-shared/release/dra-role" CI_GCP_OBS_PATH="kv/ci-shared/observability-ingest/cloud/gcp" -CI_AGENT_QA_OBS_PATH="kv/ci-shared/observability-ingest/elastic-agent-ess-qa" +# CI_AGENT_QA_OBS_PATH="kv/ci-shared/observability-ingest/elastic-agent-ess-qa" +CI_ESS_STAGING_PATH="kv/ci-shared/platform-ingest/platform-ingest-ec-staging" CI_DRA_ROLE_PATH="kv/ci-shared/release/dra-role" @@ -54,7 +55,7 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == "elastic-agent" && "$BUILDKITE_STEP_KEY" == export TEST_INTEG_AUTH_GCP_SERVICE_TOKEN_FILE=$(realpath ./gcp.json) # ESS credentials - export API_KEY_TOKEN=$(vault kv get -field api_key ${CI_AGENT_QA_OBS_PATH}) + export API_KEY_TOKEN=$(vault kv get -field apiKey ${CI_ESS_STAGING_PATH}) echo ${API_KEY_TOKEN} > ./apiKey export TEST_INTEG_AUTH_ESS_APIKEY_FILE=$(realpath ./apiKey) fi diff --git a/.buildkite/scripts/steps/integration_tests.sh b/.buildkite/scripts/steps/integration_tests.sh index 9c469841e87..8b1dec76449 100755 --- a/.buildkite/scripts/steps/integration_tests.sh +++ b/.buildkite/scripts/steps/integration_tests.sh @@ -18,7 +18,7 @@ AGENT_PACKAGE_VERSION="${OVERRIDE_AGENT_PACKAGE_VERSION}" DEV=true EXTERNAL=true # Run integration tests set +e -AGENT_VERSION="${OVERRIDE_TEST_AGENT_VERSION}" TEST_INTEG_AUTH_ESS_REGION=azure-eastus2 TEST_INTEG_CLEAN_ON_EXIT=true SNAPSHOT=true mage integration:test +AGENT_VERSION="${OVERRIDE_TEST_AGENT_VERSION}" TEST_INTEG_CLEAN_ON_EXIT=true SNAPSHOT=true mage integration:test TESTS_EXIT_STATUS=$? set -e diff --git a/magefile.go b/magefile.go index 06016bd48ff..183889b3e69 100644 --- a/magefile.go +++ b/magefile.go @@ -2094,8 +2094,8 @@ func authESS(ctx context.Context) error { fmt.Fprintln(os.Stderr, "❌ ESS authentication unsuccessful. Retrying...") - prompt := "Please provide a ESS (QA) API key. To get your API key, " + - "visit https://console.qa.cld.elstc.co/deployment-features/keys:" + prompt := fmt.Sprintf("Please provide a ESS API key for %s. To get your API key, "+ + "visit %s/deployment-features/keys:", client.BaseURL(), strings.TrimRight(client.BaseURL(), "/api/v1")) essAPIKey, err = stringPrompt(prompt) if err != nil { return fmt.Errorf("unable to read ESS API key from prompt: %w", err) diff --git a/pkg/testing/ess/client.go b/pkg/testing/ess/client.go index 42587c6f275..0b618ff586e 100644 --- a/pkg/testing/ess/client.go +++ b/pkg/testing/ess/client.go @@ -60,3 +60,7 @@ func (c *Client) doPost(ctx context.Context, relativeUrl, contentType string, bo return c.client.Do(req) } + +func (c *Client) BaseURL() string { + return c.config.BaseUrl +} diff --git a/pkg/testing/ess/config.go b/pkg/testing/ess/config.go index 89298dbd341..c90be94caa5 100644 --- a/pkg/testing/ess/config.go +++ b/pkg/testing/ess/config.go @@ -18,7 +18,7 @@ type Config struct { func defaultConfig() *Config { return &Config{ - BaseUrl: `https://console.qa.cld.elstc.co/api/v1`, + BaseUrl: `https://staging.found.no/api/v1`, } } From ac2ef57b335ebca28216240508bd773d53009c7a Mon Sep 17 00:00:00 2001 From: Lee E Hinman <57081003+leehinman@users.noreply.github.com> Date: Thu, 5 Oct 2023 10:01:40 -0500 Subject: [PATCH 3/5] make delayed progress tests more resilient (#3526) allow progress tests to take more time than expected for CI --- internal/pkg/agent/install/progress_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/internal/pkg/agent/install/progress_test.go b/internal/pkg/agent/install/progress_test.go index e74f7ff5666..2bb9bb6b1d3 100644 --- a/internal/pkg/agent/install/progress_test.go +++ b/internal/pkg/agent/install/progress_test.go @@ -56,7 +56,7 @@ func TestProgress(t *testing.T) { rs.Failed() - require.Regexp(t, regexp.MustCompile(`step 1 starting\.{3}\.+ FAILED\n`), string(w.buf)) + require.Regexp(t, regexp.MustCompile(`step 1 starting\.{3,}\.+ FAILED\n`), string(w.buf)) }) t.Run("multi_step_immediate_success", func(t *testing.T) { @@ -93,7 +93,7 @@ func TestProgress(t *testing.T) { rs.Succeeded() - require.Regexp(t, regexp.MustCompile(`step 1 starting\.{3}\.+ DONE\nstep 2 starting\.{3}\.+ DONE`), string(w.buf)) + require.Regexp(t, regexp.MustCompile(`step 1 starting\.{3,}\.+ DONE\nstep 2 starting\.{3,}\.+ DONE`), string(w.buf)) }) t.Run("single_step_delay_after_failed", func(t *testing.T) { @@ -110,7 +110,7 @@ func TestProgress(t *testing.T) { rs.Failed() - require.Equal(t, "step 1 starting... FAILED\n", string(w.buf)) + require.Regexp(t, regexp.MustCompile(`step 1 starting.{3,} FAILED\n`), string(w.buf)) }) @@ -133,6 +133,6 @@ func TestProgress(t *testing.T) { rs.Succeeded() - require.Regexp(t, regexp.MustCompile(`step starting\.{3}\n substep 1 starting\.{3}\.+ DONE\n substep 2 starting\.{3}\.+ DONE\n DONE\n`), string(w.buf)) + require.Regexp(t, regexp.MustCompile(`step starting\.{3,}\n substep 1 starting\.{3,}\.+ DONE\n substep 2 starting\.{3,}\.+ DONE\n DONE\n`), string(w.buf)) }) } From eaf4e9c7995258d424ab4608c6ad2f5b972a9696 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Paolo=20Chil=C3=A0?= Date: Fri, 6 Oct 2023 07:51:29 +0200 Subject: [PATCH 4/5] fix integration test docs (#3545) --- docs/test-framework-dev-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/test-framework-dev-guide.md b/docs/test-framework-dev-guide.md index 0821f1a5996..3e7ef9383b8 100644 --- a/docs/test-framework-dev-guide.md +++ b/docs/test-framework-dev-guide.md @@ -10,7 +10,7 @@ Go version should be at least the same than the one in [.go-version](https://git ### Configuration -ESS (QA) API Key to create on https://console.qa.cld.elstc.co/deployment-features/keys +ESS (staging) API Key to create on https://staging.found.no/account/keys Warning: if you never created a deployment on it, you won't have permission to get this key, so you will need to create one first. From 33c6934057e80f3a4a3d6f1060345262980e334d Mon Sep 17 00:00:00 2001 From: Anderson Queiroz Date: Fri, 6 Oct 2023 10:42:39 +0200 Subject: [PATCH 5/5] fix install/enroll cmd to fail when agent restart fails (#3207) * surface errors that might occur during enroll * fail install command if agent cannot be restarted * do not print success message if there was an enroll error. Print an error message and the error instead * add logs to show the different enroll attempts * add more context t errors * refactor internal/pkg/agent/install/perms_unix.go and add more context to errors restore main version * ignore agent restart error on enroll tests as there is no agent to be restarted --- ...-Surface-errors-during-Agent's-enroll.yaml | 32 +++++++++ internal/pkg/agent/cmd/enroll.go | 2 +- internal/pkg/agent/cmd/enroll_cmd.go | 62 +++++++++++----- internal/pkg/agent/cmd/enroll_cmd_test.go | 71 +++++++++++++------ internal/pkg/agent/cmd/install.go | 4 +- internal/pkg/agent/install/perms_unix.go | 32 +++++---- 6 files changed, 151 insertions(+), 52 deletions(-) create mode 100644 changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml diff --git a/changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml b/changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml new file mode 100644 index 00000000000..f8361f99433 --- /dev/null +++ b/changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: bug-fix + +# Change summary; a 80ish characters long description of the change. +summary: Surface errors during Agent's enroll process, failing if any happens. + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; a word indicating the component this changeset affects. +component: install/enroll + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/elastic-agent/pull/3207 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +#issue: https://github.com/owner/repo/1234 diff --git a/internal/pkg/agent/cmd/enroll.go b/internal/pkg/agent/cmd/enroll.go index 1bce5f7e547..adaa278f32f 100644 --- a/internal/pkg/agent/cmd/enroll.go +++ b/internal/pkg/agent/cmd/enroll.go @@ -351,7 +351,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error { // Error: failed to fix permissions: chown /Library/Elastic/Agent/data/elastic-agent-c13f91/elastic-agent.app: operation not permitted // This is because we are fixing permissions twice, once during installation and again during the enrollment step. // When we are enrolling as part of installation on MacOS, skip the second attempt to fix permissions. - var fixPermissions bool = fromInstall + fixPermissions := fromInstall if runtime.GOOS == "darwin" { fixPermissions = false } diff --git a/internal/pkg/agent/cmd/enroll_cmd.go b/internal/pkg/agent/cmd/enroll_cmd.go index b5992f10188..f43e1483a8a 100644 --- a/internal/pkg/agent/cmd/enroll_cmd.go +++ b/internal/pkg/agent/cmd/enroll_cmd.go @@ -172,7 +172,7 @@ func newEnrollCmd( ) } -// newEnrollCmdWithStore creates an new enrollment and accept a custom store. +// newEnrollCmdWithStore creates a new enrollment and accept a custom store. func newEnrollCmdWithStore( log *logger.Logger, options *enrollCmdOption, @@ -187,10 +187,11 @@ func newEnrollCmdWithStore( }, nil } -// Execute tries to enroll the agent into Fleet. +// Execute enrolls the agent into Fleet. func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { var err error defer c.stopAgent() // ensure its stopped no matter what + span, ctx := apm.StartSpan(ctx, "enroll", "app.internal") defer func() { apm.CaptureError(ctx, err).Send() @@ -235,7 +236,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { // Ensure that the agent does not use a proxy configuration // when connecting to the local fleet server. // Note that when running fleet-server the enroll request will be sent to :8220, - // however when the agent is running afterwards requests will be sent to :8221 + // however when the agent is running afterward requests will be sent to :8221 c.remoteConfig.Transport.Proxy.Disable = true } @@ -256,7 +257,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { err = c.enrollWithBackoff(ctx, persistentConfig) if err != nil { - return errors.New(err, "fail to enroll") + return fmt.Errorf("fail to enroll: %w", err) } if c.options.FixPermissions { @@ -267,17 +268,23 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { } defer func() { - fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.") + if err != nil { + fmt.Fprintf(streams.Err, "Something went wrong while enrolling the Elastic Agent: %v\n", err) + } else { + fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.") + } }() if c.agentProc == nil { - if err := c.daemonReload(ctx); err != nil { - c.log.Infow("Elastic Agent might not be running; unable to trigger restart", "error", err) - } else { - c.log.Info("Successfully triggered restart on running Elastic Agent.") + if err = c.daemonReloadWithBackoff(ctx); err != nil { + c.log.Errorf("Elastic Agent might not be running; unable to trigger restart: %v", err) + return fmt.Errorf("could not reload agent daemon, unable to trigger restart: %w", err) } + + c.log.Info("Successfully triggered restart on running Elastic Agent.") return nil } + c.log.Info("Elastic Agent has been enrolled; start Elastic Agent") return nil } @@ -443,6 +450,11 @@ func (c *enrollCmd) prepareFleetTLS() error { func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error { err := c.daemonReload(ctx) + if err != nil && + (errors.Is(err, context.DeadlineExceeded) || + errors.Is(err, context.Canceled)) { + return fmt.Errorf("could not reload deamon: %w", err) + } if err == nil { return nil } @@ -450,17 +462,20 @@ func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error { signal := make(chan struct{}) backExp := backoff.NewExpBackoff(signal, 10*time.Second, 1*time.Minute) - for i := 5; i >= 0; i-- { + var i int + for ; i < 5; i++ { backExp.Wait() c.log.Info("Retrying to restart...") err = c.daemonReload(ctx) - if err == nil { + if err == nil || + errors.Is(err, context.DeadlineExceeded) || + errors.Is(err, context.Canceled) { break } } close(signal) - return err + return fmt.Errorf("could not reload deamon after %d retries: %w", i+1, err) } func (c *enrollCmd) daemonReload(ctx context.Context) error { @@ -478,8 +493,20 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[ c.log.Infof("Starting enrollment to URL: %s", c.client.URI()) err := c.enroll(ctx, persistentConfig) + if err == nil { + return nil + } + + const deadline = 10 * time.Minute + const frequency = 60 * time.Second + + c.log.Infof("1st enrollment attempt failed, retrying for %s, every %s enrolling to URL: %s", + deadline, + frequency, + c.client.URI()) signal := make(chan struct{}) - backExp := backoff.NewExpBackoff(signal, 60*time.Second, 10*time.Minute) + defer close(signal) + backExp := backoff.NewExpBackoff(signal, frequency, deadline) for { retry := false @@ -498,7 +525,6 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[ err = c.enroll(ctx, persistentConfig) } - close(signal) return err } @@ -547,8 +573,10 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte c.options.FleetServer.ElasticsearchInsecure, ) if err != nil { - return err + return fmt.Errorf( + "failed creating fleet-server bootstrap config: %w", err) } + // no longer need bootstrap at this point serverConfig.Server.Bootstrap = false fleetConfig.Server = serverConfig.Server @@ -568,11 +596,11 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte reader, err := yamlToReader(configToStore) if err != nil { - return err + return fmt.Errorf("yamlToReader failed: %w", err) } if err := safelyStoreAgentInfo(c.configStore, reader); err != nil { - return err + return fmt.Errorf("failed to store agent config: %w", err) } // clear action store diff --git a/internal/pkg/agent/cmd/enroll_cmd_test.go b/internal/pkg/agent/cmd/enroll_cmd_test.go index 189ad7b6563..426924273d1 100644 --- a/internal/pkg/agent/cmd/enroll_cmd_test.go +++ b/internal/pkg/agent/cmd/enroll_cmd_test.go @@ -16,8 +16,11 @@ import ( "os" "runtime" "strconv" + "strings" "testing" + "time" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/elastic/elastic-agent/internal/pkg/agent/configuration" @@ -159,14 +162,24 @@ func TestEnroll(t *testing.T) { require.NoError(t, err) streams, _, _, _ := cli.NewTestingIOStreams() - err = cmd.Execute(context.Background(), streams) - require.NoError(t, err) + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + defer cancel() + err = cmd.Execute(ctx, streams) + + if err != nil && + // There is no agent running, therefore nothing to be restarted. + // However, this will cause the Enroll command to return an error + // which we'll ignore here. + !strings.Contains(err.Error(), + "could not reload agent daemon, unable to trigger restart") { + t.Fatalf("enrrol coms returned and unexpected error: %v", err) + } config, err := readConfig(store.Content) - require.NoError(t, err) - require.Equal(t, "my-access-api-key", config.AccessAPIKey) - require.Equal(t, host, config.Client.Host) + + assert.Equal(t, "my-access-api-key", config.AccessAPIKey) + assert.Equal(t, host, config.Client.Host) }, )) @@ -216,16 +229,24 @@ func TestEnroll(t *testing.T) { require.NoError(t, err) streams, _, _, _ := cli.NewTestingIOStreams() - err = cmd.Execute(context.Background(), streams) - require.NoError(t, err) - - require.True(t, store.Called) - + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + defer cancel() + err = cmd.Execute(ctx, streams) + if err != nil && + // There is no agent running, therefore nothing to be restarted. + // However, this will cause the Enroll command to return an error + // which we'll ignore here. + !strings.Contains(err.Error(), + "could not reload agent daemon, unable to trigger restart") { + t.Fatalf("enrrol coms returned and unexpected error: %v", err) + } + + assert.True(t, store.Called) config, err := readConfig(store.Content) - require.NoError(t, err) - require.Equal(t, "my-access-api-key", config.AccessAPIKey) - require.Equal(t, host, config.Client.Host) + assert.NoError(t, err) + assert.Equal(t, "my-access-api-key", config.AccessAPIKey) + assert.Equal(t, host, config.Client.Host) }, )) @@ -275,16 +296,24 @@ func TestEnroll(t *testing.T) { require.NoError(t, err) streams, _, _, _ := cli.NewTestingIOStreams() - err = cmd.Execute(context.Background(), streams) - require.NoError(t, err) - - require.True(t, store.Called) - + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + defer cancel() + err = cmd.Execute(ctx, streams) + + if err != nil && + // There is no agent running, therefore nothing to be restarted. + // However, this will cause the Enroll command to return an error + // which we'll ignore here. + !strings.Contains(err.Error(), + "could not reload agent daemon, unable to trigger restart") { + t.Fatalf("enrrol coms returned and unexpected error: %v", err) + } + + assert.True(t, store.Called) config, err := readConfig(store.Content) - require.NoError(t, err) - require.Equal(t, "my-access-api-key", config.AccessAPIKey) - require.Equal(t, host, config.Client.Host) + assert.Equal(t, "my-access-api-key", config.AccessAPIKey) + assert.Equal(t, host, config.Client.Host) }, )) diff --git a/internal/pkg/agent/cmd/install.go b/internal/pkg/agent/cmd/install.go index 4fbd37f40da..2cb46bd599d 100644 --- a/internal/pkg/agent/cmd/install.go +++ b/internal/pkg/agent/cmd/install.go @@ -154,7 +154,7 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error { return fmt.Errorf("problem reading prompt response") } if url == "" { - fmt.Fprintf(streams.Out, "Enrollment cancelled because no URL was provided.\n") + fmt.Fprintln(streams.Out, "Enrollment cancelled because no URL was provided.") return nil } } @@ -224,6 +224,8 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error { } }() } + + fmt.Fprintln(streams.Out, "Elastic Agent successfully installed, starting enrollment.") } if enroll { diff --git a/internal/pkg/agent/install/perms_unix.go b/internal/pkg/agent/install/perms_unix.go index e84dcd5039c..fc357fd4fde 100644 --- a/internal/pkg/agent/install/perms_unix.go +++ b/internal/pkg/agent/install/perms_unix.go @@ -8,6 +8,7 @@ package install import ( "errors" + "fmt" "io/fs" "os" "path/filepath" @@ -18,19 +19,26 @@ func fixPermissions(topPath string) error { return recursiveRootPermissions(topPath) } -func recursiveRootPermissions(path string) error { - return filepath.Walk(path, func(name string, info fs.FileInfo, err error) error { - if err == nil { - // all files should be owned by root:root - err = os.Chown(name, 0, 0) - if err != nil { - return err - } - // remove any world permissions from the file - err = os.Chmod(name, info.Mode().Perm()&0770) - } else if errors.Is(err, fs.ErrNotExist) { +func recursiveRootPermissions(root string) error { + return filepath.Walk(root, func(path string, info fs.FileInfo, err error) error { + if errors.Is(err, fs.ErrNotExist) { return nil } - return err + if err != nil { + return fmt.Errorf("walk on %q failed: %w", path, err) + } + + // all files should be owned by root:root + err = os.Chown(path, 0, 0) + if err != nil { + return fmt.Errorf("could not fix ownership of %q: %w", path, err) + } + // remove any world permissions from the file + err = os.Chmod(path, info.Mode().Perm()&0770) + if err != nil { + return fmt.Errorf("could not fix permissions of %q: %w", path, err) + } + + return nil }) }