From d56730472e76cf38fda1d368c5b20253a814878f Mon Sep 17 00:00:00 2001 From: "mergify[bot]" <37929162+mergify[bot]@users.noreply.github.com> Date: Mon, 25 Sep 2023 13:24:43 +0000 Subject: [PATCH] Fix `TestInstallAndCLIUninstallWithEndpointSecurity` integration test flakiness (#3410) (#3437) * Return error if more than one Agent with same hostname is found in Fleet * Fix make call * Un-enroll Agent on test cleanup * Use policy ID and hostname to uniquely find Agent * Passing policy ID in more calls * Fix implicit memory aliasing * [Testing] Increasing timeout for status check to 10m * Get Agent by ID * Revert "Get Agent by ID" This reverts commit 1c7da6e7f646d0151cf091ff467915e19e3d7abc. (cherry picked from commit b19e95069e28d0389e2e66a8281e5df823566ecc) Co-authored-by: Shaunak Kashyap --- pkg/testing/tools/agents.go | 43 ++++++++++++------- pkg/testing/tools/tools.go | 8 ++-- testing/integration/endpoint_security_test.go | 8 +++- testing/integration/fqdn_test.go | 36 ++++++++-------- testing/integration/monitoring_logs_test.go | 2 +- testing/integration/upgrade_test.go | 10 ++--- 6 files changed, 62 insertions(+), 45 deletions(-) diff --git a/pkg/testing/tools/agents.go b/pkg/testing/tools/agents.go index 1bb6eda9cd4..f7ca5c33270 100644 --- a/pkg/testing/tools/agents.go +++ b/pkg/testing/tools/agents.go @@ -19,30 +19,41 @@ import ( "github.com/elastic/elastic-agent/pkg/control/v2/cproto" ) -// GetAgentByHostnameFromList get an agent by the local_metadata.host.name property, reading from the agents list -func GetAgentByHostnameFromList(client *kibana.Client, hostname string) (*kibana.AgentExisting, error) { +// GetAgentByPolicyIDAndHostnameFromList get an agent by the local_metadata.host.name property, reading from the agents list +func GetAgentByPolicyIDAndHostnameFromList(client *kibana.Client, policyID, hostname string) (*kibana.AgentExisting, error) { listAgentsResp, err := client.ListAgents(context.Background(), kibana.ListAgentsRequest{}) if err != nil { return nil, err } - for _, item := range listAgentsResp.Items { + hostnameAgents := make([]*kibana.AgentExisting, 0) + for i, item := range listAgentsResp.Items { agentHostname := item.LocalMetadata.Host.Hostname - if agentHostname == hostname { - return &item, nil + agentPolicyID := item.PolicyID + + if agentHostname == hostname && agentPolicyID == policyID { + hostnameAgents = append(hostnameAgents, &listAgentsResp.Items[i]) } } - return nil, fmt.Errorf("unable to find agent with hostname [%s]", hostname) + if len(hostnameAgents) == 0 { + return nil, fmt.Errorf("unable to find agent with hostname [%s]", hostname) + } + + if len(hostnameAgents) > 1 { + return nil, fmt.Errorf("found %d agents with hostname [%s]; expected to find only one", len(hostnameAgents), hostname) + } + + return hostnameAgents[0], nil } -func GetAgentStatus(client *kibana.Client) (string, error) { +func GetAgentStatus(client *kibana.Client, policyID string) (string, error) { hostname, err := os.Hostname() if err != nil { return "", err } - agent, err := GetAgentByHostnameFromList(client, hostname) + agent, err := GetAgentByPolicyIDAndHostnameFromList(client, policyID, hostname) if err != nil { return "", err } @@ -50,13 +61,13 @@ func GetAgentStatus(client *kibana.Client) (string, error) { return agent.Status, nil } -func GetAgentVersion(client *kibana.Client) (string, error) { +func GetAgentVersion(client *kibana.Client, policyID string) (string, error) { hostname, err := os.Hostname() if err != nil { return "", err } - agent, err := GetAgentByHostnameFromList(client, hostname) + agent, err := GetAgentByPolicyIDAndHostnameFromList(client, policyID, hostname) if err != nil { return "", err } @@ -64,12 +75,12 @@ func GetAgentVersion(client *kibana.Client) (string, error) { return agent.Agent.Version, err } -func UnEnrollAgent(client *kibana.Client) error { +func UnEnrollAgent(client *kibana.Client, policyID string) error { hostname, err := os.Hostname() if err != nil { return err } - agentID, err := GetAgentIDByHostname(client, hostname) + agentID, err := GetAgentIDByHostname(client, policyID, hostname) if err != nil { return err } @@ -86,20 +97,20 @@ func UnEnrollAgent(client *kibana.Client) error { return nil } -func GetAgentIDByHostname(client *kibana.Client, hostname string) (string, error) { - agent, err := GetAgentByHostnameFromList(client, hostname) +func GetAgentIDByHostname(client *kibana.Client, policyID, hostname string) (string, error) { + agent, err := GetAgentByPolicyIDAndHostnameFromList(client, policyID, hostname) if err != nil { return "", err } return agent.Agent.ID, nil } -func UpgradeAgent(client *kibana.Client, version string) error { +func UpgradeAgent(client *kibana.Client, policyID, version string) error { hostname, err := os.Hostname() if err != nil { return err } - agentID, err := GetAgentIDByHostname(client, hostname) + agentID, err := GetAgentIDByHostname(client, policyID, hostname) if err != nil { return err } diff --git a/pkg/testing/tools/tools.go b/pkg/testing/tools/tools.go index 7b125293eb6..4ac8c930e66 100644 --- a/pkg/testing/tools/tools.go +++ b/pkg/testing/tools/tools.go @@ -19,9 +19,9 @@ import ( // WaitForAgentStatus returns a niladic function that returns true if the agent // has reached expectedStatus; false otherwise. The returned function is intended // for use with assert.Eventually or require.Eventually. -func WaitForAgentStatus(t *testing.T, client *kibana.Client, expectedStatus string) func() bool { +func WaitForAgentStatus(t *testing.T, client *kibana.Client, policyID string, expectedStatus string) func() bool { return func() bool { - currentStatus, err := GetAgentStatus(client) + currentStatus, err := GetAgentStatus(client, policyID) if err != nil { t.Errorf("unable to determine agent status: %s", err.Error()) return false @@ -128,14 +128,14 @@ func InstallAgentForPolicy(t *testing.T, ctx context.Context, } t.Logf(">>> Ran Enroll. Output: %s", output) - timeout := 5 * time.Minute + timeout := 10 * time.Minute if deadline, ok := ctx.Deadline(); ok { timeout = time.Until(deadline) } // Wait for Agent to be healthy require.Eventually( t, - WaitForAgentStatus(t, kibClient, "online"), + WaitForAgentStatus(t, kibClient, policyID, "online"), timeout, 10*time.Second, "Elastic Agent status is not online", diff --git a/testing/integration/endpoint_security_test.go b/testing/integration/endpoint_security_test.go index bb77f724e52..9304901047f 100644 --- a/testing/integration/endpoint_security_test.go +++ b/testing/integration/endpoint_security_test.go @@ -19,6 +19,7 @@ import ( "text/template" "time" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/google/uuid" @@ -135,6 +136,11 @@ func testInstallAndCLIUninstallWithEndpointSecurity(t *testing.T, info *define.I installOpts, fixture, info.KibanaClient, createPolicyReq) require.NoError(t, err, "failed to install agent with policy") + t.Cleanup(func() { + t.Log("Un-enrolling Elastic Agent...") + assert.NoError(t, tools.UnEnrollAgent(info.KibanaClient, policy.ID)) + }) + t.Log("Installing Elastic Defend") pkgPolicyResp, err := installElasticDefendPackage(t, info, policy.ID) require.NoErrorf(t, err, "Policy Response was: %v", pkgPolicyResp) @@ -238,7 +244,7 @@ func testInstallAndUnenrollWithEndpointSecurity(t *testing.T, info *define.Info, hostname, err := os.Hostname() require.NoError(t, err) - agentID, err := tools.GetAgentIDByHostname(info.KibanaClient, hostname) + agentID, err := tools.GetAgentIDByHostname(info.KibanaClient, policy.ID, hostname) require.NoError(t, err) _, err = info.KibanaClient.UnEnrollAgent(ctx, kibana.UnEnrollAgentRequest{ID: agentID}) diff --git a/testing/integration/fqdn_test.go b/testing/integration/fqdn_test.go index b280f1e6e5b..2fb76a6028c 100644 --- a/testing/integration/fqdn_test.go +++ b/testing/integration/fqdn_test.go @@ -56,19 +56,6 @@ func TestFQDN(t *testing.T) { origHostname, err := getHostname(context.Background()) require.NoError(t, err) - t.Cleanup(func() { - t.Log("Un-enrolling Elastic Agent...") - assert.NoError(t, tools.UnEnrollAgent(info.KibanaClient)) - - t.Log("Restoring hostname...") - err := setHostname(context.Background(), origHostname, t.Log) - require.NoError(t, err) - - t.Log("Restoring original /etc/hosts...") - err = setEtcHosts(origEtcHosts) - require.NoError(t, err) - }) - ctx := context.Background() kibClient := info.KibanaClient @@ -101,8 +88,21 @@ func TestFQDN(t *testing.T) { policy, err := tools.InstallAgentWithPolicy(t, ctx, installOpts, agentFixture, kibClient, createPolicyReq) require.NoError(t, err) + t.Cleanup(func() { + t.Log("Un-enrolling Elastic Agent...") + assert.NoError(t, tools.UnEnrollAgent(info.KibanaClient, policy.ID)) + + t.Log("Restoring hostname...") + err := setHostname(context.Background(), origHostname, t.Log) + require.NoError(t, err) + + t.Log("Restoring original /etc/hosts...") + err = setEtcHosts(origEtcHosts) + require.NoError(t, err) + }) + t.Log("Verify that agent name is short hostname") - agent := verifyAgentName(t, shortName, info.KibanaClient) + agent := verifyAgentName(t, policy.ID, shortName, info.KibanaClient) t.Log("Verify that hostname in `logs-*` and `metrics-*` is short hostname") verifyHostNameInIndices(t, "logs-*", shortName, info.Namespace, info.ESClient) @@ -133,7 +133,7 @@ func TestFQDN(t *testing.T) { ) t.Log("Verify that agent name is FQDN") - verifyAgentName(t, fqdn, info.KibanaClient) + verifyAgentName(t, policy.ID, fqdn, info.KibanaClient) t.Log("Verify that hostname in `logs-*` and `metrics-*` is FQDN") verifyHostNameInIndices(t, "logs-*", fqdn, info.Namespace, info.ESClient) @@ -164,7 +164,7 @@ func TestFQDN(t *testing.T) { ) t.Log("Verify that agent name is short hostname again") - verifyAgentName(t, shortName, info.KibanaClient) + verifyAgentName(t, policy.ID, shortName, info.KibanaClient) // TODO: Re-enable assertion once https://github.com/elastic/elastic-agent/issues/3078 is // investigated for root cause and resolved. @@ -173,7 +173,7 @@ func TestFQDN(t *testing.T) { // verifyHostNameInIndices(t, "metrics-*", shortName, info.ESClient) } -func verifyAgentName(t *testing.T, hostname string, kibClient *kibana.Client) *kibana.AgentExisting { +func verifyAgentName(t *testing.T, policyID, hostname string, kibClient *kibana.Client) *kibana.AgentExisting { t.Helper() var agent *kibana.AgentExisting @@ -182,7 +182,7 @@ func verifyAgentName(t *testing.T, hostname string, kibClient *kibana.Client) *k require.Eventually( t, func() bool { - agent, err = tools.GetAgentByHostnameFromList(kibClient, hostname) + agent, err = tools.GetAgentByPolicyIDAndHostnameFromList(kibClient, policyID, hostname) return err == nil && agent != nil }, 5*time.Minute, diff --git a/testing/integration/monitoring_logs_test.go b/testing/integration/monitoring_logs_test.go index 368462ed92b..607fe0c9b8d 100644 --- a/testing/integration/monitoring_logs_test.go +++ b/testing/integration/monitoring_logs_test.go @@ -114,7 +114,7 @@ func TestMonitoringLogsShipped(t *testing.T) { t.Fatalf("could not get hostname to filter Agent: %s", err) } - agentID, err := tools.GetAgentIDByHostname(info.KibanaClient, hostname) + agentID, err := tools.GetAgentIDByHostname(info.KibanaClient, policy.ID, hostname) require.NoError(t, err, "could not get Agent ID by hostname") t.Logf("Agent ID: %q", agentID) diff --git a/testing/integration/upgrade_test.go b/testing/integration/upgrade_test.go index 79c6f3658c8..798b842b216 100644 --- a/testing/integration/upgrade_test.go +++ b/testing/integration/upgrade_test.go @@ -158,25 +158,25 @@ func testUpgradeFleetManagedElasticAgent(t *testing.T, ctx context.Context, info require.NoError(t, err) t.Cleanup(func() { t.Log("Un-enrolling Elastic Agent...") - assert.NoError(t, tools.UnEnrollAgent(info.KibanaClient)) + assert.NoError(t, tools.UnEnrollAgent(info.KibanaClient, policy.ID)) }) t.Log(`Waiting for enrolled Agent status to be "online"...`) - require.Eventually(t, tools.WaitForAgentStatus(t, kibClient, "online"), 2*time.Minute, 10*time.Second, "Agent status is not online") + require.Eventually(t, tools.WaitForAgentStatus(t, kibClient, policy.ID, "online"), 2*time.Minute, 10*time.Second, "Agent status is not online") t.Logf("Upgrade Elastic Agent to version %s...", toVersion) - err = tools.UpgradeAgent(kibClient, toVersion) + err = tools.UpgradeAgent(kibClient, policy.ID, toVersion) require.NoError(t, err) t.Log(`Waiting for enrolled Agent status to be "online"...`) - require.Eventually(t, tools.WaitForAgentStatus(t, kibClient, "online"), 10*time.Minute, 15*time.Second, "Agent status is not online") + require.Eventually(t, tools.WaitForAgentStatus(t, kibClient, policy.ID, "online"), 10*time.Minute, 15*time.Second, "Agent status is not online") // We remove the `-SNAPSHOT` suffix because, post-upgrade, the version reported // by the Agent will not contain this suffix, even if a `-SNAPSHOT`-suffixed // version was used as the target version for the upgrade. require.Eventually(t, func() bool { t.Log("Getting Agent version...") - newVersion, err := tools.GetAgentVersion(kibClient) + newVersion, err := tools.GetAgentVersion(kibClient, policy.ID) if err != nil { t.Logf("error getting agent version: %v", err) return false