From 697a1bc677e7939d606743ae141493a7e7406ab0 Mon Sep 17 00:00:00 2001 From: Davide Girardi <1390902+girodav@users.noreply.github.com> Date: Tue, 14 Nov 2023 08:26:35 +0000 Subject: [PATCH 1/3] Remove assetbeat as dependency (#3739) --- dev-tools/mage/manifest/manifest.go | 1 - magefile.go | 2 -- 2 files changed, 3 deletions(-) diff --git a/dev-tools/mage/manifest/manifest.go b/dev-tools/mage/manifest/manifest.go index a2c377d600c..62f82eb2c87 100644 --- a/dev-tools/mage/manifest/manifest.go +++ b/dev-tools/mage/manifest/manifest.go @@ -87,7 +87,6 @@ func DownloadComponentsFromManifest(manifest string, platforms []string, platfor "beats": {"auditbeat", "filebeat", "heartbeat", "metricbeat", "osquerybeat", "packetbeat"}, "cloud-defend": {"cloud-defend"}, "cloudbeat": {"cloudbeat"}, - "assetbeat": {"assetbeat"}, "elastic-agent-shipper": {"elastic-agent-shipper"}, "endpoint-dev": {"endpoint-security"}, "fleet-server": {"fleet-server"}, diff --git a/magefile.go b/magefile.go index 23394850ee3..892b5d65560 100644 --- a/magefile.go +++ b/magefile.go @@ -929,7 +929,6 @@ func packageAgent(platforms []string, packagingFn func()) { // https://artifacts-snapshot.elastic.co/endpoint-dev/latest/8.11.0-SNAPSHOT.json // https://artifacts-snapshot.elastic.co/fleet-server/latest/8.11.0-SNAPSHOT.json // https://artifacts-snapshot.elastic.co/prodfiler/latest/8.11.0-SNAPSHOT.json - // https://artifacts-snapshot.elastic.co/assetbeat/latest/8.11.0-SNAPSHOT.json externalBinaries := map[string]string{ "auditbeat": "beats", "filebeat": "beats", @@ -945,7 +944,6 @@ func packageAgent(platforms []string, packagingFn func()) { "pf-elastic-collector": "prodfiler", "pf-elastic-symbolizer": "prodfiler", "pf-host-agent": "prodfiler", - "assetbeat": "assetbeat", // only supporting linux/amd64 or linux/arm64 } // Only log fatal logs for logs produced using logrus. This is the global logger From b272a93bcf47105daa424e5ee8139c12c3234369 Mon Sep 17 00:00:00 2001 From: Blake Rouse Date: Tue, 14 Nov 2023 05:04:18 -0500 Subject: [PATCH 2/3] Switch to CFT region and add more robust tracking and cleanup of stacks. (#3701) * Switch to CFT region and add more robust tracking and cleanup of stacks. * Fix tests. * Adjust integration tests. * Fix lint in serverless provider. * Fix serverless. * Fix comment and typo. * Fix serverless. * More serverless fixes. * Fix check loop in serverless. * Fix lint. * Code review fixes. * Fix a few missed error messages.
--- .buildkite/hooks/pre-command | 5 +- .buildkite/hooks/pre-exit | 6 +- .buildkite/pipeline.yml | 4 - magefile.go | 9 +- pkg/testing/ess/config.go | 7 +- pkg/testing/ess/provisioner.go | 149 +++++++-------- pkg/testing/ess/serverless.go | 2 +- pkg/testing/ess/serverless_provision.go | 236 +++++++++++++----------- pkg/testing/ess/serverless_test.go | 29 ++- pkg/testing/runner/provisioner.go | 16 +- pkg/testing/runner/runner.go | 185 +++++++++++++------ pkg/testing/runner/runner_test.go | 48 +++-- 12 files changed, 386 insertions(+), 310 deletions(-) diff --git a/.buildkite/hooks/pre-command b/.buildkite/hooks/pre-command index 292aa6918c0..c8a44505148 100755 --- a/.buildkite/hooks/pre-command +++ b/.buildkite/hooks/pre-command @@ -17,8 +17,7 @@ DOCKER_REGISTRY="docker.elastic.co" DOCKER_REGISTRY_SECRET_PATH="kv/ci-shared/platform-ingest/docker_registry_prod" CI_DRA_ROLE_PATH="kv/ci-shared/release/dra-role" CI_GCP_OBS_PATH="kv/ci-shared/observability-ingest/cloud/gcp" -# CI_AGENT_QA_OBS_PATH="kv/ci-shared/observability-ingest/elastic-agent-ess-qa" -CI_ESS_STAGING_PATH="kv/ci-shared/platform-ingest/platform-ingest-ec-staging" +CI_ESS_PATH="kv/ci-shared/platform-ingest/platform-ingest-ec-prod" CI_DRA_ROLE_PATH="kv/ci-shared/release/dra-role" @@ -55,7 +54,7 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == "elastic-agent" && "$BUILDKITE_STEP_KEY" == export TEST_INTEG_AUTH_GCP_SERVICE_TOKEN_FILE=$(realpath ./gcp.json) # ESS credentials - export API_KEY_TOKEN=$(vault kv get -field apiKey ${CI_ESS_STAGING_PATH}) + export API_KEY_TOKEN=$(vault kv get -field apiKey ${CI_ESS_PATH}) echo ${API_KEY_TOKEN} > ./apiKey export TEST_INTEG_AUTH_ESS_APIKEY_FILE=$(realpath ./apiKey) fi diff --git a/.buildkite/hooks/pre-exit b/.buildkite/hooks/pre-exit index 4d0da50cf73..213f51aff7b 100755 --- a/.buildkite/hooks/pre-exit +++ b/.buildkite/hooks/pre-exit @@ -10,7 +10,11 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == "elastic-agent" && "$BUILDKITE_STEP_KEY" == # Perform cleanup of integration tests resources echo "--- Cleaning up integration test resources" - TEST_INTEG_AUTH_ESS_REGION=us-east-1 SNAPSHOT=true mage integration:clean + if [[ "$BUILDKITE_STEP_KEY" == "serverless-integration-tests" ]]; then + STACK_PROVISIONER=serverless SNAPSHOT=true mage integration:clean + else + SNAPSHOT=true mage integration:clean + fi fi if [ -n "$GOOGLE_APPLICATION_CREDENTIALS" ]; then diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 489a3202ee3..866dc91b367 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -201,8 +201,6 @@ steps: - label: "Serverless integration test" key: "serverless-integration-tests" - env: - TEST_INTEG_AUTH_ESS_REGION: us-east-1 command: ".buildkite/scripts/steps/integration_tests.sh serverless integration:single TestLogIngestionFleetManaged" #right now, run a single test in serverless mode as a sort of smoke test, instead of re-running the entire suite artifact_paths: - "build/TEST-**" @@ -213,8 +211,6 @@ steps: - label: "Integration tests" key: "integration-tests" - env: - TEST_INTEG_AUTH_ESS_REGION: us-east-1 command: ".buildkite/scripts/steps/integration_tests.sh stateful" artifact_paths: - "build/TEST-**" diff --git a/magefile.go b/magefile.go index 892b5d65560..c6a73d147c3 100644 --- a/magefile.go +++ b/magefile.go @@ -1750,15 +1750,16 @@ func createTestRunner(matrix bool, singleTest string, goTestFlags string, batche } datacenter := os.Getenv("TEST_INTEG_AUTH_GCP_DATACENTER") if datacenter == "" { + // us-central1-a is used because T2A instances required for ARM64 testing are 
only + // available in the central regions datacenter = "us-central1-a" } - // Valid values are gcp-us-central1 (default), azure-eastus2, - // aws-eu-central-1, us-east-1 (which is an AWS region but the - // "aws" CSP prefix is not used by ESS for some reason!) + // Possible to change the region for deployment, default is gcp-us-west2 which is + // the CFT region. essRegion := os.Getenv("TEST_INTEG_AUTH_ESS_REGION") if essRegion == "" { - essRegion = "gcp-us-central1" + essRegion = "gcp-us-west2" } instanceProvisionerMode := os.Getenv("INSTANCE_PROVISIONER") diff --git a/pkg/testing/ess/config.go b/pkg/testing/ess/config.go index c90be94caa5..62ece1be1ef 100644 --- a/pkg/testing/ess/config.go +++ b/pkg/testing/ess/config.go @@ -17,8 +17,13 @@ type Config struct { } func defaultConfig() *Config { + baseURL := os.Getenv("TEST_INTEG_AUTH_ESS_URL") + if baseURL == "" { + baseURL = "https://cloud.elastic.co" + } + url := strings.TrimRight(baseURL, "/") + "/api/v1" return &Config{ - BaseUrl: `https://staging.found.no/api/v1`, + BaseUrl: url, } } diff --git a/pkg/testing/ess/provisioner.go b/pkg/testing/ess/provisioner.go index a051cac39d1..47e8d9dcba2 100644 --- a/pkg/testing/ess/provisioner.go +++ b/pkg/testing/ess/provisioner.go @@ -11,8 +11,6 @@ import ( "strings" "time" - "golang.org/x/sync/errgroup" - "github.com/elastic/elastic-agent/pkg/testing/runner" ) @@ -62,89 +60,77 @@ func (p *provisioner) SetLogger(l runner.Logger) { p.logger = l } -func (p *provisioner) Provision(ctx context.Context, requests []runner.StackRequest) ([]runner.Stack, error) { - results := make(map[runner.StackRequest]*CreateDeploymentResponse) - for _, r := range requests { - // allow up to 2 minutes for each create request - createCtx, createCancel := context.WithTimeout(ctx, 2*time.Minute) - resp, err := p.createDeployment(createCtx, r, - map[string]string{ - "division": "engineering", - "org": "ingest", - "team": "elastic-agent", - "project": "elastic-agent", - "integration-tests": "true", - }) - createCancel() - if err != nil { - return nil, err - } - results[r] = resp - } +// Create creates a stack. 
+func (p *provisioner) Create(ctx context.Context, request runner.StackRequest) (runner.Stack, error) { + // allow up to 2 minutes for request + createCtx, createCancel := context.WithTimeout(ctx, 2*time.Minute) + defer createCancel() + resp, err := p.createDeployment(createCtx, request, + map[string]string{ + "division": "engineering", + "org": "ingest", + "team": "elastic-agent", + "project": "elastic-agent", + "integration-tests": "true", + }) + if err != nil { + return runner.Stack{}, err + } + return runner.Stack{ + ID: request.ID, + Version: request.Version, + Elasticsearch: resp.ElasticsearchEndpoint, + Kibana: resp.KibanaEndpoint, + Username: resp.Username, + Password: resp.Password, + Internal: map[string]interface{}{ + "deployment_id": resp.ID, + }, + Ready: false, + }, nil +} - // set a long timeout - // this context travels up to the magefile, clients that want a shorter timeout can set - // it via mage's -t flag - readyCtx, readyCancel := context.WithTimeout(ctx, 25*time.Minute) - defer readyCancel() - - g, gCtx := errgroup.WithContext(readyCtx) - for req, resp := range results { - g.Go(func(req runner.StackRequest, resp *CreateDeploymentResponse) func() error { - return func() error { - ready, err := p.client.DeploymentIsReady(gCtx, resp.ID, 30*time.Second) - if err != nil { - return fmt.Errorf("failed to check for cloud %s to be ready: %w", req.Version, err) - } - if !ready { - return fmt.Errorf("cloud %s never became ready: %w", req.Version, err) - } - return nil - } - }(req, resp)) +// WaitForReady should block until the stack is ready or the context is cancelled. +func (p *provisioner) WaitForReady(ctx context.Context, stack runner.Stack) (runner.Stack, error) { + deploymentID, err := p.getDeploymentID(stack) + if err != nil { + return stack, fmt.Errorf("failed to get deployment ID from the stack: %w", err) } - err := g.Wait() + // allow up to 10 minutes for it to become ready + ctx, cancel := context.WithTimeout(ctx, 10*time.Minute) + defer cancel() + p.logger.Logf("Waiting for cloud stack %s to be ready [stack_id: %s, deployment_id: %s]", stack.Version, stack.ID, deploymentID) + ready, err := p.client.DeploymentIsReady(ctx, deploymentID, 30*time.Second) if err != nil { - return nil, err + return stack, fmt.Errorf("failed to check for cloud %s [stack_id: %s, deployment_id: %s] to be ready: %w", stack.Version, stack.ID, deploymentID, err) } - - var stacks []runner.Stack - for req, resp := range results { - stacks = append(stacks, runner.Stack{ - ID: req.ID, - Version: req.Version, - Elasticsearch: resp.ElasticsearchEndpoint, - Kibana: resp.KibanaEndpoint, - Username: resp.Username, - Password: resp.Password, - Internal: map[string]interface{}{ - "deployment_id": resp.ID, - }, - }) + if !ready { + return stack, fmt.Errorf("cloud %s [stack_id: %s, deployment_id: %s] never became ready: %w", stack.Version, stack.ID, deploymentID, err) } - return stacks, nil + stack.Ready = true + return stack, nil } -// Clean cleans up all provisioned resources. -func (p *provisioner) Clean(ctx context.Context, stacks []runner.Stack) error { - var errs []error - for _, s := range stacks { - err := p.destroyDeployment(ctx, s) - if err != nil { - errs = append(errs, fmt.Errorf("failed to destroy stack %s (%s): %w", s.Version, s.ID, err)) - } - } - if len(errs) > 0 { - return errors.Join(errs...) +// Delete deletes a stack. 
+func (p *provisioner) Delete(ctx context.Context, stack runner.Stack) error { + deploymentID, err := p.getDeploymentID(stack) + if err != nil { + return err } - return nil + + // allow up to 1 minute for request + ctx, cancel := context.WithTimeout(ctx, 1*time.Minute) + defer cancel() + + p.logger.Logf("Destroying cloud stack %s [stack_id: %s, deployment_id: %s]", stack.Version, stack.ID, deploymentID) + return p.client.ShutdownDeployment(ctx, deploymentID) } func (p *provisioner) createDeployment(ctx context.Context, r runner.StackRequest, tags map[string]string) (*CreateDeploymentResponse, error) { ctx, cancel := context.WithTimeout(ctx, 1*time.Minute) defer cancel() - p.logger.Logf("Creating stack %s (%s)", r.Version, r.ID) + p.logger.Logf("Creating cloud stack %s [stack_id: %s]", r.Version, r.ID) name := fmt.Sprintf("%s-%s", strings.Replace(p.cfg.Identifier, ".", "-", -1), r.ID) // prepare tags @@ -168,26 +154,21 @@ func (p *provisioner) createDeployment(ctx context.Context, r runner.StackReques p.logger.Logf("Failed to create ESS cloud %s: %s", r.Version, err) return nil, fmt.Errorf("failed to create ESS cloud for version %s: %w", r.Version, err) } - p.logger.Logf("Created stack %s (%s) [id: %s]", r.Version, r.ID, resp.ID) + p.logger.Logf("Created cloud stack %s [stack_id: %s, deployment_id: %s]", r.Version, r.ID, resp.ID) return resp, nil } -func (p *provisioner) destroyDeployment(ctx context.Context, s runner.Stack) error { - if s.Internal == nil { - return fmt.Errorf("missing internal information") +func (p *provisioner) getDeploymentID(stack runner.Stack) (string, error) { + if stack.Internal == nil { + return "", fmt.Errorf("missing internal information") } - deploymentIDRaw, ok := s.Internal["deployment_id"] + deploymentIDRaw, ok := stack.Internal["deployment_id"] if !ok { - return fmt.Errorf("missing internal deployment_id") + return "", fmt.Errorf("missing internal deployment_id") } deploymentID, ok := deploymentIDRaw.(string) if !ok { - return fmt.Errorf("internal deployment_id not a string") + return "", fmt.Errorf("internal deployment_id not a string") } - - ctx, cancel := context.WithTimeout(ctx, 1*time.Minute) - defer cancel() - - p.logger.Logf("Destroying stack %s (%s)", s.Version, s.ID) - return p.client.ShutdownDeployment(ctx, deploymentID) + return deploymentID, nil } diff --git a/pkg/testing/ess/serverless.go b/pkg/testing/ess/serverless.go index 817ee33f03b..df1129e8e97 100644 --- a/pkg/testing/ess/serverless.go +++ b/pkg/testing/ess/serverless.go @@ -17,7 +17,7 @@ import ( "github.com/elastic/elastic-agent/pkg/testing/runner" ) -var serverlessURL = "https://staging.found.no" +var serverlessURL = "https://cloud.elastic.co" // ServerlessClient is the handler the serverless ES instance type ServerlessClient struct { diff --git a/pkg/testing/ess/serverless_provision.go b/pkg/testing/ess/serverless_provision.go index 32ec8f8227b..c9656f628af 100644 --- a/pkg/testing/ess/serverless_provision.go +++ b/pkg/testing/ess/serverless_provision.go @@ -10,7 +10,7 @@ import ( "fmt" "io" "net/http" - "sync" + "time" "github.com/elastic/elastic-agent-libs/logp" "github.com/elastic/elastic-agent/pkg/testing/runner" @@ -18,17 +18,15 @@ import ( // ServerlessProvision contains type ServerlessProvision struct { - stacksMut sync.RWMutex - stacks map[string]stackhandlerData - cfg ProvisionerConfig - log runner.Logger + cfg ProvisionerConfig + log runner.Logger } type defaultLogger struct { wrapped *logp.Logger } -// / implements the runner.Logger interface +// Logf implements the 
runner.Logger interface func (log *defaultLogger) Logf(format string, args ...any) { if len(args) == 0 { @@ -38,12 +36,6 @@ func (log *defaultLogger) Logf(format string, args ...any) { } -// tracks the data that maps to a single serverless deployment -type stackhandlerData struct { - client *ServerlessClient - stackData runner.Stack -} - // ServerlessRegions is the JSON response from the serverless regions API endpoint type ServerlessRegions struct { CSP string `json:"csp"` @@ -55,9 +47,8 @@ type ServerlessRegions struct { // NewServerlessProvisioner creates a new StackProvisioner instance for serverless func NewServerlessProvisioner(cfg ProvisionerConfig) (runner.StackProvisioner, error) { prov := &ServerlessProvision{ - cfg: cfg, - stacks: map[string]stackhandlerData{}, - log: &defaultLogger{wrapped: logp.L()}, + cfg: cfg, + log: &defaultLogger{wrapped: logp.L()}, } err := prov.CheckCloudRegion() if err != nil { @@ -71,114 +62,118 @@ func (prov *ServerlessProvision) SetLogger(l runner.Logger) { prov.log = l } -// Provision a new set of serverless instances -func (prov *ServerlessProvision) Provision(ctx context.Context, requests []runner.StackRequest) ([]runner.Stack, error) { - upWaiter := sync.WaitGroup{} - depErrs := make(chan error, len(requests)) - depUp := make(chan bool, len(requests)) - stacks := []runner.Stack{} - for _, req := range requests { - client := NewServerlessClient(prov.cfg.Region, "observability", prov.cfg.APIKey, prov.log) - srvReq := ServerlessRequest{Name: req.ID, RegionID: prov.cfg.Region} - proj, err := client.DeployStack(ctx, srvReq) - if err != nil { - return nil, fmt.Errorf("error deploying stack for request %s: %w", req.ID, err) - } - err = client.WaitForEndpoints(ctx) - if err != nil { - return nil, fmt.Errorf("error waiting for endpoints to become available for request: %w", err) - } - newStack := runner.Stack{ - ID: req.ID, - Version: req.Version, - Elasticsearch: client.proj.Endpoints.Elasticsearch, - Kibana: client.proj.Endpoints.Kibana, - Username: client.proj.Credentials.Username, - Password: client.proj.Credentials.Password, - Internal: map[string]interface{}{ - "deployment_id": proj.ID, - "deployment_type": proj.Type, - }, - } - stacks = append(stacks, newStack) - prov.stacksMut.Lock() - prov.stacks[req.ID] = stackhandlerData{client: client, stackData: newStack} - prov.stacksMut.Unlock() +// Create creates a stack. 
+func (prov *ServerlessProvision) Create(ctx context.Context, request runner.StackRequest) (runner.Stack, error) { + // allow up to 4 minutes for requests + createCtx, createCancel := context.WithTimeout(ctx, 4*time.Minute) + defer createCancel() - upWaiter.Add(1) - go func() { - isUp, err := client.DeploymentIsReady(ctx) - if err != nil { - depErrs <- err + client := NewServerlessClient(prov.cfg.Region, "observability", prov.cfg.APIKey, prov.log) + srvReq := ServerlessRequest{Name: request.ID, RegionID: prov.cfg.Region} - } - depUp <- isUp - }() + prov.log.Logf("Creating serverless stack %s [stack_id: %s]", request.Version, request.ID) + proj, err := client.DeployStack(createCtx, srvReq) + if err != nil { + return runner.Stack{}, fmt.Errorf("error deploying stack for request %s: %w", request.ID, err) + } + err = client.WaitForEndpoints(createCtx) + if err != nil { + return runner.Stack{}, fmt.Errorf("error waiting for endpoints to become available for serverless stack %s [stack_id: %s, deployment_id: %s]: %w", request.Version, request.ID, proj.ID, err) + } + stack := runner.Stack{ + ID: request.ID, + Version: request.Version, + Elasticsearch: client.proj.Endpoints.Elasticsearch, + Kibana: client.proj.Endpoints.Kibana, + Username: client.proj.Credentials.Username, + Password: client.proj.Credentials.Password, + Internal: map[string]interface{}{ + "deployment_id": proj.ID, + "deployment_type": proj.Type, + }, + Ready: false, } + prov.log.Logf("Created serverless stack %s [stack_id: %s, deployment_id: %s]", request.Version, request.ID, proj.ID) + return stack, nil +} + +// WaitForReady should block until the stack is ready or the context is cancelled. +func (prov *ServerlessProvision) WaitForReady(ctx context.Context, stack runner.Stack) (runner.Stack, error) { + deploymentID, deploymentType, err := prov.getDeploymentInfo(stack) + if err != nil { + return stack, fmt.Errorf("failed to get deployment info from the stack: %w", err) + } + + ctx, cancel := context.WithTimeout(ctx, 10*time.Minute) + defer cancel() + + client := NewServerlessClient(prov.cfg.Region, "observability", prov.cfg.APIKey, prov.log) + client.proj.ID = deploymentID + client.proj.Type = deploymentType + client.proj.Region = prov.cfg.Region + client.proj.Endpoints.Elasticsearch = stack.Elasticsearch + client.proj.Endpoints.Kibana = stack.Kibana + client.proj.Credentials.Username = stack.Username + client.proj.Credentials.Password = stack.Password + + prov.log.Logf("Waiting for serverless stack %s to be ready [stack_id: %s, deployment_id: %s]", stack.Version, stack.ID, deploymentID) + + errCh := make(chan error) + var lastErr error + + ticker := time.NewTicker(30 * time.Second) + defer ticker.Stop() - gotUp := 0 for { select { case <-ctx.Done(): - return nil, ctx.Err() - case err := <-depErrs: - return nil, fmt.Errorf("error waiting for stacks to become available: %w", err) - case isUp := <-depUp: - if isUp { - gotUp++ + if lastErr == nil { + lastErr = ctx.Err() } - if gotUp >= len(requests) { - return stacks, nil + return stack, fmt.Errorf("serverless stack %s [stack_id: %s, deployment_id: %s] never became ready: %w", stack.Version, stack.ID, deploymentID, lastErr) + case <-ticker.C: + go func() { + statusCtx, statusCancel := context.WithTimeout(ctx, 30*time.Second) + defer statusCancel() + ready, err := client.DeploymentIsReady(statusCtx) + if err != nil { + errCh <- err + } else if !ready { + errCh <- fmt.Errorf("serverless stack %s [stack_id: %s, deployment_id: %s] never became ready", stack.Version, stack.ID, 
deploymentID) + } else { + errCh <- nil + } + }() + case err := <-errCh: + if err == nil { + stack.Ready = true + return stack, nil } + lastErr = err } } - } -// Clean shuts down and removes the deployments -func (prov *ServerlessProvision) Clean(ctx context.Context, stacks []runner.Stack) error { - for _, stack := range stacks { - prov.stacksMut.RLock() - // because of the way the provisioner initializes, - // we can't guarantee that we have a valid client/stack setup, as we might have just re-initialized from a file. - // If that's the case, create a new client - stackRef, ok := prov.stacks[stack.ID] - prov.stacksMut.RUnlock() - // we can't reference the client, it won't be created when we just run mage:clean - // instead, grab the project ID from `stacks`, create a new client - if ok { - err := stackRef.client.DeleteDeployment() - if err != nil { - prov.log.Logf("error removing deployment: %w", err) - } - } else { - // create a new client - client := NewServerlessClient(prov.cfg.Region, "observability", prov.cfg.APIKey, prov.log) - dep_id, ok := stack.Internal["deployment_id"] - if !ok { - return fmt.Errorf("could not find deployment_id for serverless") - } - dep_id_str, ok := dep_id.(string) - if !ok { - return fmt.Errorf("deployment_id is not a string: %v", dep_id) - } - client.proj.ID = dep_id_str - - dep_type, ok := stack.Internal["deployment_type"] - if !ok { - return fmt.Errorf("could not find deployment_type in stack for serverless") - } - dep_type_str, ok := dep_type.(string) - if !ok { - return fmt.Errorf("deployment_type is not a string: %v", dep_id_str) - } - client.proj.Type = dep_type_str - err := client.DeleteDeployment() - if err != nil { - return fmt.Errorf("error removing deployment after re-creating client: %w", err) - } +// Delete deletes a stack. 
+func (prov *ServerlessProvision) Delete(ctx context.Context, stack runner.Stack) error { + deploymentID, deploymentType, err := prov.getDeploymentInfo(stack) + if err != nil { + return fmt.Errorf("failed to get deployment info from the stack: %w", err) + } - } + client := NewServerlessClient(prov.cfg.Region, "observability", prov.cfg.APIKey, prov.log) + client.proj.ID = deploymentID + client.proj.Type = deploymentType + client.proj.Region = prov.cfg.Region + client.proj.Endpoints.Elasticsearch = stack.Elasticsearch + client.proj.Endpoints.Kibana = stack.Kibana + client.proj.Credentials.Username = stack.Username + client.proj.Credentials.Password = stack.Password + + prov.log.Logf("Destroying serverless stack %s [stack_id: %s, deployment_id: %s]", stack.Version, stack.ID, deploymentID) + err = client.DeleteDeployment() + if err != nil { + return fmt.Errorf("error removing serverless stack %s [stack_id: %s, deployment_id: %s]: %w", stack.Version, stack.ID, deploymentID, err) } return nil } @@ -235,3 +230,26 @@ func (prov *ServerlessProvision) CheckCloudRegion() error { return nil } + +func (prov *ServerlessProvision) getDeploymentInfo(stack runner.Stack) (string, string, error) { + if stack.Internal == nil { + return "", "", fmt.Errorf("missing internal information") + } + deploymentIDRaw, ok := stack.Internal["deployment_id"] + if !ok { + return "", "", fmt.Errorf("missing internal deployment_id") + } + deploymentID, ok := deploymentIDRaw.(string) + if !ok { + return "", "", fmt.Errorf("internal deployment_id not a string") + } + deploymentTypeRaw, ok := stack.Internal["deployment_type"] + if !ok { + return "", "", fmt.Errorf("missing internal deployment_type") + } + deploymentType, ok := deploymentTypeRaw.(string) + if !ok { + return "", "", fmt.Errorf("internal deployment_type is not a string") + } + return deploymentID, deploymentType, nil +} diff --git a/pkg/testing/ess/serverless_test.go b/pkg/testing/ess/serverless_test.go index 7f69e49819f..2fc8e0075b1 100644 --- a/pkg/testing/ess/serverless_test.go +++ b/pkg/testing/ess/serverless_test.go @@ -26,9 +26,8 @@ func TestProvisionGetRegions(t *testing.T) { cfg := ProvisionerConfig{Region: "bad-region-ID", APIKey: key} prov := &ServerlessProvision{ - cfg: cfg, - stacks: map[string]stackhandlerData{}, - log: &defaultLogger{wrapped: logp.L()}, + cfg: cfg, + log: &defaultLogger{wrapped: logp.L()}, } err = prov.CheckCloudRegion() require.NoError(t, err) @@ -48,27 +47,23 @@ func TestStackProvisioner(t *testing.T) { cfg := ProvisionerConfig{Region: "aws-eu-west-1", APIKey: key} provClient, err := NewServerlessProvisioner(cfg) require.NoError(t, err) - stacks := []runner.StackRequest{ - {ID: "stack-test-one", Version: "8.9.0"}, - {ID: "stack-test-two", Version: "8.9.0"}, - } + request := runner.StackRequest{ID: "stack-test-one", Version: "8.9.0"} ctx, cancel := context.WithTimeout(context.Background(), time.Minute*5) defer cancel() - res, err := provClient.Provision(ctx, stacks) + stack, err := provClient.Create(ctx, request) require.NoError(t, err) t.Logf("got results:") - for _, stack := range res { - t.Logf("stack: %#v", stack) - require.NotEmpty(t, stack.Elasticsearch) - require.NotEmpty(t, stack.Kibana) - require.NotEmpty(t, stack.Password) - require.NotEmpty(t, stack.Username) - } + t.Logf("stack: %#v", stack) + require.NotEmpty(t, stack.Elasticsearch) + require.NotEmpty(t, stack.Kibana) + require.NotEmpty(t, stack.Password) + require.NotEmpty(t, stack.Username) + stack, err = provClient.WaitForReady(ctx, stack) + require.NoError(t, err) 
t.Logf("tearing down...") - err = provClient.Clean(ctx, res) + err = provClient.Delete(ctx, stack) require.NoError(t, err) - } func TestStartServerless(t *testing.T) { diff --git a/pkg/testing/runner/provisioner.go b/pkg/testing/runner/provisioner.go index fb474e2f307..2708b0d204d 100644 --- a/pkg/testing/runner/provisioner.go +++ b/pkg/testing/runner/provisioner.go @@ -57,6 +57,9 @@ type Stack struct { // Version is the version of the stack. Version string `yaml:"version"` + // Ready determines if the stack is ready to be used. + Ready bool `yaml:"ready"` + // Elasticsearch is the URL to communicate with elasticsearch. Elasticsearch string `yaml:"elasticsearch"` @@ -89,11 +92,12 @@ type StackProvisioner interface { // SetLogger sets the logger for it to use. SetLogger(l Logger) - // Provision brings up the stacks - // - // The provision should re-use already prepared stacks when possible. - Provision(ctx context.Context, requests []StackRequest) ([]Stack, error) + // Create creates a stack. + Create(ctx context.Context, request StackRequest) (Stack, error) - // Clean cleans up all provisioned resources. - Clean(ctx context.Context, stacks []Stack) error + // WaitForReady should block until the stack is ready or the context is cancelled. + WaitForReady(ctx context.Context, stack Stack) (Stack, error) + + // Delete deletes the stack. + Delete(ctx context.Context, stack Stack) error } diff --git a/pkg/testing/runner/runner.go b/pkg/testing/runner/runner.go index 0541fa785ec..a2c77f77aa0 100644 --- a/pkg/testing/runner/runner.go +++ b/pkg/testing/runner/runner.go @@ -128,10 +128,11 @@ type Runner struct { ip InstanceProvisioner sp StackProvisioner - batches []OSBatch - batchToStack map[string]Stack - stacksReady sync.WaitGroup - stacksErr error + batches []OSBatch + + batchToStack map[string]stackRes + batchToStackCh map[string]chan stackRes + batchToStackMx sync.Mutex stateMx sync.Mutex state State @@ -172,12 +173,13 @@ func NewRunner(cfg Config, ip InstanceProvisioner, sp StackProvisioner, batches osBatches = filterSupportedOS(osBatches, ip) r := &Runner{ - cfg: cfg, - logger: logger, - ip: ip, - sp: sp, - batches: osBatches, - batchToStack: make(map[string]Stack), + cfg: cfg, + logger: logger, + ip: ip, + sp: sp, + batches: osBatches, + batchToStack: make(map[string]stackRes), + batchToStackCh: make(map[string]chan stackRes), } err = r.loadState() @@ -274,11 +276,15 @@ func (r *Runner) Clean() error { defer cancel() return r.ip.Clean(ctx, r.cfg, instances) }) - g.Go(func() error { - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) - defer cancel() - return r.sp.Clean(ctx, stacks) - }) + for _, stack := range stacks { + g.Go(func(stack Stack) func() error { + return func() error { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + defer cancel() + return r.sp.Delete(ctx, stack) + } + }(stack)) + } return g.Wait() } @@ -366,14 +372,10 @@ func (r *Runner) runInstance(ctx context.Context, sshAuth ssh.AuthMethod, logger // ensure that we have all the requirements for the stack if required if batch.Batch.Stack != nil { // wait for the stack to be ready before continuing - logger.Logf("Waiting for stacks to be ready...") - r.stacksReady.Wait() - if r.stacksErr != nil { - return OSRunnerResult{}, fmt.Errorf("%s unable to continue because stack never became ready: %w", instance.Name, r.stacksErr) - } - stack, ok := r.getStackForBatchID(batch.ID) - if !ok { - return OSRunnerResult{}, fmt.Errorf("failed to find stack for batch %s", batch.ID) + 
logger.Logf("Waiting for stack to be ready...") + stack, err := r.getStackForBatchID(batch.ID) + if err != nil { + return OSRunnerResult{}, err } env["ELASTICSEARCH_HOST"] = stack.Elasticsearch env["ELASTICSEARCH_USERNAME"] = stack.Username @@ -574,9 +576,6 @@ func (r *Runner) createRepoArchive(ctx context.Context, repoDir string, dir stri // startStacks starts the stacks required for the tests to run func (r *Runner) startStacks(ctx context.Context) error { - // stacks never start ready - r.stacksReady.Add(1) - var versions []string batchToVersion := make(map[string]string) for _, lb := range r.batches { @@ -592,59 +591,115 @@ func (r *Runner) startStacks(ctx context.Context) error { } } - var requests []StackRequest + var requests []stackReq for _, version := range versions { id := strings.Replace(version, ".", "", -1) stack, ok := r.findStack(id) if ok { - r.logger.Logf("Reusing stack %s (%s)", version, id) - for batchID, batchVersion := range batchToVersion { - if batchVersion == version { - r.batchToStack[batchID] = stack - } - } + requests = append(requests, stackReq{ + request: StackRequest{ + ID: id, + Version: version, + }, + stack: &stack, + }) } else { - requests = append(requests, StackRequest{ - ID: id, - Version: version, + requests = append(requests, stackReq{ + request: StackRequest{ + ID: id, + Version: version, + }, }) } } - if len(requests) == 0 { - // no need to request any other stacks - r.stacksReady.Done() - return nil + + reportResult := func(version string, stack Stack, err error) { + r.batchToStackMx.Lock() + defer r.batchToStackMx.Unlock() + res := stackRes{ + stack: stack, + err: err, + } + for batchID, batchVersion := range batchToVersion { + if batchVersion == version { + r.batchToStack[batchID] = res + ch, ok := r.batchToStackCh[batchID] + if ok { + ch <- res + } + } + } } - // start go routine to provision the needed stacks - go func(ctx context.Context) { - defer r.stacksReady.Done() + // start goroutines to provision the needed stacks + for _, request := range requests { + go func(ctx context.Context, req stackReq) { + var err error + var stack Stack + if req.stack != nil { + stack = *req.stack + } else { + stack, err = r.sp.Create(ctx, req.request) + if err != nil { + reportResult(req.request.Version, stack, err) + return + } + err = r.addOrUpdateStack(stack) + if err != nil { + reportResult(stack.Version, stack, err) + return + } + } + + if stack.Ready { + reportResult(stack.Version, stack, nil) + return + } - stacks, err := r.sp.Provision(ctx, requests) - if err != nil { - r.stacksErr = err - return - } - for _, stack := range stacks { - err := r.addOrUpdateStack(stack) + stack, err = r.sp.WaitForReady(ctx, stack) if err != nil { - r.stacksErr = err + reportResult(stack.Version, stack, err) return } - for batchID, batchVersion := range batchToVersion { - if batchVersion == stack.Version { - r.batchToStack[batchID] = stack - } + + err = r.addOrUpdateStack(stack) + if err != nil { + reportResult(stack.Version, stack, err) + return } - } - }(ctx) + + reportResult(stack.Version, stack, nil) + }(ctx, request) + } return nil } -func (r *Runner) getStackForBatchID(id string) (Stack, bool) { - stack, ok := r.batchToStack[id] - return stack, ok +func (r *Runner) getStackForBatchID(id string) (Stack, error) { + r.batchToStackMx.Lock() + res, ok := r.batchToStack[id] + if ok { + r.batchToStackMx.Unlock() + return res.stack, res.err + } + _, ok = r.batchToStackCh[id] + if ok { + return Stack{}, fmt.Errorf("getStackForBatchID called twice; this is not allowed") + } 
+ ch := make(chan stackRes, 1) + r.batchToStackCh[id] = ch + r.batchToStackMx.Unlock() + + // 12 minutes is because the stack should have been ready after 10 minutes or returned an error + // this only exists to ensure that if that code is not blocking that this doesn't block forever + t := time.NewTimer(12 * time.Minute) + defer t.Stop() + select { + case <-t.C: + return Stack{}, fmt.Errorf("failed waiting for a response after 12 minutes") + case res = <-ch: + return res.stack, res.err + } } func (r *Runner) findInstance(id string) (StateInstance, bool) { @@ -986,3 +1041,13 @@ type batchLogger struct { func (b *batchLogger) Logf(format string, args ...any) { b.wrapped.Logf("(%s) %s", b.prefix, fmt.Sprintf(format, args...)) } + +type stackRes struct { + stack Stack + err error +} + +type stackReq struct { + request StackRequest + stack *Stack +} diff --git a/pkg/testing/runner/runner_test.go b/pkg/testing/runner/runner_test.go index d10b9d524d0..c46b3b53761 100644 --- a/pkg/testing/runner/runner_test.go +++ b/pkg/testing/runner/runner_test.go @@ -8,6 +8,7 @@ import ( "context" "os" "path/filepath" + "sync" "testing" "github.com/stretchr/testify/assert" @@ -86,7 +87,7 @@ func TestNewRunner_Clean(t *testing.T) { require.NoError(t, err) assert.ElementsMatch(t, ip.instances, []Instance{i1, i2}) - assert.ElementsMatch(t, sp.stacks, []Stack{s1, s2}) + assert.ElementsMatch(t, sp.deletedStacks, []Stack{s1, s2}) } type fakeInstanceProvisioner struct { @@ -123,31 +124,38 @@ func (f *fakeInstanceProvisioner) Clean(_ context.Context, _ Config, instances [ } type fakeStackProvisioner struct { - requests []StackRequest - stacks []Stack + mx sync.Mutex + requests []StackRequest + deletedStacks []Stack } func (f *fakeStackProvisioner) SetLogger(_ Logger) { } -func (f *fakeStackProvisioner) Provision(_ context.Context, requests []StackRequest) ([]Stack, error) { - f.requests = requests - var stacks []Stack - for _, req := range requests { - stacks = append(stacks, Stack{ - ID: req.ID, - Version: req.Version, - Elasticsearch: "http://localhost:9200", - Kibana: "http://localhost:5601", - Username: "elastic", - Password: "changeme", - Internal: nil, - }) - } - return stacks, nil +func (f *fakeStackProvisioner) Create(_ context.Context, request StackRequest) (Stack, error) { + f.mx.Lock() + defer f.mx.Unlock() + f.requests = append(f.requests, request) + return Stack{ + ID: request.ID, + Version: request.Version, + Elasticsearch: "http://localhost:9200", + Kibana: "http://localhost:5601", + Username: "elastic", + Password: "changeme", + Internal: nil, + Ready: false, + }, nil +} + +func (f *fakeStackProvisioner) WaitForReady(_ context.Context, stack Stack) (Stack, error) { + stack.Ready = true + return stack, nil } -func (f *fakeStackProvisioner) Clean(_ context.Context, stacks []Stack) error { - f.stacks = stacks +func (f *fakeStackProvisioner) Delete(_ context.Context, stack Stack) error { + f.mx.Lock() + defer f.mx.Unlock() + f.deletedStacks = append(f.deletedStacks, stack) return nil } From d1fffb3cf34bd9c619004fc92165afb351fed8d7 Mon Sep 17 00:00:00 2001 From: Anderson Queiroz Date: Tue, 14 Nov 2023 07:16:23 -0300 Subject: [PATCH 3/3] multipass: ensure instance does not exist (#3714) Before the mutipass provisioner tries to launch an instance, if it already exists, it tries to delete and purge the instance. 
--- pkg/testing/multipass/provisioner.go | 88 +++++++++++++++++++++- testing/integration/logs_ingestion_test.go | 18 +++-- 2 files changed, 96 insertions(+), 10 deletions(-) diff --git a/pkg/testing/multipass/provisioner.go b/pkg/testing/multipass/provisioner.go index 2be04b806fc..7703dc7e335 100644 --- a/pkg/testing/multipass/provisioner.go +++ b/pkg/testing/multipass/provisioner.go @@ -7,6 +7,7 @@ package multipass import ( "bytes" "context" + "encoding/json" "fmt" "os" "os/exec" @@ -122,6 +123,12 @@ func (p *provisioner) Clean(ctx context.Context, _ runner.Config, instances []ru // launch creates an instance. func (p *provisioner) launch(ctx context.Context, cfg runner.Config, batch runner.OSBatch) error { + // check if instance already exists + err := p.ensureInstanceNotExist(ctx, batch) + if err != nil { + p.logger.Logf( + "could not check if multipass instance %q exists, moving on anyway. Err: %v", batch.ID, err) + } args := []string{ "launch", "-c", "2", @@ -145,9 +152,14 @@ func (p *provisioner) launch(ctx context.Context, cfg runner.Config, batch runne return fmt.Errorf("failed to marshal cloud-init configuration: %w", err) } + p.logger.Logf("Launching multipass instance %s", batch.ID) var output bytes.Buffer - p.logger.Logf("Launching multipass image %s", batch.ID) - proc, err := process.Start("multipass", process.WithContext(ctx), process.WithArgs(args), process.WithCmdOptions(runner.AttachOut(&output), runner.AttachErr(&output))) + proc, err := process.Start("multipass", + process.WithContext(ctx), + process.WithArgs(args), + process.WithCmdOptions( + runner.AttachOut(&output), + runner.AttachErr(&output))) if err != nil { return fmt.Errorf("failed to run multipass launch: %w", err) } @@ -162,7 +174,7 @@ func (p *provisioner) launch(ctx context.Context, cfg runner.Config, batch runne } _ = proc.Stdin.Close() ps := <-proc.Wait() - if ps.ExitCode() != 0 { + if !ps.Success() { // print the output so its clear what went wrong fmt.Fprintf(os.Stdout, "%s\n", output.Bytes()) return fmt.Errorf("failed to run multipass launch: exited with code: %d", ps.ExitCode()) @@ -170,6 +182,76 @@ func (p *provisioner) launch(ctx context.Context, cfg runner.Config, batch runne return nil } +func (p *provisioner) ensureInstanceNotExist(ctx context.Context, batch runner.OSBatch) error { + var output bytes.Buffer + var stdErr bytes.Buffer + proc, err := process.Start("multipass", + process.WithContext(ctx), + process.WithArgs([]string{"list", "--format", "json"}), + process.WithCmdOptions( + runner.AttachOut(&output), + runner.AttachErr(&stdErr))) + if err != nil { + return fmt.Errorf("multipass list failed to run: %w", err) + } + + state := <-proc.Wait() + if !state.Success() { + msg := fmt.Sprintf("multipass list exited with non-zero status: %s", + state.String()) + p.logger.Logf(msg) + p.logger.Logf("output: %s", output.String()) + p.logger.Logf("stderr: %s", stdErr.String()) + return fmt.Errorf(msg) + } + list := struct { + List []struct { + Ipv4 []string `json:"ipv4"` + Name string `json:"name"` + Release string `json:"release"` + State string `json:"state"` + } `json:"list"` + }{} + err = json.NewDecoder(&output).Decode(&list) + if err != nil { + return fmt.Errorf("could not decode multipass list output: %w", err) + } + + for _, i := range list.List { + if i.Name == batch.ID { + p.logger.Logf("multipass trying to delete instance %s", batch.ID) + + output.Reset() + stdErr.Reset() + proc, err = process.Start("multipass", + process.WithContext(ctx), + process.WithArgs([]string{"delete", "--purge",
batch.ID}), + process.WithCmdOptions( + runner.AttachOut(&output), + runner.AttachErr(&stdErr))) + if err != nil { + return fmt.Errorf( + "multipass instance %q already exists, state %q. Could not delete it: %w", + batch.ID, i.State, err) + } + state = <-proc.Wait() + if !state.Success() { + msg := fmt.Sprintf("failed to delete and purge multipass instance %s: %s", + batch.ID, + state.String()) + p.logger.Logf(msg) + p.logger.Logf("output: %s", output.String()) + p.logger.Logf("stderr: %s", stdErr.String()) + return fmt.Errorf(msg) + } + + break + } + } + + return nil +} + // delete deletes an instance. func (p *provisioner) delete(ctx context.Context, instance runner.Instance) error { args := []string{ diff --git a/testing/integration/logs_ingestion_test.go b/testing/integration/logs_ingestion_test.go index ba9a84673b0..d9fb2f511a8 100644 --- a/testing/integration/logs_ingestion_test.go +++ b/testing/integration/logs_ingestion_test.go @@ -104,11 +104,15 @@ func testMonitoringLogsAreShipped( ) { // Stage 1: Make sure metricbeat logs are populated t.Log("Making sure metricbeat logs are populated") - docs := findESDocs(t, func() (estools.Documents, error) { - return estools.GetLogsForDataset(info.ESClient, "elastic_agent.metricbeat") - }) - t.Logf("metricbeat: Got %d documents", len(docs.Hits.Hits)) - require.NotZero(t, len(docs.Hits.Hits)) + require.Eventually(t, + func() bool { + docs := findESDocs(t, func() (estools.Documents, error) { + return estools.GetLogsForDataset(info.ESClient, "elastic_agent.metricbeat") + }) + return len(docs.Hits.Hits) > 0 + }, + 1*time.Minute, 500*time.Millisecond, + "there should be metricbeat logs by now") // Stage 2: make sure all components are healthy t.Log("Making sure all components are healthy") @@ -123,7 +127,7 @@ func testMonitoringLogsAreShipped( // Stage 3: Make sure there are no errors in logs t.Log("Making sure there are no error logs") - docs = findESDocs(t, func() (estools.Documents, error) { + docs := findESDocs(t, func() (estools.Documents, error) { return estools.CheckForErrorsInLogs(info.ESClient, info.Namespace, []string{ // acceptable error messages (include reason) "Error dialing dial tcp 127.0.0.1:9200: connect: connection refused", // beat is running default config before its config gets updated "elastic-agent-client error: rpc error: code = Canceled desc = context canceled", // can happen on restart }) }) - t.Logf("errors: Got %d documents", len(docs.Hits.Hits)) + t.Logf("error logs: Got %d documents", len(docs.Hits.Hits)) for _, doc := range docs.Hits.Hits { t.Logf("%#v", doc.Source) }
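
Editor's note (not part of the patches above): PATCH 2/3 replaces the old batch Provision/Clean contract on StackProvisioner with a per-stack Create, WaitForReady and Delete lifecycle. The sketch below shows how a caller might drive that lifecycle; it is illustrative only, and the provisionStack helper name is invented for the example, while StackProvisioner, StackRequest and Stack are the types from pkg/testing/runner as modified in the patch.

package example

import (
	"context"

	"github.com/elastic/elastic-agent/pkg/testing/runner"
)

// provisionStack is a hypothetical helper showing the intended order of calls:
// Create returns a stack with Ready set to false, WaitForReady blocks until the
// deployment is usable (the ESS and serverless provisioners cap this internally,
// roughly 10 minutes), and Delete tears the stack down again.
func provisionStack(ctx context.Context, sp runner.StackProvisioner, req runner.StackRequest) (runner.Stack, error) {
	stack, err := sp.Create(ctx, req)
	if err != nil {
		return runner.Stack{}, err
	}
	stack, err = sp.WaitForReady(ctx, stack)
	if err != nil {
		// best-effort cleanup so a half-created deployment is not leaked
		_ = sp.Delete(ctx, stack)
		return runner.Stack{}, err
	}
	return stack, nil // stack.Ready is now true
}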