diff --git a/.buildkite/scripts/steps/integration_tests.sh b/.buildkite/scripts/steps/integration_tests.sh index b747d73b5a8..7c2299d7066 100755 --- a/.buildkite/scripts/steps/integration_tests.sh +++ b/.buildkite/scripts/steps/integration_tests.sh @@ -4,7 +4,11 @@ set -euxo pipefail source .buildkite/scripts/common.sh # PACKAGE +<<<<<<< HEAD DEV=true EXTERNAL=true SNAPSHOT=true PLATFORMS=linux/amd64,linux/arm64 PACKAGES=tar.gz mage package +======= +AGENT_PACKAGE_VERSION="${OVERRIDE_AGENT_PACKAGE_VERSION}" DEV=true EXTERNAL=true SNAPSHOT=true PLATFORMS=linux/amd64,linux/arm64,windows/amd64 PACKAGES=tar.gz,zip mage package +>>>>>>> 35dbbdea9b (Add Windows support to integration testing runner (#2941)) # Run integration tests set +e diff --git a/Makefile b/Makefile index 030ff00a8cd..915e3808126 100644 --- a/Makefile +++ b/Makefile @@ -15,8 +15,9 @@ ifndef MAGE_PRESENT @echo Installing mage $(MAGE_VERSION). @go install ${MAGE_IMPORT_PATH}@$(MAGE_VERSION) @-mage -clean +else + @echo Mage $(MAGE_VERSION) already installed. endif - @true ## help : Show this help. diff --git a/internal/pkg/agent/cmd/watch.go b/internal/pkg/agent/cmd/watch.go index 83e31d38366..cf618cc3fe2 100644 --- a/internal/pkg/agent/cmd/watch.go +++ b/internal/pkg/agent/cmd/watch.go @@ -116,6 +116,9 @@ func watchCmd(log *logp.Logger, cfg *configuration.Configuration) error { // cleanup older versions, // in windows it might leave self untouched, this will get cleaned up // later at the start, because for windows we leave marker untouched. + // + // Why is this being skipped on Windows? The comment above is not clear. + // issue: https://github.com/elastic/elastic-agent/issues/3027 removeMarker := !isWindows() err = upgrade.Cleanup(log, marker.Hash, removeMarker, false) if err != nil { diff --git a/make.bat b/make.bat deleted file mode 100644 index 12cf4ea7327..00000000000 --- a/make.bat +++ /dev/null @@ -1,11 +0,0 @@ -@echo off - -REM Windows wrapper for Mage (https://magefile.org/) that installs it -REM to %GOPATH%\bin from the Beats vendor directory. -REM -REM After running this once you may invoke mage.exe directly. - -WHERE mage -IF %ERRORLEVEL% NEQ 0 go get github.com/magefile/mage - -mage %* diff --git a/pkg/testing/fixture.go b/pkg/testing/fixture.go index 112ad679c38..b38234fe26c 100644 --- a/pkg/testing/fixture.go +++ b/pkg/testing/fixture.go @@ -608,8 +608,11 @@ func (f *Fixture) prepareComponents(workDir string, components ...UsableComponen if err := copy.Copy(comp.BinaryPath, destBinary); err != nil { return fmt.Errorf("failed to copy %s to %s: %w", comp.BinaryPath, destBinary, err) } - if err := os.Chown(destBinary, os.Geteuid(), os.Getgid()); err != nil { - return fmt.Errorf("failed to chown %s: %w", destBinary, err) + if runtime.GOOS != "windows" { + // chown is not supported on Windows + if err := os.Chown(destBinary, os.Geteuid(), os.Getgid()); err != nil { + return fmt.Errorf("failed to chown %s: %w", destBinary, err) + } } if err := os.Chmod(destBinary, 0755); err != nil { return fmt.Errorf("failed to chmod %s: %w", destBinary, err) diff --git a/pkg/testing/fixture_install.go b/pkg/testing/fixture_install.go index 1740a8dae3e..b714e86d7b9 100644 --- a/pkg/testing/fixture_install.go +++ b/pkg/testing/fixture_install.go @@ -11,6 +11,7 @@ import ( "io/fs" "os" "path/filepath" + "runtime" "strconv" "strings" "time" @@ -145,6 +146,29 @@ func (f *Fixture) Install(ctx context.Context, installOpts *InstallOpts, opts .. } require.NoErrorf(f.t, err, "uninstalling agent failed. Output: %q", out) } +<<<<<<< HEAD +======= + + // 5 minute timeout, to ensure that it at least doesn't get stuck. + // original context is not used as it could have a timeout on the context + // for the install and we don't want that context to prevent the uninstall + uninstallCtx, uninstallCancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer uninstallCancel() + out, err := f.Uninstall(uninstallCtx, &UninstallOpts{Force: true, UninstallToken: f.uninstallToken}) + f.setClient(nil) + if err != nil && + (errors.Is(err, ErrNotInstalled) || + strings.Contains( + err.Error(), + "elastic-agent: no such file or directory")) { + f.t.Logf("fixture.Install Cleanup: agent was already uninstalled, skipping uninstall") + // Agent fixture has already been uninstalled, perhaps by + // an explicit call to fixture.Uninstall, so nothing needs + // to be done here. + return + } + require.NoErrorf(f.t, err, "uninstalling agent failed. Output: %q", out) +>>>>>>> 35dbbdea9b (Add Windows support to integration testing runner (#2941)) }) return out, nil @@ -177,6 +201,7 @@ func (f *Fixture) Uninstall(ctx context.Context, uninstallOpts *UninstallOpts, o if err != nil { return out, fmt.Errorf("error running uninstall command: %w", err) } + f.installed = false // Check that Elastic Agent files are actually removed basePath := f.installOpts.BasePath @@ -216,11 +241,46 @@ func (f *Fixture) collectDiagnostics() { f.t.Logf("failed to collect diagnostics; failed to create %s: %s", diagPath, err) return } +<<<<<<< HEAD outputPath := filepath.Join(diagPath, fmt.Sprintf("%s-diagnostics-%s.zip", f.t.Name(), time.Now().Format(time.RFC3339))) +======= + + stamp := time.Now().Format(time.RFC3339) + if runtime.GOOS == "windows" { + // on Windows a filename cannot contain a ':' as this collides with disk labels (aka. C:\) + stamp = strings.ReplaceAll(stamp, ":", "-") + } + + // Sub-test names are separated by "/" characters which are not valid filenames on Linux. + sanitizedTestName := strings.ReplaceAll(f.t.Name(), "/", "-") + outputPath := filepath.Join(diagPath, fmt.Sprintf("%s-diagnostics-%s.zip", sanitizedTestName, stamp)) +>>>>>>> 35dbbdea9b (Add Windows support to integration testing runner (#2941)) output, err := f.Exec(ctx, []string{"diagnostics", "-f", outputPath}) if err != nil { f.t.Logf("failed to collect diagnostics to %s (%s): %s", outputPath, err, output) +<<<<<<< HEAD +======= + + // possible that the test was so fast that the Elastic Agent was just installed, the control protocol is + // not fully running yet. wait 15 seconds to try again, ensuring that best effort is performed in fetching + // diagnostics + if strings.Contains(string(output), "connection error") { + f.t.Logf("retrying in 15 seconds due to connection error; possible Elastic Agent was not fully started") + time.Sleep(15 * time.Second) + output, err = f.Exec(ctx, []string{"diagnostics", "-f", outputPath}) + f.t.Logf("failed to collect diagnostics a second time at %s (%s): %s", outputPath, err, output) + } + if err != nil { + // If collecting diagnostics fails, zip up the entire installation directory with the hope that it will contain logs. + f.t.Logf("creating zip archive of the installation directory: %s", f.workDir) + zipPath := filepath.Join(diagPath, fmt.Sprintf("%s-install-directory-%s.zip", sanitizedTestName, time.Now().Format(time.RFC3339))) + err = f.archiveInstallDirectory(f.workDir, zipPath) + if err != nil { + f.t.Logf("failed to zip install directory to %s: %s", zipPath, err) + } + } +>>>>>>> 35dbbdea9b (Add Windows support to integration testing runner (#2941)) } } diff --git a/pkg/testing/ogc/provisioner.go b/pkg/testing/ogc/provisioner.go index 152285d27c0..4bcc096e2a6 100644 --- a/pkg/testing/ogc/provisioner.go +++ b/pkg/testing/ogc/provisioner.go @@ -284,7 +284,11 @@ func osBatchToOGC(cacheDir string, batch runner.OSBatch) Layout { LayoutIntegrationTag, batch.OS.Type, batch.OS.Arch, - strings.ToLower(fmt.Sprintf("%s-%s", batch.OS.Distro, strings.Replace(batch.OS.Version, ".", "-", -1))), + } + if batch.OS.Type == define.Linux { + tags = append(tags, strings.ToLower(fmt.Sprintf("%s-%s", batch.OS.Distro, strings.Replace(batch.OS.Version, ".", "-", -1)))) + } else { + tags = append(tags, strings.ToLower(fmt.Sprintf("%s-%s", batch.OS.Type, strings.Replace(batch.OS.Version, ".", "-", -1)))) } if batch.Batch.Isolate { tags = append(tags, "isolate") diff --git a/pkg/testing/ogc/supported.go b/pkg/testing/ogc/supported.go index 6a70ab3b9de..34ae7d86c89 100644 --- a/pkg/testing/ogc/supported.go +++ b/pkg/testing/ogc/supported.go @@ -9,6 +9,11 @@ import ( "github.com/elastic/elastic-agent/pkg/testing/runner" ) +const ( + // Google is for the Google Cloud Platform (GCP) + Google = "google" +) + // ogcSupported defines the set of supported OS's the OGC provisioner currently supports. // // In the case that a batch is not specific on the version and/or distro the first @@ -22,7 +27,7 @@ var ogcSupported = []LayoutOS{ Distro: runner.Ubuntu, Version: "22.04", }, - Provider: runner.Google, + Provider: Google, InstanceSize: "e2-standard-2", // 2 amd64 cpus RunsOn: "ubuntu-2204-lts", Username: "ubuntu", @@ -35,7 +40,7 @@ var ogcSupported = []LayoutOS{ Distro: runner.Ubuntu, Version: "20.04", }, - Provider: runner.Google, + Provider: Google, InstanceSize: "e2-standard-2", // 2 amd64 cpus RunsOn: "ubuntu-2004-lts", Username: "ubuntu", @@ -48,7 +53,7 @@ var ogcSupported = []LayoutOS{ Distro: runner.Ubuntu, Version: "22.04", }, - Provider: runner.Google, + Provider: Google, InstanceSize: "t2a-standard-2", // 2 arm64 cpus RunsOn: "ubuntu-2204-lts-arm64", Username: "ubuntu", @@ -61,10 +66,82 @@ var ogcSupported = []LayoutOS{ Distro: runner.Ubuntu, Version: "20.04", }, - Provider: runner.Google, + Provider: Google, InstanceSize: "t2a-standard-2", // 2 arm64 cpus RunsOn: "ubuntu-2004-lts-arm64", Username: "ubuntu", RemotePath: "/home/ubuntu/agent", }, + { + OS: define.OS{ + Type: define.Windows, + Arch: define.AMD64, + Version: "2022", + }, + Provider: Google, + InstanceSize: "e2-standard-4", // 4 amd64 cpus + RunsOn: "windows-2022", + Username: "windows", + RemotePath: "C:\\Users\\windows\\agent", + }, + { + OS: define.OS{ + Type: define.Windows, + Arch: define.AMD64, + Version: "2022-core", + }, + Provider: Google, + InstanceSize: "e2-standard-4", // 4 amd64 cpus + RunsOn: "windows-2022-core", + Username: "windows", + RemotePath: "C:\\Users\\windows\\agent", + }, + { + OS: define.OS{ + Type: define.Windows, + Arch: define.AMD64, + Version: "2019", + }, + Provider: Google, + InstanceSize: "e2-standard-4", // 4 amd64 cpus + RunsOn: "windows-2019", + Username: "windows", + RemotePath: "C:\\Users\\windows\\agent", + }, + { + OS: define.OS{ + Type: define.Windows, + Arch: define.AMD64, + Version: "2019-core", + }, + Provider: Google, + InstanceSize: "e2-standard-4", // 4 amd64 cpus + RunsOn: "windows-2019-core", + Username: "windows", + RemotePath: "C:\\Users\\windows\\agent", + }, + { + OS: define.OS{ + Type: define.Windows, + Arch: define.AMD64, + Version: "2016", + }, + Provider: Google, + InstanceSize: "e2-standard-4", // 4 amd64 cpus + RunsOn: "windows-2016", + Username: "windows", + RemotePath: "C:\\Users\\windows\\agent", + }, + { + OS: define.OS{ + Type: define.Windows, + Arch: define.AMD64, + Version: "2016-core", + }, + Provider: Google, + InstanceSize: "e2-standard-4", // 4 amd64 cpus + RunsOn: "windows-2016-core", + Username: "windows", + RemotePath: "C:\\Users\\windows\\agent", + }, } diff --git a/pkg/testing/runner/debian.go b/pkg/testing/runner/debian.go index c85611835c4..5e53516b9ff 100644 --- a/pkg/testing/runner/debian.go +++ b/pkg/testing/runner/debian.go @@ -13,8 +13,6 @@ import ( "strings" "time" - "golang.org/x/crypto/ssh" - "github.com/elastic/elastic-agent/pkg/testing/define" ) @@ -22,7 +20,7 @@ import ( type DebianRunner struct{} // Prepare the test -func (DebianRunner) Prepare(ctx context.Context, sshClient *ssh.Client, logger Logger, arch string, goVersion string) error { +func (DebianRunner) Prepare(ctx context.Context, sshClient SSHClient, logger Logger, arch string, goVersion string) error { // prepare build-essential and unzip // // apt-get update and install are so terrible that we have to place this in a loop, because in some cases the @@ -35,7 +33,7 @@ func (DebianRunner) Prepare(ctx context.Context, sshClient *ssh.Client, logger L logger.Logf("Running apt-get update") // `-o APT::Update::Error-Mode=any` ensures that any warning is tried as an error, so the retry // will occur (without this we get random failures) - stdOut, errOut, err := sshRunCommandWithRetry(updateCtx, sshClient, "sudo", []string{"apt-get", "update", "-o APT::Update::Error-Mode=any"}, 15*time.Second) + stdOut, errOut, err := sshClient.ExecWithRetry(updateCtx, "sudo", []string{"apt-get", "update", "-o APT::Update::Error-Mode=any"}, 15*time.Second) if err != nil { return fmt.Errorf("failed to run apt-get update: %w (stdout: %s, stderr: %s)", err, stdOut, errOut) } @@ -45,7 +43,7 @@ func (DebianRunner) Prepare(ctx context.Context, sshClient *ssh.Client, logger L installCtx, installCancel := context.WithTimeout(ctx, 1*time.Minute) defer installCancel() logger.Logf("Install build-essential and unzip") - stdOut, errOut, err = sshRunCommandWithRetry(installCtx, sshClient, "sudo", []string{"apt-get", "install", "-y", "build-essential", "unzip"}, 5*time.Second) + stdOut, errOut, err = sshClient.ExecWithRetry(installCtx, "sudo", []string{"apt-get", "install", "-y", "build-essential", "unzip"}, 5*time.Second) if err != nil { return fmt.Errorf("failed to install build-essential and unzip: %w (stdout: %s, stderr: %s)", err, stdOut, errOut) } @@ -68,19 +66,19 @@ func (DebianRunner) Prepare(ctx context.Context, sshClient *ssh.Client, logger L logger.Logf("Install golang %s (%s)", goVersion, arch) downloadURL := fmt.Sprintf("https://go.dev/dl/go%s.linux-%s.tar.gz", goVersion, arch) filename := path.Base(downloadURL) - stdOut, errOut, err := sshRunCommand(ctx, sshClient, "curl", []string{"-Ls", downloadURL, "--output", filename}, nil) + stdOut, errOut, err := sshClient.Exec(ctx, "curl", []string{"-Ls", downloadURL, "--output", filename}, nil) if err != nil { return fmt.Errorf("failed to download go from %s with curl: %w (stdout: %s, stderr: %s)", downloadURL, err, stdOut, errOut) } - stdOut, errOut, err = sshRunCommand(ctx, sshClient, "sudo", []string{"tar", "-C", "/usr/local", "-xzf", filename}, nil) + stdOut, errOut, err = sshClient.Exec(ctx, "sudo", []string{"tar", "-C", "/usr/local", "-xzf", filename}, nil) if err != nil { return fmt.Errorf("failed to extract go to /usr/local with tar: %w (stdout: %s, stderr: %s)", err, stdOut, errOut) } - stdOut, errOut, err = sshRunCommand(ctx, sshClient, "sudo", []string{"ln", "-s", "/usr/local/go/bin/go", "/usr/bin/go"}, nil) + stdOut, errOut, err = sshClient.Exec(ctx, "sudo", []string{"ln", "-s", "/usr/local/go/bin/go", "/usr/bin/go"}, nil) if err != nil { return fmt.Errorf("failed to symlink /usr/local/go/bin/go to /usr/bin/go: %w (stdout: %s, stderr: %s)", err, stdOut, errOut) } - stdOut, errOut, err = sshRunCommand(ctx, sshClient, "sudo", []string{"ln", "-s", "/usr/local/go/bin/gofmt", "/usr/bin/gofmt"}, nil) + stdOut, errOut, err = sshClient.Exec(ctx, "sudo", []string{"ln", "-s", "/usr/local/go/bin/gofmt", "/usr/bin/gofmt"}, nil) if err != nil { return fmt.Errorf("failed to symlink /usr/local/go/bin/gofmt to /usr/bin/gofmt: %w (stdout: %s, stderr: %s)", err, stdOut, errOut) } @@ -89,25 +87,25 @@ func (DebianRunner) Prepare(ctx context.Context, sshClient *ssh.Client, logger L } // Copy places the required files on the host. -func (DebianRunner) Copy(ctx context.Context, sshClient *ssh.Client, logger Logger, repoArchive string, build Build) error { +func (DebianRunner) Copy(ctx context.Context, sshClient SSHClient, logger Logger, repoArchive string, build Build) error { // copy the archive and extract it on the host logger.Logf("Copying repo") destRepoName := filepath.Base(repoArchive) - err := sshSCP(sshClient, repoArchive, destRepoName) + err := sshClient.Copy(repoArchive, destRepoName) if err != nil { return fmt.Errorf("failed to SCP repo archive %s: %w", repoArchive, err) } // ensure that agent directory is removed (possible it already exists if instance already used) - stdout, stderr, err := sshRunCommand(ctx, - sshClient, "sudo", []string{"rm", "-rf", "agent"}, nil) + stdout, stderr, err := sshClient.Exec(ctx, + "sudo", []string{"rm", "-rf", "agent"}, nil) if err != nil { return fmt.Errorf( "failed to remove agent directory before unziping new one: %w. stdout: %q, stderr: %q", err, stdout, stderr) } - stdOut, errOut, err := sshRunCommand(ctx, sshClient, "unzip", []string{destRepoName, "-d", "agent"}, nil) + stdOut, errOut, err := sshClient.Exec(ctx, "unzip", []string{destRepoName, "-d", "agent"}, nil) if err != nil { return fmt.Errorf("failed to unzip %s to agent directory: %w (stdout: %s, stderr: %s)", destRepoName, err, stdOut, errOut) } @@ -116,7 +114,7 @@ func (DebianRunner) Copy(ctx context.Context, sshClient *ssh.Client, logger Logg logger.Logf("Running make mage and prepareOnRemote") envs := `GOPATH="$HOME/go" PATH="$HOME/go/bin:$PATH"` installMage := strings.NewReader(fmt.Sprintf(`cd agent && %s make mage && %s mage integration:prepareOnRemote`, envs, envs)) - stdOut, errOut, err = sshRunCommand(ctx, sshClient, "bash", nil, installMage) + stdOut, errOut, err = sshClient.Exec(ctx, "bash", nil, installMage) if err != nil { return fmt.Errorf("failed to perform make mage and prepareOnRemote: %w (stdout: %s, stderr: %s)", err, stdOut, errOut) } @@ -130,7 +128,7 @@ func (DebianRunner) Copy(ctx context.Context, sshClient *ssh.Client, logger Logg return fmt.Errorf("failed to read local SHA52 contents %s: %w", build.SHA512Path, err) } hostSHA512Path := filepath.Base(build.SHA512Path) - hostSHA512, err := sshGetFileContents(ctx, sshClient, hostSHA512Path) + hostSHA512, err := sshClient.GetFileContents(ctx, hostSHA512Path) if err == nil { if string(localSHA512) == string(hostSHA512) { logger.Logf("Skipping copy agent build %s; already the same", filepath.Base(build.Path)) @@ -141,16 +139,16 @@ func (DebianRunner) Copy(ctx context.Context, sshClient *ssh.Client, logger Logg if copyBuild { // ensure the existing copies are removed first toRemove := filepath.Base(build.Path) - stdOut, errOut, err = sshRunCommand(ctx, - sshClient, "sudo", []string{"rm", "-f", toRemove}, nil) + stdOut, errOut, err = sshClient.Exec(ctx, + "sudo", []string{"rm", "-f", toRemove}, nil) if err != nil { return fmt.Errorf("failed to remove %q: %w (stdout: %q, stderr: %q)", toRemove, err, stdOut, errOut) } toRemove = filepath.Base(build.SHA512Path) - stdOut, errOut, err = sshRunCommand(ctx, - sshClient, "sudo", []string{"rm", "-f", toRemove}, nil) + stdOut, errOut, err = sshClient.Exec(ctx, + "sudo", []string{"rm", "-f", toRemove}, nil) if err != nil { return fmt.Errorf("failed to remove %q: %w (stdout: %q, stderr: %q)", toRemove, err, stdOut, errOut) @@ -161,17 +159,17 @@ func (DebianRunner) Copy(ctx context.Context, sshClient *ssh.Client, logger Logg for _, buildPath := range []string{build.Path, build.SHA512Path} { if copyBuild { - err = sshSCP(sshClient, buildPath, filepath.Base(buildPath)) + err = sshClient.Copy(buildPath, filepath.Base(buildPath)) if err != nil { return fmt.Errorf("failed to SCP build %s: %w", filepath.Base(buildPath), err) } } insideAgentDir := filepath.Join("agent", buildPath) - stdOut, errOut, err = sshRunCommand(ctx, sshClient, "mkdir", []string{"-p", filepath.Dir(insideAgentDir)}, nil) + stdOut, errOut, err = sshClient.Exec(ctx, "mkdir", []string{"-p", filepath.Dir(insideAgentDir)}, nil) if err != nil { return fmt.Errorf("failed to create %s directory: %w (stdout: %s, stderr: %s)", filepath.Dir(insideAgentDir), err, stdOut, errOut) } - stdOut, errOut, err = sshRunCommand(ctx, sshClient, "ln", []string{filepath.Base(buildPath), insideAgentDir}, nil) + stdOut, errOut, err = sshClient.Exec(ctx, "ln", []string{filepath.Base(buildPath), insideAgentDir}, nil) if err != nil { return fmt.Errorf("failed to hard link %s to %s: %w (stdout: %s, stderr: %s)", filepath.Base(buildPath), insideAgentDir, err, stdOut, errOut) } @@ -181,7 +179,7 @@ func (DebianRunner) Copy(ctx context.Context, sshClient *ssh.Client, logger Logg } // Run the test -func (DebianRunner) Run(ctx context.Context, verbose bool, sshClient *ssh.Client, logger Logger, agentVersion string, prefix string, batch define.Batch, env map[string]string) (OSRunnerResult, error) { +func (DebianRunner) Run(ctx context.Context, verbose bool, sshClient SSHClient, logger Logger, agentVersion string, prefix string, batch define.Batch, env map[string]string) (OSRunnerResult, error) { var tests []string for _, pkg := range batch.Tests { for _, test := range pkg.Tests { @@ -229,11 +227,11 @@ func (DebianRunner) Run(ctx context.Context, verbose bool, sshClient *ssh.Client } // Diagnostics gathers any diagnostics from the host. -func (DebianRunner) Diagnostics(ctx context.Context, c *ssh.Client, logger Logger, destination string) error { +func (DebianRunner) Diagnostics(ctx context.Context, sshClient SSHClient, logger Logger, destination string) error { // take ownership, as sudo tests will create with root permissions (allow to fail in the case it doesn't exist) diagnosticDir := "$HOME/agent/build/diagnostics" - _, _, _ = sshRunCommand(ctx, c, "sudo", []string{"chown", "-R", "$USER:$USER", diagnosticDir}, nil) - stdOut, _, err := sshRunCommand(ctx, c, "ls", []string{"-1", diagnosticDir}, nil) + _, _, _ = sshClient.Exec(ctx, "sudo", []string{"chown", "-R", "$USER:$USER", diagnosticDir}, nil) + stdOut, _, err := sshClient.Exec(ctx, "ls", []string{"-1", diagnosticDir}, nil) if err != nil { //nolint:nilerr // failed to list the directory, probably don't have any diagnostics (do nothing) return nil @@ -256,7 +254,7 @@ func (DebianRunner) Diagnostics(ctx context.Context, c *ssh.Client, logger Logge if err != nil { return fmt.Errorf("failed to create file %s: %w", dp, err) } - err = sshGetFileContentsOutput(ctx, c, fp, out) + err = sshClient.GetFileContentsOutput(ctx, fp, out) _ = out.Close() if err != nil { return fmt.Errorf("failed to copy file from remote host to %s: %w", dp, err) @@ -265,7 +263,7 @@ func (DebianRunner) Diagnostics(ctx context.Context, c *ssh.Client, logger Logge return nil } -func runTests(ctx context.Context, logger Logger, name string, prefix string, script string, sshClient *ssh.Client, tests []define.BatchPackageTests) ([]OSRunnerPackageResult, error) { +func runTests(ctx context.Context, logger Logger, name string, prefix string, script string, sshClient SSHClient, tests []define.BatchPackageTests) ([]OSRunnerPackageResult, error) { execTest := strings.NewReader(script) session, err := sshClient.NewSession() @@ -298,20 +296,20 @@ func runTests(ctx context.Context, logger Logger, name string, prefix string, sc return result, nil } -func getRunnerPackageResult(ctx context.Context, c *ssh.Client, pkg define.BatchPackageTests, prefix string) (OSRunnerPackageResult, error) { +func getRunnerPackageResult(ctx context.Context, sshClient SSHClient, pkg define.BatchPackageTests, prefix string) (OSRunnerPackageResult, error) { var err error var resultPkg OSRunnerPackageResult resultPkg.Name = pkg.Name outputPath := fmt.Sprintf("$HOME/agent/build/TEST-go-remote-%s.%s", prefix, filepath.Base(pkg.Name)) - resultPkg.Output, err = sshGetFileContents(ctx, c, outputPath+".out") + resultPkg.Output, err = sshClient.GetFileContents(ctx, outputPath+".out") if err != nil { return OSRunnerPackageResult{}, fmt.Errorf("failed to fetched test output at %s.out", outputPath) } - resultPkg.JSONOutput, err = sshGetFileContents(ctx, c, outputPath+".out.json") + resultPkg.JSONOutput, err = sshClient.GetFileContents(ctx, outputPath+".out.json") if err != nil { return OSRunnerPackageResult{}, fmt.Errorf("failed to fetched test output at %s.out.json", outputPath) } - resultPkg.XMLOutput, err = sshGetFileContents(ctx, c, outputPath+".xml") + resultPkg.XMLOutput, err = sshClient.GetFileContents(ctx, outputPath+".xml") if err != nil { return OSRunnerPackageResult{}, fmt.Errorf("failed to fetched test output at %s.xml", outputPath) } diff --git a/pkg/testing/runner/runner.go b/pkg/testing/runner/runner.go index e57073bf491..b319f0738ea 100644 --- a/pkg/testing/runner/runner.go +++ b/pkg/testing/runner/runner.go @@ -61,13 +61,13 @@ type OSRunnerResult struct { // OSRunner provides an interface to run the tests on the OS. type OSRunner interface { // Prepare prepares the runner to actual run on the host. - Prepare(ctx context.Context, c *ssh.Client, logger Logger, arch string, goVersion string) error + Prepare(ctx context.Context, sshClient SSHClient, logger Logger, arch string, goVersion string) error // Copy places the required files on the host. - Copy(ctx context.Context, c *ssh.Client, logger Logger, repoArchive string, build Build) error + Copy(ctx context.Context, sshClient SSHClient, logger Logger, repoArchive string, build Build) error // Run runs the actual tests and provides the result. - Run(ctx context.Context, verbose bool, c *ssh.Client, logger Logger, agentVersion string, prefix string, batch define.Batch, env map[string]string) (OSRunnerResult, error) + Run(ctx context.Context, verbose bool, sshClient SSHClient, logger Logger, agentVersion string, prefix string, batch define.Batch, env map[string]string) (OSRunnerResult, error) // Diagnostics gathers any diagnostics from the host. - Diagnostics(ctx context.Context, c *ssh.Client, logger Logger, destination string) error + Diagnostics(ctx context.Context, sshClient SSHClient, logger Logger, destination string) error } // Logger is a simple logging interface used by each runner type. @@ -322,9 +322,10 @@ func (r *Runner) runInstance(ctx context.Context, sshAuth ssh.AuthMethod, logger } logger.Logf("Starting SSH; connect with `ssh -i %s %s@%s`", sshPrivateKeyPath, instance.Username, instance.IP) + client := NewSSHClient(instance.IP, instance.Username, sshAuth) connectCtx, connectCancel := context.WithTimeout(ctx, 10*time.Minute) defer connectCancel() - client, err := sshConnect(connectCtx, instance.IP, instance.Username, sshAuth) + err = client.Connect(connectCtx) if err != nil { logger.Logf("Failed to connect to instance %s: %s", instance.IP, err) return OSRunnerResult{}, fmt.Errorf("failed to connect to instance %s: %w", instance.Name, err) diff --git a/pkg/testing/runner/ssh.go b/pkg/testing/runner/ssh.go index 9a33a5ad6cc..ebb9fcd96da 100644 --- a/pkg/testing/runner/ssh.go +++ b/pkg/testing/runner/ssh.go @@ -56,42 +56,143 @@ func newSSHPublicKey(pk *rsa.PublicKey) ([]byte, error) { return ssh.MarshalAuthorizedKey(pub), nil } -// sshConnect keeps trying to make the SSH connection up until the context is cancelled -func sshConnect(ctx context.Context, ip string, username string, sshAuth ssh.AuthMethod) (*ssh.Client, error) { +type fileContentsOpts struct { + command string +} + +// FileContentsOpt provides an option to modify how fetching files from the remote host work. +type FileContentsOpt func(opts *fileContentsOpts) + +// WithContentFetchCommand changes the command to use for fetching the file contents. +func WithContentFetchCommand(command string) FileContentsOpt { + return func(opts *fileContentsOpts) { + opts.command = command + } +} + +// SSHClient is a *ssh.Client that provides a nice interface to work with. +type SSHClient interface { + // Connect connects to the host. + Connect(ctx context.Context) error + + // ConnectWithTimeout connects to the host with a timeout. + ConnectWithTimeout(ctx context.Context, timeout time.Duration) error + + // Close closes the client. + Close() error + + // Reconnect disconnects and reconnected to the host. + Reconnect(ctx context.Context) error + + // ReconnectWithTimeout disconnects and reconnected to the host with a timeout. + ReconnectWithTimeout(ctx context.Context, timeout time.Duration) error + + // NewSession opens a new Session for this host. + NewSession() (*ssh.Session, error) + + // Exec runs a command on the host. + Exec(ctx context.Context, cmd string, args []string, stdin io.Reader) ([]byte, []byte, error) + + // ExecWithRetry runs the command on loop waiting the interval between calls + ExecWithRetry(ctx context.Context, cmd string, args []string, interval time.Duration) ([]byte, []byte, error) + + // Copy copies the filePath to the host at dest. + Copy(filePath string, dest string) error + + // GetFileContents returns the file content. + GetFileContents(ctx context.Context, filename string, opts ...FileContentsOpt) ([]byte, error) + + // GetFileContentsOutput returns the file content writing to output. + GetFileContentsOutput(ctx context.Context, filename string, output io.Writer, opts ...FileContentsOpt) error +} + +type sshClient struct { + ip string + username string + auth ssh.AuthMethod + + c *ssh.Client +} + +// NewSSHClient creates a new SSH client connection to the host. +func NewSSHClient(ip string, username string, sshAuth ssh.AuthMethod) SSHClient { + return &sshClient{ + ip: ip, + username: username, + auth: sshAuth, + } +} + +// Connect connects to the host. +func (s *sshClient) Connect(ctx context.Context) error { var lastErr error for { if ctx.Err() != nil { if lastErr == nil { - return nil, ctx.Err() + return ctx.Err() } - return nil, lastErr + return lastErr } config := &ssh.ClientConfig{ - User: username, + User: s.username, HostKeyCallback: ssh.InsecureIgnoreHostKey(), //nolint:gosec // it's the tests framework test - Auth: []ssh.AuthMethod{sshAuth}, + Auth: []ssh.AuthMethod{s.auth}, Timeout: 30 * time.Second, } - client, err := ssh.Dial("tcp", net.JoinHostPort(ip, "22"), config) + client, err := ssh.Dial("tcp", net.JoinHostPort(s.ip, "22"), config) if err == nil { - return client, nil + s.c = client + return nil } lastErr = err } } -// sshRunCommand runs a command on the SSH client connection. -// It returns stdout, stderr and an error if any. stdout and stderr might be nil -// if an error happens before running cmd. -func sshRunCommand(ctx context.Context, c *ssh.Client, cmd string, args []string, stdin io.Reader) ([]byte, []byte, error) { +// ConnectWithTimeout connects to the host with a timeout. +func (s *sshClient) ConnectWithTimeout(ctx context.Context, timeout time.Duration) error { + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + return s.Connect(ctx) +} + +// Close closes the client. +func (s *sshClient) Close() error { + if s.c != nil { + err := s.c.Close() + s.c = nil + return err + } + return nil +} + +// Reconnect disconnects and reconnected to the host. +func (s *sshClient) Reconnect(ctx context.Context) error { + _ = s.Close() + return s.Connect(ctx) +} + +// ReconnectWithTimeout disconnects and reconnected to the host with a timeout. +func (s *sshClient) ReconnectWithTimeout(ctx context.Context, timeout time.Duration) error { + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + return s.Reconnect(ctx) +} + +// NewSession opens a new Session for this host. +func (s *sshClient) NewSession() (*ssh.Session, error) { + return s.c.NewSession() +} + +// Exec runs a command on the host. +func (s *sshClient) Exec(ctx context.Context, cmd string, args []string, stdin io.Reader) ([]byte, []byte, error) { if ctx.Err() != nil { - return nil, nil, fmt.Errorf("sshRunCommand: %w", ctx.Err()) + return nil, nil, ctx.Err() } cmdArgs := []string{cmd} cmdArgs = append(cmdArgs, args...) cmdStr := strings.Join(cmdArgs, " ") - session, err := c.NewSession() + session, err := s.NewSession() if err != nil { return nil, nil, fmt.Errorf("could not create new SSH session: %w", err) } @@ -109,19 +210,18 @@ func sshRunCommand(ctx context.Context, c *ssh.Client, cmd string, args []string return stdout.Bytes(), stderr.Bytes(), fmt.Errorf("could not run %q though SSH: %w", cmdStr, err) } - - return stdout.Bytes(), stderr.Bytes(), nil + return stdout.Bytes(), stderr.Bytes(), err } -// sshRunCommandWithRetry runs the command on loop waiting the interval between calls -func sshRunCommandWithRetry(ctx context.Context, c *ssh.Client, cmd string, args []string, interval time.Duration) ([]byte, []byte, error) { +// ExecWithRetry runs the command on loop waiting the interval between calls +func (s *sshClient) ExecWithRetry(ctx context.Context, cmd string, args []string, interval time.Duration) ([]byte, []byte, error) { var lastErr error var lastStdout []byte var lastStderr []byte for { // the length of time for running the command is not blocked on the interval // don't create a new context with the interval as its timeout - stdout, stderr, err := sshRunCommand(ctx, c, cmd, args, nil) + stdout, stderr, err := s.Exec(ctx, cmd, args, nil) if err == nil { return stdout, stderr, nil } @@ -141,19 +241,19 @@ func sshRunCommandWithRetry(ctx context.Context, c *ssh.Client, cmd string, args } } -// sshSCP copies the filePath to the destination. -func sshSCP(c *ssh.Client, filePath string, dest string) error { +// Copy copies the filePath to the host at dest. +func (s *sshClient) Copy(filePath string, dest string) error { f, err := os.Open(filePath) if err != nil { return err } defer f.Close() - s, err := f.Stat() + fs, err := f.Stat() if err != nil { return err } - session, err := c.NewSession() + session, err := s.NewSession() if err != nil { return err } @@ -175,7 +275,7 @@ func sshSCP(c *ssh.Client, filePath string, dest string) error { errCh <- session.Wait() }() - _, err = fmt.Fprintf(w, "C%#o %d %s\n", s.Mode().Perm(), s.Size(), dest) + _, err = fmt.Fprintf(w, "C%#o %d %s\n", fs.Mode().Perm(), fs.Size(), dest) if err != nil { _ = w.Close() <-errCh @@ -192,41 +292,36 @@ func sshSCP(c *ssh.Client, filePath string, dest string) error { return <-errCh } -// sshGetFileContents returns the file content. -func sshGetFileContents(ctx context.Context, c *ssh.Client, filename string) ([]byte, error) { - if ctx.Err() != nil { - return nil, ctx.Err() - } - - session, err := c.NewSession() - if err != nil { - return nil, err - } - defer session.Close() - +// GetFileContents returns the file content. +func (s *sshClient) GetFileContents(ctx context.Context, filename string, opts ...FileContentsOpt) ([]byte, error) { var stdout bytes.Buffer - session.Stdout = &stdout - err = session.Run(fmt.Sprintf("cat %s", filename)) + err := s.GetFileContentsOutput(ctx, filename, &stdout, opts...) if err != nil { return nil, err } return stdout.Bytes(), nil } -// sshGetFileContentsOutput writes the file contents into output. -func sshGetFileContentsOutput(ctx context.Context, c *ssh.Client, filename string, output io.Writer) error { +// GetFileContentsOutput returns the file content writing into output. +func (s *sshClient) GetFileContentsOutput(ctx context.Context, filename string, output io.Writer, opts ...FileContentsOpt) error { if ctx.Err() != nil { return ctx.Err() } - session, err := c.NewSession() + var fco fileContentsOpts + fco.command = "cat" + for _, opt := range opts { + opt(&fco) + } + + session, err := s.NewSession() if err != nil { return err } defer session.Close() session.Stdout = output - err = session.Run(fmt.Sprintf("cat %s", filename)) + err = session.Run(fmt.Sprintf("%s %s", fco.command, filename)) if err != nil { return err } diff --git a/pkg/testing/runner/supported.go b/pkg/testing/runner/supported.go index 9869ed126f7..e08814f4187 100644 --- a/pkg/testing/runner/supported.go +++ b/pkg/testing/runner/supported.go @@ -12,9 +12,6 @@ import ( ) const ( - // Google is for the Google Cloud Platform (GCP) - Google = "google" - // Ubuntu is a Linux distro. Ubuntu = "ubuntu" ) @@ -73,6 +70,60 @@ var ( }, Runner: DebianRunner{}, } + // WindowsAMD64_2022 - Windows (amd64) Server 2022 + WindowsAMD64_2022 = SupportedOS{ + OS: define.OS{ + Type: define.Windows, + Arch: define.AMD64, + Version: "2022", + }, + Runner: WindowsRunner{}, + } + // WindowsAMD64_2022_Core - Windows (amd64) Server 2022 Core + WindowsAMD64_2022_Core = SupportedOS{ + OS: define.OS{ + Type: define.Windows, + Arch: define.AMD64, + Version: "2022-core", + }, + Runner: WindowsRunner{}, + } + // WindowsAMD64_2019 - Windows (amd64) Server 2019 + WindowsAMD64_2019 = SupportedOS{ + OS: define.OS{ + Type: define.Windows, + Arch: define.AMD64, + Version: "2019", + }, + Runner: WindowsRunner{}, + } + // WindowsAMD64_2019_Core - Windows (amd64) Server 2019 Core + WindowsAMD64_2019_Core = SupportedOS{ + OS: define.OS{ + Type: define.Windows, + Arch: define.AMD64, + Version: "2019-core", + }, + Runner: WindowsRunner{}, + } + // WindowsAMD64_2016 - Windows (amd64) Server 2016 + WindowsAMD64_2016 = SupportedOS{ + OS: define.OS{ + Type: define.Windows, + Arch: define.AMD64, + Version: "2016", + }, + Runner: WindowsRunner{}, + } + // WindowsAMD64_2016_Core - Windows (amd64) Server 2016 Core + WindowsAMD64_2016_Core = SupportedOS{ + OS: define.OS{ + Type: define.Windows, + Arch: define.AMD64, + Version: "2016-core", + }, + Runner: WindowsRunner{}, + } ) // supported defines the set of supported OS's. @@ -88,6 +139,12 @@ var supported = []SupportedOS{ UbuntuAMD64_2004, UbuntuARM64_2204, UbuntuARM64_2004, + WindowsAMD64_2022, + WindowsAMD64_2022_Core, + WindowsAMD64_2019, + WindowsAMD64_2019_Core, + WindowsAMD64_2016, + WindowsAMD64_2016_Core, } // osMatch returns true when the specific OS is a match for a non-specific OS. @@ -154,7 +211,7 @@ func allowedByPlatform(os define.OS, platform define.OS) bool { return false } } - if os.Version == "" { + if platform.Version == "" { // not specific on version return true } diff --git a/pkg/testing/runner/windows.go b/pkg/testing/runner/windows.go new file mode 100644 index 00000000000..8180ff9a4f3 --- /dev/null +++ b/pkg/testing/runner/windows.go @@ -0,0 +1,324 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +package runner + +import ( + "context" + "fmt" + "os" + "path" + "path/filepath" + "strings" + "time" + + "github.com/elastic/elastic-agent/pkg/testing/define" +) + +// WindowsRunner is a handler for running tests on Windows +type WindowsRunner struct{} + +// Prepare the test +func (WindowsRunner) Prepare(ctx context.Context, sshClient SSHClient, logger Logger, arch string, goVersion string) error { + // install chocolatey + logger.Logf("Installing chocolatey") + chocoInstall := `"[System.Net.ServicePointManager]::SecurityProtocol = 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))"` + stdOut, errOut, err := sshRunPowershell(ctx, sshClient, chocoInstall) + if err != nil { + return fmt.Errorf("failed to install chocolatey: %w (stdout: %s, stderr: %s)", err, stdOut, errOut) + } + // reconnect to get updated environment variables (1 minute as it should be quick to reconnect) + err = sshClient.ReconnectWithTimeout(ctx, 1*time.Minute) + if err != nil { + return fmt.Errorf("failed to reconnect: %w (stdout: %s, stderr: %s)", err, stdOut, errOut) + } + + // install curl + logger.Logf("Installing curl") + stdOut, errOut, err = sshClient.Exec(ctx, "choco", []string{"install", "-y", "curl"}, nil) + if err != nil { + return fmt.Errorf("failed to install curl: %w (stdout: %s, stderr: %s)", err, stdOut, errOut) + } + // install make + logger.Logf("Installing make") + stdOut, errOut, err = sshClient.Exec(ctx, "choco", []string{"install", "-y", "make"}, nil) + if err != nil { + return fmt.Errorf("failed to install make: %w (stdout: %s, stderr: %s)", err, stdOut, errOut) + } + + // install golang (doesn't use choco, because sometimes it doesn't have the required version) + logger.Logf("Installing golang %s (%s)", goVersion, arch) + downloadURL := fmt.Sprintf("https://go.dev/dl/go%s.windows-%s.msi", goVersion, arch) + filename := path.Base(downloadURL) + stdOut, errOut, err = sshClient.Exec(ctx, "curl", []string{"-Ls", downloadURL, "--output", filename}, nil) + if err != nil { + return fmt.Errorf("failed to download go from %s with curl: %w (stdout: %s, stderr: %s)", downloadURL, err, stdOut, errOut) + } + stdOut, errOut, err = sshClient.Exec(ctx, "msiexec", []string{"/i", filename, "/qn"}, nil) + if err != nil { + return fmt.Errorf("failed to install go: %w (stdout: %s, stderr: %s)", err, stdOut, errOut) + } + // reconnect to get updated environment variables (1 minute as it should be quick to reconnect) + err = sshClient.ReconnectWithTimeout(ctx, 1*time.Minute) + if err != nil { + return fmt.Errorf("failed to reconnect: %w (stdout: %s, stderr: %s)", err, stdOut, errOut) + } + + return nil +} + +// Copy places the required files on the host. +func (WindowsRunner) Copy(ctx context.Context, sshClient SSHClient, logger Logger, repoArchive string, build Build) error { + // copy the archive and extract it on the host (tar exists and can extract zip on windows) + logger.Logf("Copying repo") + destRepoName := filepath.Base(repoArchive) + err := sshClient.Copy(repoArchive, destRepoName) + if err != nil { + return fmt.Errorf("failed to SCP repo archive %s: %w", repoArchive, err) + } + + // ensure that agent directory is removed (possible it already exists if instance already used) + // Windows errors if the directory doesn't exist, it's okay if it doesn't ignore any error here + _, _, _ = sshClient.Exec(ctx, "rmdir", []string{"agent", "/s", "/q"}, nil) + + stdOut, errOut, err := sshClient.Exec(ctx, "mkdir", []string{"agent"}, nil) + if err != nil { + return fmt.Errorf("failed to mkdir agent: %w (stdout: %s, stderr: %s)", err, stdOut, errOut) + } + stdOut, errOut, err = sshClient.Exec(ctx, "tar", []string{"-xf", destRepoName, "-C", "agent"}, nil) + if err != nil { + return fmt.Errorf("failed to unzip %s to agent directory: %w (stdout: %s, stderr: %s)", destRepoName, err, stdOut, errOut) + } + + // install mage and prepare for testing + logger.Logf("Running make mage and prepareOnRemote") + stdOut, errOut, err = sshClient.Exec(ctx, "cd", []string{"agent", "&&", "make", "mage", "&&", "mage", "integration:prepareOnRemote"}, nil) + if err != nil { + return fmt.Errorf("failed to to perform make mage and prepareOnRemote: %w (stdout: %s, stderr: %s)", err, stdOut, errOut) + } + + // determine if the build needs to be replaced on the host + // if it already exists and the SHA512 are the same contents, then + // there is no reason to waste time uploading the build + copyBuild := true + localSHA512, err := os.ReadFile(build.SHA512Path) + if err != nil { + return fmt.Errorf("failed to read local SHA52 contents %s: %w", build.SHA512Path, err) + } + hostSHA512Path := filepath.Base(build.SHA512Path) + hostSHA512, err := sshClient.GetFileContents(ctx, hostSHA512Path, WithContentFetchCommand("type")) + if err == nil { + if string(localSHA512) == string(hostSHA512) { + logger.Logf("Skipping copy agent build %s; already the same", filepath.Base(build.Path)) + copyBuild = false + } + } + + if copyBuild { + // ensure the existing copies are removed first + toRemove := filepath.Base(build.Path) + stdOut, errOut, err = sshClient.Exec(ctx, + "del", []string{toRemove, "/f", "/q"}, nil) + if err != nil { + return fmt.Errorf("failed to remove %q: %w (stdout: %q, stderr: %q)", + toRemove, err, stdOut, errOut) + } + + toRemove = filepath.Base(build.SHA512Path) + stdOut, errOut, err = sshClient.Exec(ctx, + "del", []string{toRemove, "/f", "/q"}, nil) + if err != nil { + return fmt.Errorf("failed to remove %q: %w (stdout: %q, stderr: %q)", + toRemove, err, stdOut, errOut) + } + + logger.Logf("Copying agent build %s", filepath.Base(build.Path)) + } + + for _, buildPath := range []string{build.Path, build.SHA512Path} { + if copyBuild { + err = sshClient.Copy(buildPath, filepath.Base(buildPath)) + if err != nil { + return fmt.Errorf("failed to SCP build %s: %w", filepath.Base(buildPath), err) + } + } + insideAgentDir := filepath.Join("agent", buildPath) + // possible the build path already exists, 'mkdir' on windows will fail if it already exists + // error from this call is ignored because of it + _, _, _ = sshClient.Exec(ctx, "mkdir", []string{toWindowsPath(filepath.Dir(insideAgentDir))}, nil) + stdOut, errOut, err = sshClient.Exec(ctx, "mklink", []string{"/h", toWindowsPath(insideAgentDir), filepath.Base(buildPath)}, nil) + if err != nil { + return fmt.Errorf("failed to hard link %s to %s: %w (stdout: %s, stderr: %s)", filepath.Base(buildPath), toWindowsPath(insideAgentDir), err, stdOut, errOut) + } + } + + return nil +} + +// Run the test +func (WindowsRunner) Run(ctx context.Context, verbose bool, c SSHClient, logger Logger, agentVersion string, prefix string, batch define.Batch, env map[string]string) (OSRunnerResult, error) { + var tests []string + for _, pkg := range batch.Tests { + for _, test := range pkg.Tests { + tests = append(tests, fmt.Sprintf("%s:%s", pkg.Name, test.Name)) + } + } + var sudoTests []string + for _, pkg := range batch.SudoTests { + for _, test := range pkg.Tests { + sudoTests = append(sudoTests, fmt.Sprintf("%s:%s", pkg.Name, test.Name)) + } + } + + var result OSRunnerResult + if len(tests) > 0 { + script := toPowershellScript(agentVersion, prefix, verbose, tests, env) + + results, err := runTestsOnWindows(ctx, logger, "non-sudo", prefix, script, c, batch.SudoTests) + if err != nil { + return OSRunnerResult{}, fmt.Errorf("error running non-sudo tests: %w", err) + } + result.Packages = results + } + + if len(sudoTests) > 0 { + prefix := fmt.Sprintf("%s-sudo", prefix) + script := toPowershellScript(agentVersion, prefix, verbose, sudoTests, env) + + results, err := runTestsOnWindows(ctx, logger, "sudo", prefix, script, c, batch.SudoTests) + if err != nil { + return OSRunnerResult{}, fmt.Errorf("error running sudo tests: %w", err) + } + result.SudoPackages = results + + } + return result, nil +} + +// Diagnostics gathers any diagnostics from the host. +func (WindowsRunner) Diagnostics(ctx context.Context, sshClient SSHClient, logger Logger, destination string) error { + diagnosticDir := "agent\\build\\diagnostics" + stdOut, _, err := sshClient.Exec(ctx, "dir", []string{diagnosticDir, "/b"}, nil) + if err != nil { + //nolint:nilerr // failed to list the directory, probably don't have any diagnostics (do nothing) + return nil + } + eachDiagnostic := strings.Split(string(stdOut), "\n") + for _, filename := range eachDiagnostic { + filename = strings.TrimSpace(filename) + if filename == "" { + continue + } + + // don't use filepath.Join as we need this to work in Linux/Darwin as well + // this is because if we use `filepath.Join` on a Linux/Darwin host connected to a Windows host + // it will use a `/` and that will be incorrect for Windows + fp := fmt.Sprintf("%s\\%s", diagnosticDir, filename) + // use filepath.Join on this path because it's a path on this specific host platform + dp := filepath.Join(destination, filename) + logger.Logf("Copying diagnostic %s", filename) + out, err := os.Create(dp) + if err != nil { + return fmt.Errorf("failed to create file %s: %w", dp, err) + } + err = sshClient.GetFileContentsOutput(ctx, fp, out, WithContentFetchCommand("type")) + _ = out.Close() + if err != nil { + return fmt.Errorf("failed to copy file from remote host to %s: %w", dp, err) + } + } + return nil +} + +func sshRunPowershell(ctx context.Context, sshClient SSHClient, cmd string) ([]byte, []byte, error) { + return sshClient.Exec(ctx, "powershell", []string{ + "-NoProfile", + "-InputFormat", "None", + "-ExecutionPolicy", "Bypass", + "-Command", cmd, + }, nil) +} + +func toPowershellScript(agentVersion string, prefix string, verbose bool, tests []string, env map[string]string) string { + var sb strings.Builder + for k, v := range env { + sb.WriteString("$env:") + sb.WriteString(k) + sb.WriteString("=\"") + sb.WriteString(v) + sb.WriteString("\"\n") + } + sb.WriteString("$env:AGENT_VERSION=\"") + sb.WriteString(agentVersion) + sb.WriteString("\"\n") + sb.WriteString("$env:TEST_DEFINE_PREFIX=\"") + sb.WriteString(prefix) + sb.WriteString("\"\n") + sb.WriteString("$env:TEST_DEFINE_TESTS=\"") + sb.WriteString(strings.Join(tests, ",")) + sb.WriteString("\"\n") + sb.WriteString("cd agent\n") + sb.WriteString("mage ") + if verbose { + sb.WriteString("-v ") + } + sb.WriteString("integration:testOnRemote\n") + return sb.String() +} + +func runTestsOnWindows(ctx context.Context, logger Logger, name string, prefix string, script string, sshClient SSHClient, tests []define.BatchPackageTests) ([]OSRunnerPackageResult, error) { + execTest := strings.NewReader(script) + + session, err := sshClient.NewSession() + if err != nil { + return nil, fmt.Errorf("failed to start session: %w", err) + } + + session.Stdout = newPrefixOutput(logger, fmt.Sprintf("Test output (%s) (stdout): ", name)) + session.Stderr = newPrefixOutput(logger, fmt.Sprintf("Test output (%s) (stderr): ", name)) + session.Stdin = execTest + // allowed to fail because tests might fail + logger.Logf("Running %s tests...", name) + err = session.Run("powershell -noprofile -noninteractive -") + if err != nil { + logger.Logf("%s tests failed: %s", name, err) + } + // this seems to always return an error + _ = session.Close() + + var result []OSRunnerPackageResult + // fetch the contents for each package + for _, pkg := range tests { + resultPkg, err := getWindowsRunnerPackageResult(ctx, sshClient, pkg, prefix) + if err != nil { + return nil, err + } + result = append(result, resultPkg) + } + return result, nil +} + +func toWindowsPath(path string) string { + return strings.ReplaceAll(path, "/", "\\") +} + +func getWindowsRunnerPackageResult(ctx context.Context, sshClient SSHClient, pkg define.BatchPackageTests, prefix string) (OSRunnerPackageResult, error) { + var err error + var resultPkg OSRunnerPackageResult + resultPkg.Name = pkg.Name + outputPath := fmt.Sprintf("%%home%%\\agent\\build\\TEST-go-remote-%s.%s", prefix, filepath.Base(pkg.Name)) + resultPkg.Output, err = sshClient.GetFileContents(ctx, outputPath+".out", WithContentFetchCommand("type")) + if err != nil { + return OSRunnerPackageResult{}, fmt.Errorf("failed to fetched test output at %s.out", outputPath) + } + resultPkg.JSONOutput, err = sshClient.GetFileContents(ctx, outputPath+".out.json", WithContentFetchCommand("type")) + if err != nil { + return OSRunnerPackageResult{}, fmt.Errorf("failed to fetched test output at %s.out.json", outputPath) + } + resultPkg.XMLOutput, err = sshClient.GetFileContents(ctx, outputPath+".xml", WithContentFetchCommand("type")) + if err != nil { + return OSRunnerPackageResult{}, fmt.Errorf("failed to fetched test output at %s.xml", outputPath) + } + return resultPkg, nil +} diff --git a/pkg/testing/tools/cmd.go b/pkg/testing/tools/cmd.go index 4330b1504d1..9277c3ea5c6 100644 --- a/pkg/testing/tools/cmd.go +++ b/pkg/testing/tools/cmd.go @@ -6,20 +6,27 @@ package tools import ( "context" + "time" atesting "github.com/elastic/elastic-agent/pkg/testing" ) // InstallAgent force install the Elastic Agent through agentFixture. -func InstallAgent(installOpts atesting.InstallOpts, agentFixture *atesting.Fixture) ([]byte, error) { - return agentFixture.Install(context.Background(), &installOpts) +func InstallAgent(ctx context.Context, installOpts atesting.InstallOpts, agentFixture *atesting.Fixture) ([]byte, error) { + // 2 minute timeout, to ensure that it at least doesn't get stuck. + ctx, cancel := context.WithTimeout(ctx, 2*time.Minute) + defer cancel() + return agentFixture.Install(ctx, &installOpts) } // InstallStandaloneAgent force install the Elastic Agent through agentFixture. -func InstallStandaloneAgent(agentFixture *atesting.Fixture) ([]byte, error) { +func InstallStandaloneAgent(ctx context.Context, agentFixture *atesting.Fixture) ([]byte, error) { + // 2 minute timeout, to ensure that it at least doesn't get stuck. + ctx, cancel := context.WithTimeout(ctx, 2*time.Minute) + defer cancel() installOpts := atesting.InstallOpts{ NonInteractive: true, Force: true, } - return agentFixture.Install(context.Background(), &installOpts) + return agentFixture.Install(ctx, &installOpts) } diff --git a/pkg/testing/tools/tools.go b/pkg/testing/tools/tools.go index d6da4f5c7f4..e894a47a825 100644 --- a/pkg/testing/tools/tools.go +++ b/pkg/testing/tools/tools.go @@ -54,7 +54,11 @@ func WaitForPolicyRevision(t *testing.T, client *kibana.Client, agentID string, // InstallAgentWithPolicy creates the given policy, enrolls the given agent // fixture in Fleet using the default Fleet Server, waits for the agent to be // online, and returns the created policy. +<<<<<<< HEAD func InstallAgentWithPolicy(t *testing.T, agentFixture *atesting.Fixture, kibClient *kibana.Client, createPolicyReq kibana.AgentPolicy) (*kibana.PolicyResponse, error) { +======= +func InstallAgentWithPolicy(ctx context.Context, t *testing.T, installOpts atesting.InstallOpts, agentFixture *atesting.Fixture, kibClient *kibana.Client, createPolicyReq kibana.AgentPolicy) (kibana.PolicyResponse, error) { +>>>>>>> 35dbbdea9b (Add Windows support to integration testing runner (#2941)) t.Helper() policy, err := kibClient.CreatePolicy(context.TODO(), createPolicyReq) @@ -62,6 +66,45 @@ func InstallAgentWithPolicy(t *testing.T, agentFixture *atesting.Fixture, kibCli return nil, fmt.Errorf("unable to create policy: %w", err) } +<<<<<<< HEAD +======= + if createPolicyReq.IsProtected { + // If protected fetch uninstall token and set it for the fixture + resp, err := kibClient.GetPolicyUninstallTokens(ctx, policy.ID) + if err != nil { + return policy, fmt.Errorf("failed to fetch uninstal tokens: %w", err) + } + if len(resp.Items) == 0 { + return policy, fmt.Errorf("expected non-zero number of tokens: %w", err) + } + + if len(resp.Items[0].Token) == 0 { + return policy, fmt.Errorf("expected non-empty token: %w", err) + } + + uninstallToken := resp.Items[0].Token + t.Logf("Protected with uninstall token: %v", uninstallToken) + agentFixture.SetUninstallToken(uninstallToken) + } + + err = InstallAgentForPolicy(ctx, t, installOpts, agentFixture, kibClient, policy.ID) + return policy, err +} + +// InstallAgentForPolicy enrolls the provided agent fixture in Fleet using the +// default Fleet Server, waits for the agent to come online, and returns either +// an error or nil. +// If the context (ctx) has a deadline, it will wait for the agent to become +// online until the deadline of the context, or if not, a default 5-minute +// deadline will be applied. +func InstallAgentForPolicy(ctx context.Context, t *testing.T, + installOpts atesting.InstallOpts, + agentFixture *atesting.Fixture, + kibClient *kibana.Client, + policyID string) error { + t.Helper() + +>>>>>>> 35dbbdea9b (Add Windows support to integration testing runner (#2941)) // Create enrollment API key createEnrollmentAPIKeyReq := kibana.CreateEnrollmentAPIKeyRequest{ PolicyID: policy.ID, @@ -91,7 +134,7 @@ func InstallAgentWithPolicy(t *testing.T, agentFixture *atesting.Fixture, kibCli URL: fleetServerURL, EnrollmentToken: enrollmentToken.APIKey, } - output, err := InstallAgent(installOpts, agentFixture) + output, err := InstallAgent(ctx, installOpts, agentFixture) if err != nil { t.Log(string(output)) return nil, fmt.Errorf("unable to enroll Elastic Agent: %w", err) diff --git a/testing/integration/endpoint_security_test.go b/testing/integration/endpoint_security_test.go index 15228e1c3a8..03669c69788 100644 --- a/testing/integration/endpoint_security_test.go +++ b/testing/integration/endpoint_security_test.go @@ -76,7 +76,19 @@ func TestInstallAndCLIUninstallWithEndpointSecurity(t *testing.T) { kibana.MonitoringEnabledMetrics, }, } +<<<<<<< HEAD policyResp, err := tools.InstallAgentWithPolicy(t, fixture, info.KibanaClient, createPolicyReq) +======= + + policy, err := tools.InstallAgentWithPolicy(ctx, t, + installOpts, fixture, info.KibanaClient, createPolicyReq) + require.NoError(t, err, "failed to install agent with policy") + + t.Cleanup(func() { + t.Log("Un-enrolling Elastic Agent...") + assert.NoError(t, tools.UnEnrollAgent(info.KibanaClient, policy.ID)) + }) +>>>>>>> 35dbbdea9b (Add Windows support to integration testing runner (#2941)) t.Log("Installing Elastic Defend") installElasticDefendPackage(t, info, policyResp.ID) @@ -132,7 +144,16 @@ func TestInstallAndUnenrollWithEndpointSecurity(t *testing.T) { kibana.MonitoringEnabledMetrics, }, } +<<<<<<< HEAD policyResp, err := tools.InstallAgentWithPolicy(t, fixture, info.KibanaClient, createPolicyReq) +======= + + ctx, cn := context.WithCancel(context.Background()) + defer cn() + + policy, err := tools.InstallAgentWithPolicy(ctx, t, installOpts, fixture, info.KibanaClient, createPolicyReq) + require.NoError(t, err) +>>>>>>> 35dbbdea9b (Add Windows support to integration testing runner (#2941)) t.Log("Installing Elastic Defend") installElasticDefendPackage(t, info, policyResp.ID) @@ -214,6 +235,128 @@ func TestInstallAndUnenrollWithEndpointSecurity(t *testing.T) { } } +<<<<<<< HEAD +======= +// Tests that the agent can install and uninstall the endpoint-security service +// after the Elastic Defend integration was removed from the policy +// while remaining healthy. +// +// Installing endpoint-security requires a Fleet managed agent with the Elastic Defend integration +// installed. The endpoint-security service is uninstalled the Elastic Defend integration was removed from the policy. +// +// Like the CLI uninstall test, the agent is uninstalled from the command line at the end of the test +// but at this point endpoint should be already uninstalled. + +func TestInstallWithEndpointSecurityAndRemoveEndpointIntegration(t *testing.T) { + info := define.Require(t, define.Requirements{ + Stack: &define.Stack{}, + Local: false, // requires Agent installation + Isolate: false, + Sudo: true, // requires Agent installation + OS: []define.OS{ + {Type: define.Linux}, + }, + }) + + for _, tc := range protectionTests { + t.Run(tc.name, func(t *testing.T) { + testInstallWithEndpointSecurityAndRemoveEndpointIntegration(t, info, tc.protected) + }) + } +} + +func testInstallWithEndpointSecurityAndRemoveEndpointIntegration(t *testing.T, info *define.Info, protected bool) { + // Get path to agent executable. + fixture, err := define.NewFixture(t, define.Version()) + require.NoError(t, err) + + t.Log("Enrolling the agent in Fleet") + policyUUID := uuid.New().String() + createPolicyReq := buildPolicyWithTamperProtection( + kibana.AgentPolicy{ + Name: "test-policy-" + policyUUID, + Namespace: "default", + Description: "Test policy " + policyUUID, + MonitoringEnabled: []kibana.MonitoringEnabledOption{ + kibana.MonitoringEnabledLogs, + kibana.MonitoringEnabledMetrics, + }, + }, + protected, + ) + + installOpts := atesting.InstallOpts{ + NonInteractive: true, + Force: true, + } + + ctx, cn := context.WithCancel(context.Background()) + defer cn() + + policy, err := tools.InstallAgentWithPolicy(ctx, t, installOpts, fixture, info.KibanaClient, createPolicyReq) + require.NoError(t, err) + + t.Log("Installing Elastic Defend") + pkgPolicyResp, err := installElasticDefendPackage(t, info, policy.ID) + require.NoErrorf(t, err, "Policy Response was: %#v", pkgPolicyResp) + + t.Log("Polling for endpoint-security to become Healthy") + ctx, cancel := context.WithTimeout(context.Background(), endpointHealthPollingTimeout) + defer cancel() + + agentClient := fixture.Client() + err = agentClient.Connect(ctx) + require.NoError(t, err) + + require.Eventually(t, + func() bool { return agentAndEndpointAreHealthy(t, ctx, agentClient) }, + endpointHealthPollingTimeout, + time.Second, + "Endpoint component or units are not healthy.", + ) + t.Log("Verified endpoint component and units are healthy") + + t.Logf("Removing Elastic Defend: %v", fmt.Sprintf("/api/fleet/package_policies/%v", pkgPolicyResp.Item.ID)) + _, err = info.KibanaClient.DeleteFleetPackage(ctx, pkgPolicyResp.Item.ID) + require.NoError(t, err) + + t.Log("Waiting for endpoint to stop") + require.Eventually(t, + func() bool { return agentIsHealthyNoEndpoint(t, ctx, agentClient) }, + endpointHealthPollingTimeout, + time.Second, + "Endpoint component or units are still present.", + ) + t.Log("Verified endpoint component and units are removed") + + // Verify that the Endpoint directory was correctly removed. + // Regression test for https://github.com/elastic/elastic-agent/issues/3077 + agentInstallPath := fixture.WorkDir() + files, err := os.ReadDir(filepath.Clean(filepath.Join(agentInstallPath, ".."))) + require.NoError(t, err) + + t.Logf("Checking directories at install path %s", agentInstallPath) + for _, f := range files { + if !f.IsDir() { + continue + } + + t.Log("Found directory", f.Name()) + require.False(t, strings.Contains(f.Name(), "Endpoint"), "Endpoint directory was not removed") + } +} + +// This is a subset of kibana.AgentPolicyUpdateRequest, using until elastic-agent-libs PR https://github.com/elastic/elastic-agent-libs/pull/141 is merged +// TODO: replace with the elastic-agent-libs when available +type agentPolicyUpdateRequest struct { + // Name of the policy. Required in an update request. + Name string `json:"name"` + // Namespace of the policy. Required in an update request. + Namespace string `json:"namespace"` + IsProtected bool `json:"is_protected"` +} + +>>>>>>> 35dbbdea9b (Add Windows support to integration testing runner (#2941)) // Installs the Elastic Defend package to cause the agent to install the endpoint-security service. func installElasticDefendPackage(t *testing.T, info *define.Info, policyID string) { t.Helper() @@ -244,7 +387,63 @@ func installElasticDefendPackage(t *testing.T, info *define.Info, policyID strin pkgResp, err := tools.InstallFleetPackage(ctx, info.KibanaClient, &packagePolicyReq) require.NoError(t, err) +<<<<<<< HEAD t.Logf("Endpoint package Policy Response:\n%+v", pkgResp) +======= + + t.Log("Enrolling the agent in Fleet") + policyUUID := uuid.New().String() + createPolicyReq := kibana.AgentPolicy{ + Name: "test-policy-" + policyUUID, + Namespace: "default", + Description: "Test policy " + policyUUID, + MonitoringEnabled: []kibana.MonitoringEnabledOption{ + kibana.MonitoringEnabledLogs, + kibana.MonitoringEnabledMetrics, + }, + } + installOpts := atesting.InstallOpts{ + NonInteractive: true, + Force: true, + BasePath: filepath.Join(paths.DefaultBasePath, "not_default"), + } + policyResp, err := tools.InstallAgentWithPolicy(ctx, t, installOpts, fixture, info.KibanaClient, createPolicyReq) + require.NoErrorf(t, err, "Policy Response was: %v", policyResp) + + t.Log("Installing Elastic Defend") + pkgPolicyResp, err := installElasticDefendPackage(t, info, policyResp.ID) + require.NoErrorf(t, err, "Policy Response was: %v", pkgPolicyResp) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + c := fixture.Client() + + require.Eventually(t, func() bool { + err := c.Connect(ctx) + if err != nil { + t.Logf("connecting client to agent: %v", err) + return false + } + defer c.Disconnect() + state, err := c.State(ctx) + if err != nil { + t.Logf("error getting the agent state: %v", err) + return false + } + t.Logf("agent state: %+v", state) + if state.State != cproto.State_DEGRADED { + return false + } + for _, c := range state.Components { + if strings.Contains(c.Message, + "Elastic Defend requires Elastic Agent be installed at the default installation path") { + return true + } + } + return false + }, 2*time.Minute, 10*time.Second, "Agent never became DEGRADED with default install message") +>>>>>>> 35dbbdea9b (Add Windows support to integration testing runner (#2941)) } func agentAndEndpointAreHealthy(t *testing.T, ctx context.Context, agentClient client.Client) bool { diff --git a/testing/integration/enroll_test.go b/testing/integration/enroll_test.go index fe06c0d8ffa..a5f8d6f9274 100644 --- a/testing/integration/enroll_test.go +++ b/testing/integration/enroll_test.go @@ -78,7 +78,16 @@ func (runner *EnrollRunner) TestEnroll() { } // Stage 1: Install // As part of the cleanup process, we'll uninstall the agent +<<<<<<< HEAD:testing/integration/enroll_test.go policy, err := tools.InstallAgentWithPolicy(t, runner.agentFixture, kibClient, createPolicyReq) +======= + installOpts := atesting.InstallOpts{ + NonInteractive: true, + Force: true, + } + policy, err := tools.InstallAgentWithPolicy(ctx, t, + installOpts, agentFixture, info.KibanaClient, createPolicyReq) +>>>>>>> 35dbbdea9b (Add Windows support to integration testing runner (#2941)):testing/integration/monitoring_logs_test.go require.NoError(t, err) t.Logf("created policy: %s", policy.ID) diff --git a/testing/integration/fqdn_test.go b/testing/integration/fqdn_test.go index d112a07c417..c3386cb74ff 100644 --- a/testing/integration/fqdn_test.go +++ b/testing/integration/fqdn_test.go @@ -93,7 +93,15 @@ func TestFQDN(t *testing.T) { }, }, } +<<<<<<< HEAD policy, err := tools.InstallAgentWithPolicy(t, agentFixture, kibClient, createPolicyReq) +======= + installOpts := atesting.InstallOpts{ + NonInteractive: true, + Force: true, + } + policy, err := tools.InstallAgentWithPolicy(ctx, t, installOpts, agentFixture, kibClient, createPolicyReq) +>>>>>>> 35dbbdea9b (Add Windows support to integration testing runner (#2941)) require.NoError(t, err) t.Log("Verify that agent name is short hostname") diff --git a/testing/integration/install_test.go b/testing/integration/install_test.go index 556a4bdd776..5ee21ac33dd 100644 --- a/testing/integration/install_test.go +++ b/testing/integration/install_test.go @@ -58,8 +58,11 @@ func TestInstallWithoutBasePath(t *testing.T) { // Run `elastic-agent install`. We use `--force` to prevent interactive // execution. - _, err = fixture.Install(context.Background(), &atesting.InstallOpts{Force: true}) - require.NoError(t, err) + out, err := fixture.Install(context.Background(), &atesting.InstallOpts{Force: true}) + if err != nil { + t.Logf("install output: %s", out) + require.NoError(t, err) + } // Check that Agent was installed in default base path checkInstallSuccess(t, topPath) @@ -100,11 +103,14 @@ func TestInstallWithBasePath(t *testing.T) { // Run `elastic-agent install`. We use `--force` to prevent interactive // execution. - _, err = fixture.Install(context.Background(), &atesting.InstallOpts{ + out, err := fixture.Install(context.Background(), &atesting.InstallOpts{ BasePath: randomBasePath, Force: true, }) - require.NoError(t, err) + if err != nil { + t.Logf("install output: %s", out) + require.NoError(t, err) + } // Check that Agent was installed in the custom base path topPath := filepath.Join(randomBasePath, "Elastic", "Agent") @@ -117,7 +123,7 @@ func checkInstallSuccess(t *testing.T, topPath string) { require.NoError(t, err) // Check that a few expected installed files are present - installedBinPath := filepath.Join(topPath, "elastic-agent") + installedBinPath := filepath.Join(topPath, exeOnWindows("elastic-agent")) installedDataPath := filepath.Join(topPath, "data") installMarkerPath := filepath.Join(topPath, ".installed") @@ -140,3 +146,10 @@ func randStr(length int) string { return string(runes) } + +func exeOnWindows(filename string) string { + if runtime.GOOS == define.Windows { + return filename + ".exe" + } + return filename +} diff --git a/testing/integration/upgrade_rollback_test.go b/testing/integration/upgrade_rollback_test.go new file mode 100644 index 00000000000..920a399ffb2 --- /dev/null +++ b/testing/integration/upgrade_rollback_test.go @@ -0,0 +1,205 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +//go:build integration + +package integration + +import ( + "context" + "errors" + "fmt" + "runtime" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" + "github.com/elastic/elastic-agent/internal/pkg/agent/install" + atesting "github.com/elastic/elastic-agent/pkg/testing" + "github.com/elastic/elastic-agent/pkg/testing/define" + "github.com/elastic/elastic-agent/testing/upgradetest" +) + +// TestStandaloneUpgradeRollback tests the scenario where upgrading to a new version +// of Agent fails due to the new Agent binary reporting an unhealthy status. It checks +// that the Agent is rolled back to the previous version. +func TestStandaloneUpgradeRollback(t *testing.T) { + define.Require(t, define.Requirements{ + Local: false, // requires Agent installation + Sudo: true, // requires Agent installation + }) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Start at the build version as we want to test the retry + // logic that is in the build. + startFixture, err := define.NewFixture(t, define.Version()) + require.NoError(t, err) + startVersionInfo, err := startFixture.ExecVersion(ctx) + require.NoError(t, err, "failed to get start agent build version info") + + // Upgrade to an old build. + upgradeToVersion, err := upgradetest.PreviousMinor(ctx, define.Version()) + require.NoError(t, err) + endFixture, err := atesting.NewFixture( + t, + upgradeToVersion, + atesting.WithFetcher(atesting.ArtifactFetcher()), + ) + require.NoError(t, err) + + t.Logf("Testing Elastic Agent upgrade from %s to %s...", define.Version(), upgradeToVersion) + + // Configure Agent with fast watcher configuration and also an invalid + // input when the Agent version matches the upgraded Agent version. This way + // the pre-upgrade version of the Agent runs healthy, but the post-upgrade + // version doesn't. + preInstallHook := func() error { + invalidInputPolicy := upgradetest.FastWatcherCfg + fmt.Sprintf(` +outputs: + default: + type: elasticsearch + hosts: [127.0.0.1:9200] + +inputs: + - condition: '${agent.version.version} == "%s"' + type: invalid + id: invalid-input +`, upgradeToVersion) + return startFixture.Configure(ctx, []byte(invalidInputPolicy)) + } + + // Use the post-upgrade hook to bypass the remainder of the PerformUpgrade + // because we want to do our own checks for the rollback. + var ErrPostExit = errors.New("post exit") + postUpgradeHook := func() error { + return ErrPostExit + } + + err = upgradetest.PerformUpgrade( + ctx, startFixture, endFixture, t, + upgradetest.WithPreInstallHook(preInstallHook), + upgradetest.WithPostUpgradeHook(postUpgradeHook)) + if !errors.Is(err, ErrPostExit) { + require.NoError(t, err) + } + + // rollback should now occur + + // wait for the agent to be healthy and back at the start version + err = upgradetest.WaitHealthyAndVersion(ctx, startFixture, startVersionInfo.Binary, 10*time.Minute, 10*time.Second, t) + if err != nil { + // agent never got healthy, but we need to ensure the watcher is stopped before continuing (this + // prevents this test failure from interfering with another test) + // this kills the watcher instantly and waits for it to be gone before continuing + watcherErr := upgradetest.WaitForNoWatcher(ctx, 1*time.Minute, time.Second, 100*time.Millisecond) + if watcherErr != nil { + t.Logf("failed to kill watcher due to agent not becoming healthy: %s", watcherErr) + } + } + require.NoError(t, err) + + // rollback should stop the watcher + // killTimeout is greater than timeout as the watcher should have been + // stopped on its own, and we don't want this test to hide that fact + err = upgradetest.WaitForNoWatcher(ctx, 2*time.Minute, 10*time.Second, 3*time.Minute) + require.NoError(t, err) + + // now that the watcher has stopped lets ensure that it's still the expected + // version, otherwise it's possible that it was rolled back to the original version + err = upgradetest.CheckHealthyAndVersion(ctx, startFixture, startVersionInfo.Binary) + assert.NoError(t, err) +} + +// TestStandaloneUpgradeRollbackOnRestarts tests the scenario where upgrading to a new version +// of Agent fails due to the new Agent binary not starting up. It checks that the Agent is +// rolled back to the previous version. +func TestStandaloneUpgradeRollbackOnRestarts(t *testing.T) { + define.Require(t, define.Requirements{ + Local: false, // requires Agent installation + Sudo: true, // requires Agent installation + }) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Start at the build version as we want to test the retry + // logic that is in the build. + startFixture, err := define.NewFixture(t, define.Version()) + require.NoError(t, err) + startVersionInfo, err := startFixture.ExecVersion(ctx) + require.NoError(t, err, "failed to get start agent build version info") + + // Upgrade to an old build. + upgradeToVersion, err := upgradetest.PreviousMinor(ctx, define.Version()) + require.NoError(t, err) + endFixture, err := atesting.NewFixture( + t, + upgradeToVersion, + atesting.WithFetcher(atesting.ArtifactFetcher()), + ) + require.NoError(t, err) + + t.Logf("Testing Elastic Agent upgrade from %s to %s...", define.Version(), upgradeToVersion) + + // Use the post-upgrade hook to bypass the remainder of the PerformUpgrade + // because we want to do our own checks for the rollback. + var ErrPostExit = errors.New("post exit") + postUpgradeHook := func() error { + return ErrPostExit + } + + err = upgradetest.PerformUpgrade( + ctx, startFixture, endFixture, t, + upgradetest.WithPostUpgradeHook(postUpgradeHook)) + if !errors.Is(err, ErrPostExit) { + require.NoError(t, err) + } + + // A few seconds after the upgrade, deliberately restart upgraded Agent a + // couple of times to simulate Agent crashing. + for restartIdx := 0; restartIdx < 3; restartIdx++ { + time.Sleep(10 * time.Second) + topPath := paths.Top() + + t.Logf("Restarting Agent via service to simulate crashing") + err = install.RestartService(topPath) + if err != nil && runtime.GOOS == define.Windows && strings.Contains(err.Error(), "The service has not been started.") { + // Due to the quick restarts every 10 seconds its possible that this is faster than Windows + // can handle. Decrementing restartIdx means that the loop will occur again. + t.Logf("Got an allowed error on Windows: %s", err) + restartIdx-- + continue + } + require.NoError(t, err) + } + + // wait for the agent to be healthy and back at the start version + err = upgradetest.WaitHealthyAndVersion(ctx, startFixture, startVersionInfo.Binary, 10*time.Minute, 10*time.Second, t) + if err != nil { + // agent never got healthy, but we need to ensure the watcher is stopped before continuing + // this kills the watcher instantly and waits for it to be gone before continuing + watcherErr := upgradetest.WaitForNoWatcher(ctx, 1*time.Minute, time.Second, 100*time.Millisecond) + if watcherErr != nil { + t.Logf("failed to kill watcher due to agent not becoming healthy: %s", watcherErr) + } + } + require.NoError(t, err) + + // rollback should stop the watcher + // killTimeout is greater than timeout as the watcher should have been + // stopped on its own, and we don't want this test to hide that fact + err = upgradetest.WaitForNoWatcher(ctx, 2*time.Minute, 10*time.Second, 3*time.Minute) + require.NoError(t, err) + + // now that the watcher has stopped lets ensure that it's still the expected + // version, otherwise it's possible that it was rolled back to the original version + err = upgradetest.CheckHealthyAndVersion(ctx, startFixture, startVersionInfo.Binary) + assert.NoError(t, err) +}