diff --git a/changelog/fragments/1718226176-Ensure.yaml b/changelog/fragments/1718226176-Ensure.yaml new file mode 100644 index 00000000000..e9d105b6d54 --- /dev/null +++ b/changelog/fragments/1718226176-Ensure.yaml @@ -0,0 +1,35 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: bug-fix + +# Change summary; a 80ish characters long description of the change. +summary: Ensure socket can be created even when STATE_PATH is too long for container command + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +description: | + When running the Elastic-Agent container command and `STATE_PATH` is + set to a value that is too long the Unix socket cannot be created, + so the Elastic-Agent will use the default state path instead. + +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: elastic-agent + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/elastic-agent/pull/4909 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +#issue: https://github.com/owner/repo/1234 diff --git a/internal/pkg/agent/cmd/container.go b/internal/pkg/agent/cmd/container.go index 8ceea38c14e..db454cf5fa9 100644 --- a/internal/pkg/agent/cmd/container.go +++ b/internal/pkg/agent/cmd/container.go @@ -9,6 +9,7 @@ import ( "encoding/json" "fmt" "io" + "io/fs" "net/url" "os" "os/exec" @@ -34,6 +35,7 @@ import ( "github.com/elastic/elastic-agent/pkg/component" "github.com/elastic/elastic-agent/pkg/core/logger" "github.com/elastic/elastic-agent/pkg/core/process" + "github.com/elastic/elastic-agent/pkg/utils" "github.com/elastic/elastic-agent/version" ) @@ -774,13 +776,20 @@ func setPaths(statePath, configPath, logsPath, socketPath string, writePaths boo if statePath == "" { statePath = defaultStateDirectory } + topPath := filepath.Join(statePath, "data") configPath = envWithDefault(configPath, "CONFIG_PATH") if configPath == "" { configPath = statePath } + if _, err := os.Stat(configPath); errors.Is(err, fs.ErrNotExist) { + if err := os.MkdirAll(configPath, 0755); err != nil { + return fmt.Errorf("cannot create folders for config path '%s': %w", configPath, err) + } + } + if socketPath == "" { - socketPath = fmt.Sprintf("unix://%s", filepath.Join(topPath, paths.ControlSocketName)) + socketPath = utils.SocketURLWithFallback(statePath, topPath) } // ensure that the directory and sub-directory data exists if err := os.MkdirAll(topPath, 0755); err != nil { diff --git a/testing/integration/container_cmd_test.go b/testing/integration/container_cmd_test.go index c9c0b3eccbe..fa20c65e27f 100644 --- a/testing/integration/container_cmd_test.go +++ b/testing/integration/container_cmd_test.go @@ -10,6 +10,8 @@ import ( "context" "fmt" "os" + "os/exec" + "path/filepath" "strings" "testing" "time" @@ -18,29 +20,12 @@ import ( "github.com/stretchr/testify/require" "github.com/elastic/elastic-agent-libs/kibana" + atesting "github.com/elastic/elastic-agent/pkg/testing" "github.com/elastic/elastic-agent/pkg/testing/define" "github.com/elastic/elastic-agent/pkg/testing/tools/fleettools" ) -func TestContainerCMD(t *testing.T) { - info := define.Require(t, define.Requirements{ - Stack: &define.Stack{}, - Local: false, - Sudo: true, - OS: []define.OS{ - {Type: define.Linux}, - }, - // This test runs the command we use when executing inside a container - // which leaves files under /usr/share/elastic-agent. Run it isolated - // to avoid interfering with other tests and better simulate a container - // environment we run it in isolation - Group: "container", - }) - ctx := context.Background() - - agentFixture, err := define.NewFixtureFromLocalBuild(t, define.Version()) - require.NoError(t, err) - +func createPolicy(t *testing.T, ctx context.Context, agentFixture *atesting.Fixture, info *define.Info) string { createPolicyReq := kibana.AgentPolicy{ Name: fmt.Sprintf("test-policy-enroll-%s", uuid.New().String()), Namespace: info.Namespace, @@ -74,13 +59,10 @@ func TestContainerCMD(t *testing.T) { t.Fatalf("unable to create enrolment API key: %s", err) } - fleetURL, err := fleettools.DefaultURL(ctx, info.KibanaClient) - if err != nil { - t.Fatalf("could not get Fleet URL: %s", err) - } + return enrollmentToken.APIKey +} - ctx, cancel := context.WithTimeout(ctx, 1*time.Minute) - defer cancel() +func prepareContainerCMD(t *testing.T, ctx context.Context, agentFixture *atesting.Fixture, info *define.Info, env []string) *exec.Cmd { cmd, err := agentFixture.PrepareAgentCommand(ctx, []string{"container"}) if err != nil { t.Fatalf("could not prepare agent command: %s", err) @@ -112,22 +94,52 @@ func TestContainerCMD(t *testing.T) { agentOutput := strings.Builder{} cmd.Stderr = &agentOutput cmd.Stdout = &agentOutput - cmd.Env = append(os.Environ(), + cmd.Env = append(os.Environ(), env...) + return cmd +} + +func TestContainerCMD(t *testing.T) { + info := define.Require(t, define.Requirements{ + Stack: &define.Stack{}, + Local: false, + Sudo: true, + OS: []define.OS{ + {Type: define.Linux}, + }, + Group: "container", + }) + + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + defer cancel() + + agentFixture, err := define.NewFixtureFromLocalBuild(t, define.Version()) + require.NoError(t, err) + + fleetURL, err := fleettools.DefaultURL(ctx, info.KibanaClient) + if err != nil { + t.Fatalf("could not get Fleet URL: %s", err) + } + + enrollmentToken := createPolicy(t, ctx, agentFixture, info) + env := []string{ "FLEET_ENROLL=1", - "FLEET_URL="+fleetURL, - "FLEET_ENROLLMENT_TOKEN="+enrollmentToken.APIKey, + "FLEET_URL=" + fleetURL, + "FLEET_ENROLLMENT_TOKEN=" + enrollmentToken, // As the agent isn't built for a container, it's upgradable, triggering // the start of the upgrade watcher. If `STATE_PATH` isn't set, the // upgrade watcher will commence from a different path within the // container, distinct from the current execution path. - "STATE_PATH="+agentFixture.WorkDir(), - ) + "STATE_PATH=" + agentFixture.WorkDir(), + } + cmd := prepareContainerCMD(t, ctx, agentFixture, info, env) t.Logf(">> running binary with: %v", cmd.Args) if err := cmd.Start(); err != nil { t.Fatalf("error running container cmd: %s", err) } + agentOutput := cmd.Stderr.(*strings.Builder) + require.Eventuallyf(t, func() bool { // This will return errors until it connects to the agent, // they're mostly noise because until the agent starts running @@ -140,6 +152,109 @@ func TestContainerCMD(t *testing.T) { }, 5*time.Minute, time.Second, "Elastic-Agent did not report healthy. Agent status error: \"%v\", Agent logs\n%s", - err, &agentOutput, + err, agentOutput, ) } + +func TestContainerCMDWithAVeryLongStatePath(t *testing.T) { + info := define.Require(t, define.Requirements{ + Stack: &define.Stack{}, + Local: false, + Sudo: true, + OS: []define.OS{ + {Type: define.Linux}, + }, + Group: "container", + }) + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + defer cancel() + + fleetURL, err := fleettools.DefaultURL(ctx, info.KibanaClient) + if err != nil { + t.Fatalf("could not get Fleet URL: %s", err) + } + + testCases := map[string]struct { + statePath string + expectedStatePath string + expectedSocketPath string + expectError bool + }{ + "small path": { // Use the set path + statePath: filepath.Join(os.TempDir(), "foo", "bar"), + expectedStatePath: filepath.Join(os.TempDir(), "foo", "bar"), + expectedSocketPath: "/tmp/foo/bar/data/smp7BzlzcwgrLK4PUxpu7G1O5UwV4adr.sock", + }, + "no path set": { // Use the default path + statePath: "", + expectedStatePath: "/usr/share/elastic-agent/state", + expectedSocketPath: "/usr/share/elastic-agent/state/data/Td8I7R-Zby36_zF_IOd9QVNlFblNEro3.sock", + }, + "long path": { // Path too long to create a unix socket, it will use /tmp/elastic-agent + statePath: "/tmp/ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff", + expectedStatePath: "/tmp/ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff", + expectedSocketPath: "/tmp/elastic-agent/Xegnlbb8QDcqNLPzyf2l8PhVHjWvlQgZ.sock", + }, + } + + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + agentFixture, err := define.NewFixtureFromLocalBuild(t, define.Version()) + require.NoError(t, err) + + enrollmentToken := createPolicy(t, ctx, agentFixture, info) + env := []string{ + "FLEET_ENROLL=1", + "FLEET_URL=" + fleetURL, + "FLEET_ENROLLMENT_TOKEN=" + enrollmentToken, + "STATE_PATH=" + tc.statePath, + } + + cmd := prepareContainerCMD(t, ctx, agentFixture, info, env) + t.Logf(">> running binary with: %v", cmd.Args) + if err := cmd.Start(); err != nil { + t.Fatalf("error running container cmd: %s", err) + } + agentOutput := cmd.Stderr.(*strings.Builder) + + require.Eventuallyf(t, func() bool { + // This will return errors until it connects to the agent, + // they're mostly noise because until the agent starts running + // we will get connection errors. If the test fails + // the agent logs will be present in the error message + // which should help to explain why the agent was not + // healthy. + err = agentFixture.IsHealthy(ctx) + return err == nil + }, + 1*time.Minute, time.Second, + "Elastic-Agent did not report healthy. Agent status error: \"%v\", Agent logs\n%s", + err, agentOutput, + ) + + t.Cleanup(func() { + _ = os.RemoveAll(tc.expectedStatePath) + }) + + // Now that the Elastic-Agent is healthy, check that the control socket path + // is the expected one + if _, err := os.Stat(tc.expectedStatePath); err != nil { + t.Errorf("cannot stat expected state path ('%s'): %s", tc.expectedStatePath, err) + } + if _, err := os.Stat(tc.expectedSocketPath); err != nil { + t.Errorf("cannot stat expected socket path ('%s'): %s", tc.expectedSocketPath, err) + } + + if t.Failed() { + containerPaths, err := os.ReadFile(filepath.Join(agentFixture.WorkDir(), "container-paths.yml")) + if err != nil { + t.Fatalf("could not read container-paths.yml: %s", err) + } + + t.Log("contents of 'container-paths-yml'") + t.Log(string(containerPaths)) + } + }) + } +}