Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure socket path is < 104 characters when STATE_PATH is set. #4909

Merged
merged 11 commits into from
Jun 21, 2024
35 changes: 35 additions & 0 deletions changelog/fragments/1718226176-Ensure.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: bug-fix

# Change summary; a 80ish characters long description of the change.
summary: Ensure socket can be created even when STATE_PATH is too long for container command

# Long description; in case the summary is not enough to describe the change
# this field accommodate a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
description: |
When running the Elastic-Agent container command and `STATE_PATH` is
set to a value that is too long the Unix socket cannot be created,
so the Elastic-Agent will use the default state path instead.

# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
component: elastic-agent

# PR URL; optional; the PR number that added the changeset.
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
pr: https://github.com/elastic/elastic-agent/pull/4909

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present is automatically filled by the tooling with the issue linked to the PR number.
#issue: https://github.com/owner/repo/1234
11 changes: 10 additions & 1 deletion internal/pkg/agent/cmd/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"encoding/json"
"fmt"
"io"
"io/fs"
"net/url"
"os"
"os/exec"
Expand All @@ -34,6 +35,7 @@ import (
"github.com/elastic/elastic-agent/pkg/component"
"github.com/elastic/elastic-agent/pkg/core/logger"
"github.com/elastic/elastic-agent/pkg/core/process"
"github.com/elastic/elastic-agent/pkg/utils"
"github.com/elastic/elastic-agent/version"
)

Expand Down Expand Up @@ -774,13 +776,20 @@ func setPaths(statePath, configPath, logsPath, socketPath string, writePaths boo
if statePath == "" {
statePath = defaultStateDirectory
}

topPath := filepath.Join(statePath, "data")
configPath = envWithDefault(configPath, "CONFIG_PATH")
if configPath == "" {
configPath = statePath
}
if _, err := os.Stat(configPath); errors.Is(err, fs.ErrNotExist) {
if err := os.MkdirAll(configPath, 0755); err != nil {
return fmt.Errorf("cannot create folders for config path '%s': %w", configPath, err)
}
}

if socketPath == "" {
socketPath = fmt.Sprintf("unix://%s", filepath.Join(topPath, paths.ControlSocketName))
socketPath = utils.SocketURLWithFallback(statePath, topPath)
}
// ensure that the directory and sub-directory data exists
if err := os.MkdirAll(topPath, 0755); err != nil {
Expand Down
177 changes: 146 additions & 31 deletions testing/integration/container_cmd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ import (
"context"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"testing"
"time"
Expand All @@ -18,29 +20,12 @@ import (
"github.com/stretchr/testify/require"

"github.com/elastic/elastic-agent-libs/kibana"
atesting "github.com/elastic/elastic-agent/pkg/testing"
"github.com/elastic/elastic-agent/pkg/testing/define"
"github.com/elastic/elastic-agent/pkg/testing/tools/fleettools"
)

func TestContainerCMD(t *testing.T) {
info := define.Require(t, define.Requirements{
Stack: &define.Stack{},
Local: false,
Sudo: true,
OS: []define.OS{
{Type: define.Linux},
},
// This test runs the command we use when executing inside a container
// which leaves files under /usr/share/elastic-agent. Run it isolated
// to avoid interfering with other tests and better simulate a container
// environment we run it in isolation
Group: "container",
})
ctx := context.Background()

agentFixture, err := define.NewFixtureFromLocalBuild(t, define.Version())
require.NoError(t, err)

func createPolicy(t *testing.T, ctx context.Context, agentFixture *atesting.Fixture, info *define.Info) string {
createPolicyReq := kibana.AgentPolicy{
Name: fmt.Sprintf("test-policy-enroll-%s", uuid.New().String()),
Namespace: info.Namespace,
Expand Down Expand Up @@ -74,13 +59,10 @@ func TestContainerCMD(t *testing.T) {
t.Fatalf("unable to create enrolment API key: %s", err)
}

fleetURL, err := fleettools.DefaultURL(ctx, info.KibanaClient)
if err != nil {
t.Fatalf("could not get Fleet URL: %s", err)
}
return enrollmentToken.APIKey
}

ctx, cancel := context.WithTimeout(ctx, 1*time.Minute)
defer cancel()
func prepareContainerCMD(t *testing.T, ctx context.Context, agentFixture *atesting.Fixture, info *define.Info, env []string) *exec.Cmd {
cmd, err := agentFixture.PrepareAgentCommand(ctx, []string{"container"})
if err != nil {
t.Fatalf("could not prepare agent command: %s", err)
Expand Down Expand Up @@ -112,22 +94,52 @@ func TestContainerCMD(t *testing.T) {
agentOutput := strings.Builder{}
cmd.Stderr = &agentOutput
cmd.Stdout = &agentOutput
cmd.Env = append(os.Environ(),
cmd.Env = append(os.Environ(), env...)
return cmd
}

func TestContainerCMD(t *testing.T) {
info := define.Require(t, define.Requirements{
Stack: &define.Stack{},
Local: false,
Sudo: true,
OS: []define.OS{
{Type: define.Linux},
},
Group: "container",
})

ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
defer cancel()

agentFixture, err := define.NewFixtureFromLocalBuild(t, define.Version())
require.NoError(t, err)

fleetURL, err := fleettools.DefaultURL(ctx, info.KibanaClient)
if err != nil {
t.Fatalf("could not get Fleet URL: %s", err)
}

enrollmentToken := createPolicy(t, ctx, agentFixture, info)
env := []string{
"FLEET_ENROLL=1",
"FLEET_URL="+fleetURL,
"FLEET_ENROLLMENT_TOKEN="+enrollmentToken.APIKey,
"FLEET_URL=" + fleetURL,
"FLEET_ENROLLMENT_TOKEN=" + enrollmentToken,
// As the agent isn't built for a container, it's upgradable, triggering
// the start of the upgrade watcher. If `STATE_PATH` isn't set, the
// upgrade watcher will commence from a different path within the
// container, distinct from the current execution path.
"STATE_PATH="+agentFixture.WorkDir(),
)
"STATE_PATH=" + agentFixture.WorkDir(),
}

cmd := prepareContainerCMD(t, ctx, agentFixture, info, env)
t.Logf(">> running binary with: %v", cmd.Args)
if err := cmd.Start(); err != nil {
t.Fatalf("error running container cmd: %s", err)
}

agentOutput := cmd.Stderr.(*strings.Builder)
belimawr marked this conversation as resolved.
Show resolved Hide resolved

require.Eventuallyf(t, func() bool {
// This will return errors until it connects to the agent,
// they're mostly noise because until the agent starts running
Expand All @@ -140,6 +152,109 @@ func TestContainerCMD(t *testing.T) {
},
5*time.Minute, time.Second,
"Elastic-Agent did not report healthy. Agent status error: \"%v\", Agent logs\n%s",
err, &agentOutput,
err, agentOutput,
)
}

func TestContainerCMDWithAVeryLongStatePath(t *testing.T) {
info := define.Require(t, define.Requirements{
Stack: &define.Stack{},
Local: false,
Sudo: true,
OS: []define.OS{
{Type: define.Linux},
},
Group: "container",
})

ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
defer cancel()

fleetURL, err := fleettools.DefaultURL(ctx, info.KibanaClient)
if err != nil {
t.Fatalf("could not get Fleet URL: %s", err)
}

testCases := map[string]struct {
statePath string
expectedStatePath string
expectedSocketPath string
expectError bool
}{
"small path": { // Use the set path
statePath: filepath.Join(os.TempDir(), "foo", "bar"),
expectedStatePath: filepath.Join(os.TempDir(), "foo", "bar"),
expectedSocketPath: "/tmp/foo/bar/data/smp7BzlzcwgrLK4PUxpu7G1O5UwV4adr.sock",
},
"no path set": { // Use the default path
statePath: "",
expectedStatePath: "/usr/share/elastic-agent/state",
expectedSocketPath: "/usr/share/elastic-agent/state/data/Td8I7R-Zby36_zF_IOd9QVNlFblNEro3.sock",
},
"long path": { // Path too long to create a unix socket, it will use /tmp/elastic-agent
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aren't most of these test cases just tests of the SocketURLWithFallback() function? How long does this new set of tests take to run?

Do you get the same effective test coverage if you update the single existing container command to use a path longer than 107 characters as the exception path?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aren't most of these test cases just tests of the SocketURLWithFallback() function? How long does this new set of tests take to run?

No, they're testing the https://github.com/elastic/elastic-agent/blob/a6d91b029ae8c0b8ccf7bb391c8ac304fe1e9368/internal/pkg/agent/cmd/container.go#L773-L837 function and how it decides to create the socket path. The fact that it calls SocketURLWithFallback is an implementation detail. It also needs to create some folders:

All of this gets tested by the integration test.

Maybe we can remove one test case: 107 characters path as it essentially is the same as small path.

statePath: "/tmp/ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
expectedStatePath: "/tmp/ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
expectedSocketPath: "/tmp/elastic-agent/Xegnlbb8QDcqNLPzyf2l8PhVHjWvlQgZ.sock",
},
}

for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
agentFixture, err := define.NewFixtureFromLocalBuild(t, define.Version())
require.NoError(t, err)

enrollmentToken := createPolicy(t, ctx, agentFixture, info)
env := []string{
"FLEET_ENROLL=1",
"FLEET_URL=" + fleetURL,
"FLEET_ENROLLMENT_TOKEN=" + enrollmentToken,
"STATE_PATH=" + tc.statePath,
}

cmd := prepareContainerCMD(t, ctx, agentFixture, info, env)
t.Logf(">> running binary with: %v", cmd.Args)
if err := cmd.Start(); err != nil {
t.Fatalf("error running container cmd: %s", err)
}
agentOutput := cmd.Stderr.(*strings.Builder)

require.Eventuallyf(t, func() bool {
// This will return errors until it connects to the agent,
// they're mostly noise because until the agent starts running
// we will get connection errors. If the test fails
// the agent logs will be present in the error message
// which should help to explain why the agent was not
// healthy.
err = agentFixture.IsHealthy(ctx)
return err == nil
},
1*time.Minute, time.Second,
"Elastic-Agent did not report healthy. Agent status error: \"%v\", Agent logs\n%s",
err, agentOutput,
)

t.Cleanup(func() {
_ = os.RemoveAll(tc.expectedStatePath)
})

// Now that the Elastic-Agent is healthy, check that the control socket path
// is the expected one
if _, err := os.Stat(tc.expectedStatePath); err != nil {
t.Errorf("cannot stat expected state path ('%s'): %s", tc.expectedStatePath, err)
}
if _, err := os.Stat(tc.expectedSocketPath); err != nil {
t.Errorf("cannot stat expected socket path ('%s'): %s", tc.expectedSocketPath, err)
}

if t.Failed() {
containerPaths, err := os.ReadFile(filepath.Join(agentFixture.WorkDir(), "container-paths.yml"))
if err != nil {
t.Fatalf("could not read container-paths.yml: %s", err)
}

t.Log("contents of 'container-paths-yml'")
t.Log(string(containerPaths))
}
})
}
}