From cc1232e4be92b30625e6d38080d9d785536bb64c Mon Sep 17 00:00:00 2001 From: Romain Dauby Date: Wed, 8 Jan 2025 08:24:59 -0500 Subject: [PATCH] [extension/cgroupruntime] Be aware of ECS task and CPU limits (#36920) #### Description Allow the cgroupruntime extension to set GOMAXPROCS based on AWS ECS metadata. See related issue for detailed informations. #### Link to tracking issue Fixes #36814 #### Testing Added integration test with `httptest` for the ECS metadata endpoint. Something to clarify: https://github.com/open-telemetry/opentelemetry-collector-contrib/pull/36617#issuecomment-2557877397 #### Documentation Added extension name and link in the README. --------- Signed-off-by: Romain Dauby Co-authored-by: Roger Coll --- ...k-ecs-metadata-cgroupruntimeextension.yaml | 27 ++ .../cgroupruntimeextension/CONTRIBUTING.md | 26 ++ extension/cgroupruntimeextension/README.md | 6 +- extension/cgroupruntimeextension/factory.go | 9 +- extension/cgroupruntimeextension/go.mod | 3 +- extension/cgroupruntimeextension/go.sum | 2 + .../integration_test.go | 235 +++++++++++++++--- 7 files changed, 272 insertions(+), 36 deletions(-) create mode 100644 .chloggen/check-ecs-metadata-cgroupruntimeextension.yaml create mode 100644 extension/cgroupruntimeextension/CONTRIBUTING.md diff --git a/.chloggen/check-ecs-metadata-cgroupruntimeextension.yaml b/.chloggen/check-ecs-metadata-cgroupruntimeextension.yaml new file mode 100644 index 000000000000..0a8a2728ee60 --- /dev/null +++ b/.chloggen/check-ecs-metadata-cgroupruntimeextension.yaml @@ -0,0 +1,27 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: enhancement + +# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) +component: cgroupruntimeextension + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). 
+note: Implement ECS metadata retrieval for cgroupruntime extension. + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. +issues: [36814] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: + +# If your change doesn't affect end users or the exported elements of any package, +# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. +# Optional: The change log or logs in which this entry should be included. +# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. +# Default: '[user]' +change_logs: [] \ No newline at end of file diff --git a/extension/cgroupruntimeextension/CONTRIBUTING.md b/extension/cgroupruntimeextension/CONTRIBUTING.md new file mode 100644 index 000000000000..a5fbbc80db1b --- /dev/null +++ b/extension/cgroupruntimeextension/CONTRIBUTING.md @@ -0,0 +1,26 @@ +# Contributing to the Cgroup Go runtime extension + +In order to contribute to this extension, it might be useful to have a working local setup. + +## Testing + +Some Linux distributions don't run systemd under cgroupv2. To run the integration tests locally for this extension, you can follow these steps. + +Inside the extension folder, start a privileged docker container and share the code with the container + +```bash +cd extension/cgroupruntimeextension +docker run -ti --privileged --cgroupns=host -v $(pwd):/workspace -w /workspace debian:bookworm-slim +``` + +Install the [Go version](https://go.dev/dl/) specified in the extension's [go.mod](./go.mod) and the GCC compiler to run the integration test. 
The following is an example command for Go `1.23.4` on an `amd64` system: + +```bash +apt update && apt install -y wget sudo gcc && wget https://go.dev/dl/go1.23.4.linux-amd64.tar.gz && tar -C /usr/local -xzf go1.23.4.linux-amd64.tar.gz && export PATH=$PATH:/usr/local/go/bin && go version && rm go1.23.4.linux-amd64.tar.gz +``` + +Run the integration test + +```bash +CGO_ENABLED=1 go test -v -exec sudo -race -timeout 360s -parallel 4 -tags=integration,"" +``` diff --git a/extension/cgroupruntimeextension/README.md b/extension/cgroupruntimeextension/README.md index f7d79099d6c1..8af1a28d88b1 100644 --- a/extension/cgroupruntimeextension/README.md +++ b/extension/cgroupruntimeextension/README.md @@ -15,7 +15,7 @@ ## Overview -The OpenTelemetry Cgroup Auto-Config Extension is designed to optimize Go runtime performance in containerized environments by automatically configuring GOMAXPROCS and GOMEMLIMIT based on the Linux cgroup filesystem. This extension leverages [automaxprocs](https://github.com/uber-go/automaxprocs) and [automemlimit](https://github.com/KimMachineGun/automemlimit) packages to dynamically adjust Go runtime variables, ensuring efficient resource usage aligned with container limits. +The OpenTelemetry Cgroup Auto-Config Extension is designed to optimize Go runtime performance in containerized environments by automatically configuring GOMAXPROCS and GOMEMLIMIT based on the Linux cgroup filesystem. This extension leverages [automaxprocs](https://github.com/uber-go/automaxprocs) or [gomaxecs](https://github.com/rdforte/gomaxecs) for AWS ECS Tasks and [automemlimit](https://github.com/KimMachineGun/automemlimit) packages to dynamically adjust Go runtime variables, ensuring efficient resource usage aligned with container limits. ## Configuration @@ -40,3 +40,7 @@ extension: enabled: true ratio: 0.8 ``` + +## Contributing + +See [CONTRIBUTING.md](CONTRIBUTING.md) for information on how to contribute to this extension. 
diff --git a/extension/cgroupruntimeextension/factory.go b/extension/cgroupruntimeextension/factory.go index 1905811dc3b8..fdc6fca6fe12 100644 --- a/extension/cgroupruntimeextension/factory.go +++ b/extension/cgroupruntimeextension/factory.go @@ -9,6 +9,7 @@ import ( "runtime/debug" "github.com/KimMachineGun/automemlimit/memlimit" + gomaxecs "github.com/rdforte/gomaxecs/maxprocs" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/extension" "go.uber.org/automaxprocs/maxprocs" @@ -42,10 +43,14 @@ func createExtension(_ context.Context, set extension.Settings, cfg component.Co cgroupConfig := cfg.(*Config) return newCgroupRuntime(cgroupConfig, set.Logger, func() (undoFunc, error) { - undo, err := maxprocs.Set(maxprocs.Logger(func(str string, params ...any) { + if gomaxecs.IsECS() { + return gomaxecs.Set(gomaxecs.WithLogger(func(str string, params ...any) { + set.Logger.Debug(fmt.Sprintf(str, params)) + })) + } + return maxprocs.Set(maxprocs.Logger(func(str string, params ...any) { set.Logger.Debug(fmt.Sprintf(str, params)) })) - return undoFunc(undo), err }, func(ratio float64) (undoFunc, error) { initial, err := memlimit.SetGoMemLimitWithOpts(memlimit.WithRatio(ratio)) diff --git a/extension/cgroupruntimeextension/go.mod b/extension/cgroupruntimeextension/go.mod index 6cb62ee140ad..5a853b18e436 100644 --- a/extension/cgroupruntimeextension/go.mod +++ b/extension/cgroupruntimeextension/go.mod @@ -1,10 +1,11 @@ module github.com/open-telemetry/opentelemetry-collector-contrib/extension/cgroupruntimeextension -go 1.22.0 +go 1.22.4 require ( github.com/KimMachineGun/automemlimit v0.7.0 github.com/containerd/cgroups/v3 v3.0.5 + github.com/rdforte/gomaxecs v1.1.0 github.com/stretchr/testify v1.10.0 go.opentelemetry.io/collector/component v0.117.0 go.opentelemetry.io/collector/component/componenttest v0.117.0 diff --git a/extension/cgroupruntimeextension/go.sum b/extension/cgroupruntimeextension/go.sum index a6f22f103970..b889e27a984e 100644 --- 
a/extension/cgroupruntimeextension/go.sum +++ b/extension/cgroupruntimeextension/go.sum @@ -63,6 +63,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= +github.com/rdforte/gomaxecs v1.1.0 h1:fpDkJtuBRtRQjcxAKdARjwjYzxlmmGkSmcqzF0UKuOg= +github.com/rdforte/gomaxecs v1.1.0/go.mod h1:8agrawOmcvb+oBa6EnV2oADDtnDtkVx1Q0H/Ht7GiFc= github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= diff --git a/extension/cgroupruntimeextension/integration_test.go b/extension/cgroupruntimeextension/integration_test.go index 1dd8c89647ff..29032012a295 100644 --- a/extension/cgroupruntimeextension/integration_test.go +++ b/extension/cgroupruntimeextension/integration_test.go @@ -11,6 +11,8 @@ import ( "context" "fmt" "math" + "net/http" + "net/http/httptest" "os" "path" "path/filepath" @@ -30,6 +32,7 @@ import ( const ( defaultCgroup2Path = "/sys/fs/cgroup" + ecsMetadataURI = "ECS_CONTAINER_METADATA_URI_V4" ) // checkCgroupSystem skips the test if is not run in a cgroupv2 system @@ -46,7 +49,40 @@ func checkCgroupSystem(tb testing.TB) { } } -// cgroupMaxCPU returns the CPU max definition for a given cgroup slice path +func pointerInt64(val int64) *int64 { + return &val +} + +func pointerUint64(uval uint64) *uint64 { + return &uval +} + +// setupMemoryCgroupCleanUp returns a cleanup function that restores the cgroup's max memory to its initial value +func setupMemoryCgroupCleanUp(t *testing.T, manager *cgroup2.Manager, cgroupPath string) func() { + stats, err := manager.Stat() + 
require.NoError(t, err) + + initialMaxMemory := stats.GetMemory().GetUsageLimit() + memoryCgroupCleanUp := func() { + err = manager.Update(&cgroup2.Resources{ + Memory: &cgroup2.Memory{ + Max: pointerInt64(int64(initialMaxMemory)), + }, + }) + assert.NoError(t, err) + } + + if initialMaxMemory == math.MaxUint64 { + // fallback solution to set cgroup's max memory to "max" + memoryCgroupCleanUp = func() { + err = os.WriteFile(path.Join(defaultCgroup2Path, cgroupPath, "memory.max"), []byte("max"), 0o600) + assert.NoError(t, err) + } + } + return memoryCgroupCleanUp +} + +// cgroupMaxCPU returns the CPU max definition for a given cgroup slice path // File format: cpu_quote cpu_period func cgroupMaxCPU(filename string) (quota int64, period uint64, err error) { out, err := os.ReadFile(filepath.Join(defaultCgroup2Path, filename, "cpu.max")) @@ -63,14 +99,19 @@ func cgroupMaxCPU(filename string) (quota int64, period uint64, err error) { return quota, period, err } +// startExtension starts the extension with the given config +func startExtension(t *testing.T, config *Config) { + factory := NewFactory() + ctx := context.Background() + extension, err := factory.Create(ctx, extensiontest.NewNopSettings(), config) + require.NoError(t, err) + + err = extension.Start(ctx, componenttest.NewNopHost()) + require.NoError(t, err) +} + func TestCgroupV2SudoIntegration(t *testing.T) { checkCgroupSystem(t) - pointerInt64 := func(val int64) *int64 { - return &val - } - pointerUint64 := func(uval uint64) *uint64 { - return &uval - } tests := []struct { name string @@ -151,27 +192,8 @@ func TestCgroupV2SudoIntegration(t *testing.T) { manager, err := cgroup2.Load(cgroupPath) assert.NoError(t, err) - stats, err := manager.Stat() - require.NoError(t, err) - // Startup resource values - initialMaxMemory := stats.GetMemory().GetUsageLimit() - memoryCgroupCleanUp := func() { - err = manager.Update(&cgroup2.Resources{ - Memory: &cgroup2.Memory{ - Max: pointerInt64(int64(initialMaxMemory)), - }, - 
}) - assert.NoError(t, err) - } - - if initialMaxMemory == math.MaxUint64 { - // fallback solution to set cgroup's max memory to "max" - memoryCgroupCleanUp = func() { - err = os.WriteFile(path.Join(defaultCgroup2Path, cgroupPath, "memory.max"), []byte("max"), 0o600) - assert.NoError(t, err) - } - } + memoryCgroupCleanUp := setupMemoryCgroupCleanUp(t, manager, cgroupPath) initialCPUQuota, initialCPUPeriod, err := cgroupMaxCPU(cgroupPath) require.NoError(t, err) @@ -220,14 +242,163 @@ func TestCgroupV2SudoIntegration(t *testing.T) { }) require.NoError(t, err) - factory := NewFactory() - ctx := context.Background() - extension, err := factory.Create(ctx, extensiontest.NewNopSettings(), test.config) - require.NoError(t, err) + startExtension(t, test.config) + + assert.Equal(t, test.expectedGoMaxProcs, runtime.GOMAXPROCS(-1)) + assert.Equal(t, test.expectedGoMemLimit, debug.SetMemoryLimit(-1)) + }) + } +} + +func testServerECSMetadata(t *testing.T, containerCPU, taskCPU int) *httptest.Server { + t.Helper() + + mux := http.NewServeMux() + mux.HandleFunc("/", func(w http.ResponseWriter, _ *http.Request) { + _, err := w.Write([]byte(fmt.Sprintf(`{"Limits":{"CPU":%d},"DockerId":"container-id"}`, containerCPU))) + assert.NoError(t, err) + }) + mux.HandleFunc("/task", func(w http.ResponseWriter, _ *http.Request) { + _, err := w.Write([]byte(fmt.Sprintf( + `{"Containers":[{"DockerId":"container-id","Limits":{"CPU":%d}}],"Limits":{"CPU":%d}}`, + containerCPU, + taskCPU, + ))) + assert.NoError(t, err) + }) + + return httptest.NewServer(mux) +} + +func TestECSCgroupV2SudoIntegration(t *testing.T) { + checkCgroupSystem(t) + + tests := []struct { + name string + containerCPU int + taskCPU int + cgroupMaxMemory int64 + config *Config + expectedGoMaxProcs int + expectedGoMemLimit int64 + }{ + { + name: "90% the max cgroup memory and 4 GOMAXPROCS w/ 4096 container cpu 16 task cpu", + containerCPU: 4096, + taskCPU: 16, + // 128 Mb + cgroupMaxMemory: 134217728, + config: &Config{ + 
GoMaxProcs: GoMaxProcsConfig{ + Enabled: true, + }, + GoMemLimit: GoMemLimitConfig{ + Enabled: true, + Ratio: 0.9, + }, + }, + expectedGoMaxProcs: 4, + // 134217728 * 0.9 + expectedGoMemLimit: 120795955, + }, + { + name: "50% of the max cgroup memory and 1 GOMAXPROCS w/ 2048 container cpu 2 task cpu", + containerCPU: 2048, + taskCPU: 2, + // 128 Mb + cgroupMaxMemory: 134217728, + config: &Config{ + GoMaxProcs: GoMaxProcsConfig{ + Enabled: true, + }, + GoMemLimit: GoMemLimitConfig{ + Enabled: true, + Ratio: 0.5, + }, + }, + expectedGoMaxProcs: 2, + // 134217728 * 0.5 + expectedGoMemLimit: 67108864, + }, + { + name: "50% of the max cgroup memory and 1 GOMAXPROCS w/ 1024 container cpu 4 task cpu", + containerCPU: 1024, + taskCPU: 4, + // 128 Mb + cgroupMaxMemory: 134217728, + config: &Config{ + GoMaxProcs: GoMaxProcsConfig{ + Enabled: true, + }, + GoMemLimit: GoMemLimitConfig{ + Enabled: true, + Ratio: 0.5, + }, + }, + expectedGoMaxProcs: 1, + // 134217728 * 0.5 + expectedGoMemLimit: 67108864, + }, + { + name: "10% of the max cgroup memory and 4 GOMAXPROCS w/ 4096 container cpu 0 task cpu", + containerCPU: 4096, + taskCPU: 0, + // 128 Mb + cgroupMaxMemory: 134217728, + config: &Config{ + GoMaxProcs: GoMaxProcsConfig{ + Enabled: true, + }, + GoMemLimit: GoMemLimitConfig{ + Enabled: true, + Ratio: 0.1, + }, + }, + expectedGoMaxProcs: 4, + // 134217728 * 0.1 + expectedGoMemLimit: 13421772, + }, + } + + cgroupPath, err := cgroup2.PidGroupPath(os.Getpid()) + assert.NoError(t, err) + manager, err := cgroup2.Load(cgroupPath) + assert.NoError(t, err) + + // Startup resource values + memoryCgroupCleanUp := setupMemoryCgroupCleanUp(t, manager, cgroupPath) - err = extension.Start(ctx, componenttest.NewNopHost()) + initialGoMem := debug.SetMemoryLimit(-1) + initialGoProcs := runtime.GOMAXPROCS(-1) + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + // running in ECS environment, set the ECS metadata URI environment variable + // to get the Cgroup CPU quota 
from the httptest server + server := testServerECSMetadata(t, test.containerCPU, test.taskCPU) + t.Setenv(ecsMetadataURI, server.URL) + // restore startup cgroup initial resource values + t.Cleanup(func() { + debug.SetMemoryLimit(initialGoMem) + runtime.GOMAXPROCS(initialGoProcs) + memoryCgroupCleanUp() + server.Close() + os.Unsetenv(ecsMetadataURI) + }) + + err = manager.Update(&cgroup2.Resources{ + Memory: &cgroup2.Memory{ + // Default max memory must be + // overwritten + // so that automemlimit changes the GOMEMLIMIT + // value + Max: pointerInt64(test.cgroupMaxMemory), + }, + }) require.NoError(t, err) + startExtension(t, test.config) + assert.Equal(t, test.expectedGoMaxProcs, runtime.GOMAXPROCS(-1)) assert.Equal(t, test.expectedGoMemLimit, debug.SetMemoryLimit(-1)) })