diff --git a/changelog/fragments/1696249276-Start-stop-monitoring-server-based-on-monitoring-config.yaml b/changelog/fragments/1696249276-Start-stop-monitoring-server-based-on-monitoring-config.yaml
new file mode 100644
index 00000000000..141c18715bf
--- /dev/null
+++ b/changelog/fragments/1696249276-Start-stop-monitoring-server-based-on-monitoring-config.yaml
@@ -0,0 +1,31 @@
+# Kind can be one of:
+# - breaking-change: a change to previously-documented behavior
+# - deprecation: functionality that is being removed in a later release
+# - bug-fix: fixes a problem in a previous version
+# - enhancement: extends functionality but does not break or fix existing behavior
+# - feature: new functionality
+# - known-issue: problems that we are aware of in a given version
+# - security: impacts on the security of a product or a user’s deployment.
+# - upgrade: important information for someone upgrading from a prior version
+# - other: does not fit into any of the other categories
+kind: enhancement
+
+# Change summary; an 80ish-character description of the change.
+summary: Start/stop monitoring server based on monitoring config
+
+# Long description; in case the summary is not enough to describe the change
+# this field accommodates a description without length limits.
+#description:
+
+# Affected component; a word indicating the component this changeset affects.
+component: elastic-agent
+
+# PR number; optional; the PR number that added the changeset.
+# If not present, it is automatically filled by the tooling by finding the PR where this changelog fragment has been added.
+# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
+# Please provide it if you are adding a fragment for a different PR.
+pr: 3492
+
+# Issue number; optional; the GitHub issue related to this changeset (either closes or is part of).
+# If not present, it is automatically filled by the tooling with the issue linked to the PR number.
+issue: 2735
diff --git a/docs/test-framework-dev-guide.md b/docs/test-framework-dev-guide.md
index 0821f1a5996..3e7ef9383b8 100644
--- a/docs/test-framework-dev-guide.md
+++ b/docs/test-framework-dev-guide.md
@@ -10,7 +10,7 @@ Go version should be at least the same as the one in [.go-version](https://git
 
 ### Configuration
 
-ESS (QA) API Key to create on https://console.qa.cld.elstc.co/deployment-features/keys
+ESS (staging) API Key to create on https://staging.found.no/account/keys
 
 Warning: if you never created a deployment on it, you won't have permission to get this key, so you will need to create one first.
 
diff --git a/internal/pkg/agent/application/coordinator/coordinator.go b/internal/pkg/agent/application/coordinator/coordinator.go
index e6a2dbc182f..856c624076f 100644
--- a/internal/pkg/agent/application/coordinator/coordinator.go
+++ b/internal/pkg/agent/application/coordinator/coordinator.go
@@ -158,6 +158,10 @@ type ComponentsModifier func(comps []component.Component, cfg map[string]interfa
 // CoordinatorShutdownTimeout is how long the coordinator will wait during shutdown to receive a "clean" shutdown from other components
 var CoordinatorShutdownTimeout = time.Second * 5
 
+type configReloader interface {
+	Reload(*config.Config) error
+}
+
 // Coordinator manages the entire state of the Elastic Agent.
 //
 // All configuration changes, update variables, and upgrade actions are managed and controlled by the coordinator.
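Note: the `configReloader` interface added above is deliberately minimal, so the coordinator stays decoupled from the monitoring package: anything exposing `Reload(*config.Config) error` can be registered. A minimal sketch of the contract, using a hypothetical `noopReloader` for illustration only (the concrete implementation in this PR is `reload.ServerReloader`):

    // noopReloader is a hypothetical configReloader used only to show the
    // contract; it is not part of this change.
    type noopReloader struct{}

    // Reload receives every new policy configuration the coordinator accepts.
    func (n *noopReloader) Reload(cfg *config.Config) error {
        // Inspect cfg and start or stop whatever this reloader controls.
        return nil
    }

    // Registration happens before the coordinator starts processing policies;
    // generateAST then calls Reload on each successful configuration change.
    coord.RegisterMonitoringServer(&noopReloader{})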
@@ -173,6 +177,8 @@ type Coordinator struct {
 	upgradeMgr UpgradeManager
 	monitorMgr MonitorManager
 
+	monitoringServerReloader configReloader
+
 	runtimeMgr RuntimeManager
 	configMgr  ConfigManager
 	varsMgr    VarsManager
@@ -365,6 +371,11 @@ func (c *Coordinator) State() State {
 	return c.stateBroadcaster.Get()
 }
 
+// RegisterMonitoringServer registers a reloader to be notified of configuration changes.
+func (c *Coordinator) RegisterMonitoringServer(s configReloader) {
+	c.monitoringServerReloader = s
+}
+
 // StateSubscribe returns a channel that reports changes in Coordinator state.
 //
 // bufferLen specifies how many state changes should be queued in addition to
@@ -1008,6 +1018,12 @@ func (c *Coordinator) generateAST(cfg *config.Config) (err error) {
 		}
 	}
 
+	if c.monitoringServerReloader != nil {
+		if err := c.monitoringServerReloader.Reload(cfg); err != nil {
+			return fmt.Errorf("failed to reload monitoring server configuration: %w", err)
+		}
+	}
+
 	c.ast = rawAst
 	return nil
 }
diff --git a/internal/pkg/agent/application/coordinator/coordinator_unit_test.go b/internal/pkg/agent/application/coordinator/coordinator_unit_test.go
index 805139f26e8..dbcaa8b616e 100644
--- a/internal/pkg/agent/application/coordinator/coordinator_unit_test.go
+++ b/internal/pkg/agent/application/coordinator/coordinator_unit_test.go
@@ -24,10 +24,12 @@ import (
 	"github.com/elastic/elastic-agent-client/v7/pkg/client"
 	"github.com/elastic/elastic-agent-libs/logp"
 	"github.com/elastic/elastic-agent/internal/pkg/agent/application/info"
+	"github.com/elastic/elastic-agent/internal/pkg/agent/application/monitoring/reload"
 	"github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade"
 	"github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/artifact"
 	"github.com/elastic/elastic-agent/internal/pkg/agent/transpiler"
 	"github.com/elastic/elastic-agent/internal/pkg/config"
+	monitoringCfg "github.com/elastic/elastic-agent/internal/pkg/core/monitoring/config"
 	"github.com/elastic/elastic-agent/pkg/component"
 	"github.com/elastic/elastic-agent/pkg/component/runtime"
 	agentclient "github.com/elastic/elastic-agent/pkg/control/v2/client"
@@ -520,6 +522,137 @@ inputs:
 	}
 }
 
+func TestCoordinatorPolicyChangeUpdatesMonitorReloader(t *testing.T) {
+	// Send a series of policy changes to the Coordinator as Config Manager
+	// updates, toggling the agent monitoring settings, and verify that the
+	// registered monitoring server reloader starts and stops the monitoring
+	// server accordingly.
+
+	// Set a one-second timeout -- nothing here should block, but if it
+	// does let's report a failure instead of timing out the test runner.
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+	defer cancel()
+	logger := logp.NewLogger("testing")
+
+	configChan := make(chan ConfigChange, 1)
+
+	// Create a mocked runtime manager that accepts any update
+	runtimeManager := &fakeRuntimeManager{
+		updateCallback: func(comp []component.Component) error {
+			return nil
+		},
+	}
+
+	monitoringServer := &fakeMonitoringServer{}
+	newServerFn := func() (reload.ServerController, error) {
+		return monitoringServer, nil
+	}
+	monitoringReloader := reload.NewServerReloader(newServerFn, logger, monitoringCfg.DefaultConfig())
+
+	coord := &Coordinator{
+		logger:           logger,
+		agentInfo:        &info.AgentInfo{},
+		stateBroadcaster: broadcaster.New(State{}, 0, 0),
+		managerChans: managerChans{
+			configManagerUpdate: configChan,
+		},
+		runtimeMgr: runtimeManager,
+		vars:       emptyVars(t),
+	}
+	coord.RegisterMonitoringServer(monitoringReloader)
+
+	// Create a policy with one input and one output
+	cfg := config.MustNewConfigFrom(`
+outputs:
+  default:
+    type: elasticsearch
+inputs:
+  - id: test-input
+    type: filestream
+    use_output: default
+`)
+
+	// Send the policy change and make sure it was acknowledged.
+	cfgChange := &configChange{cfg: cfg}
+	configChan <- cfgChange
+	coord.runLoopIteration(ctx)
+	assert.True(t, cfgChange.acked, "Coordinator should ACK a successful policy change")
+
+	// server is started by default
+	assert.True(t, monitoringServer.startTriggered)
+	assert.True(t, monitoringServer.isRunning)
+
+	// disable monitoring
+	cfgDisableMonitoring := config.MustNewConfigFrom(`
+agent.monitoring.enabled: false
+outputs:
+  default:
+    type: elasticsearch
+inputs:
+  - id: test-input
+    type: filestream
+    use_output: default
+`)
+
+	// Send the policy change and make sure it was acknowledged.
+	monitoringServer.Reset()
+	cfgChange = &configChange{cfg: cfgDisableMonitoring}
+	configChan <- cfgChange
+	coord.runLoopIteration(ctx)
+	assert.True(t, cfgChange.acked, "Coordinator should ACK a successful policy change")
+
+	// server is stopped: monitoring is disabled
+	assert.True(t, monitoringServer.stopTriggered)
+	assert.False(t, monitoringServer.isRunning)
+
+	// enable monitoring
+	cfgEnabledMonitoring := config.MustNewConfigFrom(`
+agent.monitoring.enabled: true
+outputs:
+  default:
+    type: elasticsearch
+inputs:
+  - id: test-input
+    type: filestream
+    use_output: default
+`)
+
+	// Send the policy change and make sure it was acknowledged.
+	monitoringServer.Reset()
+	cfgChange = &configChange{cfg: cfgEnabledMonitoring}
+	configChan <- cfgChange
+	coord.runLoopIteration(ctx)
+	assert.True(t, cfgChange.acked, "Coordinator should ACK a successful policy change")
+
+	// server is started again
+	assert.True(t, monitoringServer.startTriggered)
+	assert.True(t, monitoringServer.isRunning)
+
+	// enable monitoring and disable metrics
+	cfgEnabledMonitoringNoMetrics := config.MustNewConfigFrom(`
+agent.monitoring.enabled: true
+agent.monitoring.metrics: false
+outputs:
+  default:
+    type: elasticsearch
+inputs:
+  - id: test-input
+    type: filestream
+    use_output: default
+`)
+
+	// Send the policy change and make sure it was acknowledged.
+	monitoringServer.Reset()
+	cfgChange = &configChange{cfg: cfgEnabledMonitoringNoMetrics}
+	configChan <- cfgChange
+	coord.runLoopIteration(ctx)
+	assert.True(t, cfgChange.acked, "Coordinator should ACK a successful policy change")
+
+	// server is stopped: monitoring.metrics is disabled
+	assert.True(t, monitoringServer.stopTriggered)
+	assert.False(t, monitoringServer.isRunning)
+}
+
 func TestCoordinatorPolicyChangeUpdatesRuntimeManager(t *testing.T) {
 	// Send a test policy to the Coordinator as a Config Manager update,
 	// verify it generates the right component model and sends it to the
@@ -861,3 +994,25 @@ func emptyAST(t *testing.T) *transpiler.AST {
 	require.NoError(t, err, "AST creation must succeed")
 	return ast
 }
+
+type fakeMonitoringServer struct {
+	startTriggered bool
+	stopTriggered  bool
+	isRunning      bool
+}
+
+func (fs *fakeMonitoringServer) Start() {
+	fs.startTriggered = true
+	fs.isRunning = true
+}
+
+func (fs *fakeMonitoringServer) Stop() error {
+	fs.stopTriggered = true
+	fs.isRunning = false
+	return nil
+}
+
+func (fs *fakeMonitoringServer) Reset() {
+	fs.stopTriggered = false
+	fs.startTriggered = false
+}
diff --git a/internal/pkg/agent/application/monitoring/reload/reload.go b/internal/pkg/agent/application/monitoring/reload/reload.go
new file mode 100644
index 00000000000..b39d46474f9
--- /dev/null
+++ b/internal/pkg/agent/application/monitoring/reload/reload.go
@@ -0,0 +1,120 @@
+// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+// or more contributor license agreements. Licensed under the Elastic License;
+// you may not use this file except in compliance with the Elastic License.
+
+package reload
+
+import (
+	"sync"
+
+	"github.com/elastic/elastic-agent/internal/pkg/agent/configuration"
+	"github.com/elastic/elastic-agent/internal/pkg/agent/errors"
+	aConfig "github.com/elastic/elastic-agent/internal/pkg/config"
+	monitoringCfg "github.com/elastic/elastic-agent/internal/pkg/core/monitoring/config"
+	"github.com/elastic/elastic-agent/pkg/core/logger"
+)
+
+// ServerController controls the server runtime
+type ServerController interface {
+	Start()
+	Stop() error
+}
+type serverConstructor func() (ServerController, error)
+
+// ServerReloader starts and stops a monitoring ServerController based on
+// the agent's monitoring configuration.
+type ServerReloader struct {
+	s           ServerController
+	log         *logger.Logger
+	newServerFn serverConstructor
+
+	config              *monitoringCfg.MonitoringConfig
+	isServerRunning     bool
+	isServerRunningLock sync.Mutex
+}
+
+// NewServerReloader returns a ServerReloader that builds servers with newServerFn.
+func NewServerReloader(newServerFn serverConstructor, log *logger.Logger, mcfg *monitoringCfg.MonitoringConfig) *ServerReloader {
+	sr := &ServerReloader{
+		log:         log,
+		config:      mcfg,
+		newServerFn: newServerFn,
+	}
+
+	return sr
+}
+
+func (sr *ServerReloader) Start() {
+	sr.isServerRunningLock.Lock()
+	defer sr.isServerRunningLock.Unlock()
+
+	sr.start()
+}
+
+func (sr *ServerReloader) start() {
+	if sr.s != nil && sr.isServerRunning {
+		// server is already running
+		return
+	}
+
+	sr.log.Info("Starting server")
+	var err error
+	sr.s, err = sr.newServerFn()
+	if err != nil {
+		sr.log.Errorf("Failed creating a server: %v", err)
+		return
+	}
+
+	sr.s.Start()
+	sr.log.Debugf("Server started")
+	sr.isServerRunning = true
+}
+
+func (sr *ServerReloader) Stop() error {
+	sr.isServerRunningLock.Lock()
+	defer sr.isServerRunningLock.Unlock()
+
+	return sr.stop()
+}
+
+func (sr *ServerReloader) stop() error {
+	if sr.s == nil {
+		// stopping a server that was never started
+		sr.isServerRunning = false
+		return nil
+	}
+	sr.log.Info("Stopping server")
+
+	sr.isServerRunning = false
+	if err := sr.s.Stop(); err != nil {
+		return err
+	}
+
+	sr.log.Debugf("Server stopped")
+	sr.s = nil
+	return nil
+}
+
+func (sr *ServerReloader) Reload(rawConfig *aConfig.Config) error {
+	sr.isServerRunningLock.Lock()
+	defer sr.isServerRunningLock.Unlock()
+
+	newConfig := configuration.DefaultConfiguration()
+	if err := rawConfig.Unpack(&newConfig); err != nil {
+		return errors.New(err, "failed to unpack monitoring config during reload")
+	}
+
+	sr.config = newConfig.Settings.MonitoringConfig
+
+	shouldRunMetrics := sr.config.Enabled && sr.config.MonitorMetrics
+	if shouldRunMetrics && !sr.isServerRunning {
+		sr.start()
+		return nil
+	}
+
+	if !shouldRunMetrics && sr.isServerRunning {
+		return sr.stop()
+	}
+
+	return nil
+}
diff --git a/internal/pkg/agent/application/monitoring/reload/reload_test.go b/internal/pkg/agent/application/monitoring/reload/reload_test.go
new file mode 100644
index 00000000000..b686c5346a8
--- /dev/null
+++ b/internal/pkg/agent/application/monitoring/reload/reload_test.go
@@ -0,0 +1,130 @@
+// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+// or more contributor license agreements. Licensed under the Elastic License;
+// you may not use this file except in compliance with the Elastic License.
+
+package reload
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	aConfig "github.com/elastic/elastic-agent/internal/pkg/config"
+	monitoringCfg "github.com/elastic/elastic-agent/internal/pkg/core/monitoring/config"
+	"github.com/elastic/elastic-agent/pkg/core/logger"
+)
+
+func TestReload(t *testing.T) {
+	tcs := []struct {
+		name        string
+		currEnabled bool
+		currMetrics bool
+		currRunning bool
+
+		newConfig       string
+		expectedRunning bool
+		expectedStart   bool
+		expectedStop    bool
+	}{
+		{
+			"start with default config",
+			false, false, false,
+			``,
+			true, true, false,
+		},
+		{
+			"start when not running, monitoring enabled",
+			false, false, false,
+			`
+agent.monitoring.enabled: true
+`,
+			true, true, false,
+		},
+		{
+			"do not start when not running, only metrics enabled",
+			false, false, false,
+			`
+agent.monitoring.enabled: false
+agent.monitoring.metrics: true
+`,
+			false, false, false,
+		},
+
+		{
+			"stop when running, monitoring disabled",
+			true, true, true,
+			`
+agent.monitoring.enabled: false
+`,
+			false, false, true,
+		},
+		{
+			"stop when running, monitoring.metrics disabled",
+			true, true, true,
+			`
+agent.monitoring.metrics: false
+`,
+			false, false, true,
+		},
+		{
+			"stop stopped server",
+			false, false, false,
+			`
+agent.monitoring.metrics: false
+`,
+			false, false, false,
+		},
+		{
+			"start started server",
+			true, true, true,
+			`
+agent.monitoring.enabled: true
+`,
+			true, false, false,
+		},
+	}
+
+	for _, tc := range tcs {
+		t.Run(tc.name, func(t *testing.T) {
+			fsc := &fakeServerController{}
+			log, _ := logger.NewTesting(tc.name)
+			cfg := &monitoringCfg.MonitoringConfig{
+				Enabled:        tc.currEnabled,
+				MonitorMetrics: tc.currMetrics,
+			}
+			r := NewServerReloader(
+				func() (ServerController, error) {
+					return fsc, nil
+				},
+				log,
+				cfg,
+			)
+			r.isServerRunning = tc.currRunning
+			if tc.currRunning {
+				r.s = fsc
+			}
+
+			newCfg := aConfig.MustNewConfigFrom(tc.newConfig)
+			require.NoError(t, r.Reload(newCfg))
+
+			require.Equal(t, tc.expectedRunning, r.isServerRunning)
+			require.Equal(t, tc.expectedStart, fsc.startTriggered)
+			require.Equal(t, tc.expectedStop, fsc.stopTriggered)
+		})
+	}
+}
+
+type fakeServerController struct {
+	startTriggered bool
+	stopTriggered  bool
+}
+
+func (fsc *fakeServerController) Start() { fsc.startTriggered = true }
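The table-driven test above exercises `Reload` directly. Outside of tests, the reloader is built with a deferred constructor, as `server.go` does below. A minimal usage sketch, assuming a `*logger.Logger` named `log` is in scope and using a hypothetical `myServer` type that is not part of this change:

    // myServer is a hypothetical ServerController standing in for the real
    // api.Server; only Start and Stop are required.
    type myServer struct{}

    func (m *myServer) Start()      {}
    func (m *myServer) Stop() error { return nil }

    // The constructor is deferred: the reloader only builds the server when
    // the configuration says it should be running.
    newServerFn := func() (reload.ServerController, error) {
        return &myServer{}, nil
    }
    sr := reload.NewServerReloader(newServerFn, log, monitoringCfg.DefaultConfig())
    sr.Start()                       // no-op if the server is already running
    defer func() { _ = sr.Stop() }() // mirrors the cleanup in cmd/run.go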
+func (fsc *fakeServerController) Stop() error {
+	fsc.stopTriggered = true
+	return nil
+}
+func (fsc *fakeServerController) Reset() {
+	fsc.startTriggered = false
+	fsc.stopTriggered = false
+}
diff --git a/internal/pkg/agent/application/monitoring/server.go b/internal/pkg/agent/application/monitoring/server.go
index 390a472d5ed..7f43bf1866b 100644
--- a/internal/pkg/agent/application/monitoring/server.go
+++ b/internal/pkg/agent/application/monitoring/server.go
@@ -20,6 +20,9 @@ import (
 	"github.com/elastic/elastic-agent-libs/config"
 	"github.com/elastic/elastic-agent-libs/monitoring"
 	"github.com/elastic/elastic-agent/internal/pkg/agent/application/coordinator"
+	"github.com/elastic/elastic-agent/internal/pkg/agent/application/monitoring/reload"
+	"github.com/elastic/elastic-agent/internal/pkg/agent/errors"
+	monitoringCfg "github.com/elastic/elastic-agent/internal/pkg/core/monitoring/config"
 	"github.com/elastic/elastic-agent/pkg/core/logger"
 )
 
@@ -32,7 +35,8 @@ func NewServer(
 	coord *coordinator.Coordinator,
 	enableProcessStats bool,
 	operatingSystem string,
-) (*api.Server, error) {
+	mcfg *monitoringCfg.MonitoringConfig,
+) (*reload.ServerReloader, error) {
 	if err := createAgentMonitoringDrop(endpointConfig.Host); err != nil {
 		// log but ignore
 		log.Errorf("failed to create monitoring drop: %v", err)
@@ -43,7 +47,7 @@ func NewServer(
 		return nil, err
 	}
 
-	return exposeMetricsEndpoint(log, cfg, ns, tracer, coord, enableProcessStats, operatingSystem)
+	return exposeMetricsEndpoint(log, cfg, ns, tracer, coord, enableProcessStats, operatingSystem, mcfg)
 }
 
 func exposeMetricsEndpoint(
@@ -54,7 +58,8 @@ func exposeMetricsEndpoint(
 	coord *coordinator.Coordinator,
 	enableProcessStats bool,
 	operatingSystem string,
-) (*api.Server, error) {
+	mcfg *monitoringCfg.MonitoringConfig,
+) (*reload.ServerReloader, error) {
 	r := mux.NewRouter()
 	if tracer != nil {
 		r.Use(apmgorilla.Middleware(apmgorilla.WithTracer(tracer)))
@@ -72,7 +77,15 @@ func exposeMetricsEndpoint(
 	mux := http.NewServeMux()
 	mux.Handle("/", r)
 
-	return api.New(log, mux, config)
+	newServerFn := func() (reload.ServerController, error) {
+		apiServer, err := api.New(log, mux, config)
+		if err != nil {
+			return nil, errors.New(err, "failed to create api server")
+		}
+		return apiServer, nil
+	}
+
+	return reload.NewServerReloader(newServerFn, log, mcfg), nil
 }
 
 func createAgentMonitoringDrop(drop string) error {
diff --git a/internal/pkg/agent/cmd/run.go b/internal/pkg/agent/cmd/run.go
index 91d470ac00b..353c9d1e7a7 100644
--- a/internal/pkg/agent/cmd/run.go
+++ b/internal/pkg/agent/cmd/run.go
@@ -32,6 +32,7 @@ import (
 	"github.com/elastic/elastic-agent/internal/pkg/agent/application/filelock"
 	"github.com/elastic/elastic-agent/internal/pkg/agent/application/info"
 	"github.com/elastic/elastic-agent/internal/pkg/agent/application/monitoring"
+	"github.com/elastic/elastic-agent/internal/pkg/agent/application/monitoring/reload"
 	"github.com/elastic/elastic-agent/internal/pkg/agent/application/paths"
 	"github.com/elastic/elastic-agent/internal/pkg/agent/application/reexec"
 	"github.com/elastic/elastic-agent/internal/pkg/agent/application/secret"
@@ -248,12 +249,15 @@ func run(override cfgOverrider, testingMode bool, fleetInitTimeout time.Duration
 	}
 	defer composable.Close()
 
-	serverStopFn, err := setupMetrics(l, cfg.Settings.DownloadConfig.OS(), cfg.Settings.MonitoringConfig, tracer, coord)
+	monitoringServer, err := setupMetrics(l, cfg.Settings.DownloadConfig.OS(), cfg.Settings.MonitoringConfig, tracer, coord)
 	if err != nil {
 		return err
 	}
+	coord.RegisterMonitoringServer(monitoringServer)
 	defer func() {
-		_ = serverStopFn()
+		if monitoringServer != nil {
+			_ = monitoringServer.Stop()
+		}
 	}()
 
 	diagHooks := diagnostics.GlobalHooks()
@@ -547,7 +551,7 @@ func setupMetrics(
 	cfg *monitoringCfg.MonitoringConfig,
 	tracer *apm.Tracer,
 	coord *coordinator.Coordinator,
-) (func() error, error) {
+) (*reload.ServerReloader, error) {
 	if err := report.SetupMetrics(logger, agentName, version.GetDefaultVersion()); err != nil {
 		return nil, err
 	}
@@ -558,14 +562,12 @@ func setupMetrics(
 		Host: monitoring.AgentMonitoringEndpoint(operatingSystem, cfg),
 	}
 
-	s, err := monitoring.NewServer(logger, endpointConfig, monitoringLib.GetNamespace, tracer, coord, isProcessStatsEnabled(cfg), operatingSystem)
+	s, err := monitoring.NewServer(logger, endpointConfig, monitoringLib.GetNamespace, tracer, coord, isProcessStatsEnabled(cfg), operatingSystem, cfg)
 	if err != nil {
 		return nil, errors.New(err, "could not start the HTTP server for the API")
 	}
 
-	s.Start()
-	// return server stopper
-	return s.Stop, nil
+	return s, nil
 }
 
 func isProcessStatsEnabled(cfg *monitoringCfg.MonitoringConfig) bool {
diff --git a/pkg/component/runtime/command.go b/pkg/component/runtime/command.go
index df02656b3f2..140e9d22557 100644
--- a/pkg/component/runtime/command.go
+++ b/pkg/component/runtime/command.go
@@ -422,7 +422,7 @@ func (c *commandRuntime) startWatcher(info *process.Info, comm Communicator) {
 	go func() {
 		err := comm.WriteConnInfo(info.Stdin)
 		if err != nil {
-			c.forceCompState(client.UnitStateFailed, fmt.Sprintf("Failed: failed to provide connection information to spawned pid '%d': %s", info.PID, err))
+			_, _ = c.logErr.Write([]byte(fmt.Sprintf("Failed: failed to provide connection information to spawned pid '%d': %s", info.PID, err)))
 			// kill instantly
 			_ = info.Kill()
 		} else {
diff --git a/pkg/component/runtime/manager_test.go b/pkg/component/runtime/manager_test.go
index cbce15b13c5..010c9f0478f 100644
--- a/pkg/component/runtime/manager_test.go
+++ b/pkg/component/runtime/manager_test.go
@@ -2481,18 +2481,20 @@ func TestManager_FakeInput_RestartsOnMissedCheckins(t *testing.T) {
 			return
 		case state := <-sub.Ch():
 			t.Logf("component state changed: %+v", state)
-			if state.State == client.UnitStateStarting || state.State == client.UnitStateHealthy {
+
+			switch state.State {
+			case client.UnitStateStarting, client.UnitStateHealthy:
 				// starting and healthy are allowed
-			} else if state.State == client.UnitStateDegraded {
+			case client.UnitStateDegraded:
 				// should go to degraded first
 				wasDegraded = true
-			} else if state.State == client.UnitStateFailed {
+			case client.UnitStateFailed:
 				if wasDegraded {
 					subErrCh <- nil
 				} else {
 					subErrCh <- errors.New("should have been degraded before failed")
 				}
-			} else {
+			default:
 				subErrCh <- fmt.Errorf("unknown component state: %v", state.State)
 			}
 		}
diff --git a/pkg/component/runtime/state.go b/pkg/component/runtime/state.go
index 37392b6d4bc..203f453fa76 100644
--- a/pkg/component/runtime/state.go
+++ b/pkg/component/runtime/state.go
@@ -462,7 +462,9 @@ func (s *ComponentState) cleanupStopped() bool {
 	return cleaned
 }
 
-// forceState force updates the state for the entire component, forcing that state on all units.
+// forceState force updates the state for the entire component, forcing that
+// state on all units. It returns true if either the component state or any of
+// the units' states changed, false otherwise.
 func (s *ComponentState) forceState(state client.UnitState, msg string) bool {
 	changed := false
 	if s.State != state || s.Message != msg {
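For reference, the start/stop decision in `ServerReloader.Reload` reduces to two booleans from the unpacked monitoring configuration. A condensed sketch of that predicate, written as a standalone helper for illustration only (the PR keeps this logic inline in `Reload`):

    // shouldServe mirrors the decision inside ServerReloader.Reload: the
    // metrics endpoint runs only when monitoring as a whole and the metrics
    // sub-feature are both enabled.
    func shouldServe(cfg *monitoringCfg.MonitoringConfig) bool {
        return cfg.Enabled && cfg.MonitorMetrics
    }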