elastic · fearful-symmetry · Jun 16, 2024 · May 7, 2024 · May 7, 2024 · May 7, 2024
@@ -0,0 +1,32 @@
+# Kind can be one of:
+# - breaking-change: a change to previously-documented behavior
+# - deprecation: functionality that is being removed in a later release
+# - bug-fix: fixes a problem in a previous version
+# - enhancement: extends functionality but does not break or fix existing behavior
+# - feature: new functionality
+# - known-issue: problems that we are aware of in a given version
+# - security: impacts on the security of a product or a user’s deployment.
+# - upgrade: important information for someone upgrading from a prior version
+# - other: does not fit into any of the other categories
+kind: feature
+
+# Change summary; a 80ish characters long description of the change.
+summary: Add monitoring of endpoint usage metrics
+
+# Long description; in case the summary is not enough to describe the change
+# this field accommodate a description without length limits.
+# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
+#description:
+
+# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
+component:
+
+# PR URL; optional; the PR number that added the changeset.
+# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
+# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
+# Please provide it if you are adding a fragment for a different PR.
+pr: https://github.com/elastic/elastic-agent/pull/4789
+
+# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
+# If not present is automatically filled by the tooling with the issue linked to the PR number.
+issue: https://github.com/elastic/elastic-agent/issues/4083
@@ -10,6 +10,7 @@ import (
 	"fmt"
 	"reflect"
 	"strings"
+	"sync/atomic"
 	"time"
 
 	"github.com/hashicorp/go-multierror"
@@ -82,7 +83,7 @@ type MonitorManager interface {
 	Reload(rawConfig *config.Config) error
 
 	// MonitoringConfig injects monitoring configuration into resolved ast tree.
-	MonitoringConfig(map[string]interface{}, []component.Component, map[string]string) (map[string]interface{}, error)
+	MonitoringConfig(map[string]interface{}, []component.Component, map[string]string, map[string]uint64) (map[string]interface{}, error)
 }
 
 // Runner provides interface to run a manager and receive running errors.
@@ -284,6 +285,14 @@ type Coordinator struct {
 	// loop in runLoopIteration() is active and listening.
 	// Should only be interacted with via CoordinatorActive() or runLoopIteration()
 	heartbeatChan chan struct{}
+
+	// if a component (mostly endpoint) has a new PID, we need to update
+	// the monitoring components so they have a PID to monitor
+	// however, if endpoint is in some kind of restart loop,
+	// we could DOS the config system. Instead,
+	// run a ticker that checks to see if we have a new PID.
+	componentPIDTicker         *time.Ticker
+	componentPidRequiresUpdate *atomic.Bool
 }
 
 // The channels Coordinator reads to receive updates from the various managers.
@@ -374,10 +383,12 @@ func New(logger *logger.Logger, cfg *configuration.Configuration, logLevel logp.
 		// synchronization in the subscriber API, just set the input buffer to 0.
 		stateBroadcaster: broadcaster.New(state, 64, 32),
 
-		logLevelCh:         make(chan logp.Level),
-		overrideStateChan:  make(chan *coordinatorOverrideState),
-		upgradeDetailsChan: make(chan *details.Details),
-		heartbeatChan:      make(chan struct{}),
+		logLevelCh:                 make(chan logp.Level),
+		overrideStateChan:          make(chan *coordinatorOverrideState),
+		upgradeDetailsChan:         make(chan *details.Details),
+		heartbeatChan:              make(chan struct{}),
+		componentPIDTicker:         time.NewTicker(time.Second * 30),
+		componentPidRequiresUpdate: &atomic.Bool{},
 	}
 	// Setup communication channels for any non-nil components. This pattern
 	// lets us transparently accept nil managers / simulated events during
@@ -926,6 +937,8 @@ func (c *Coordinator) runner(ctx context.Context) error {
 	ctx, cancel := context.WithCancel(ctx)
 	defer cancel()
 
+	defer c.componentPIDTicker.Stop()
+
 	// We run nil checks before starting the various managers so that unit tests
 	// only have to initialize / mock the specific components they're testing.
 	// If a manager is nil, we prebuffer its return channel with nil also so
@@ -1038,6 +1051,19 @@ func (c *Coordinator) runLoopIteration(ctx context.Context) {
 
 	case c.heartbeatChan <- struct{}{}:
 
+	case <-c.componentPIDTicker.C:
+		// if we hit the ticker and we've got a new PID,
+		// reload the component model
+		if c.componentPidRequiresUpdate.Load() {
+			c.componentPidRequiresUpdate.Store(false)
+			err := c.refreshComponentModel(ctx)
+			if err != nil {
+				err = fmt.Errorf("error refreshing component model for PID update: %w", err)
+				c.setConfigManagerError(err)
+				c.logger.Errorf("%s", err)
+			}
+		}
+
 	case componentState := <-c.managerChans.runtimeManagerUpdate:
 		// New component change reported by the runtime manager via
 		// Coordinator.watchRuntimeComponents(), merge it with the
@@ -1277,11 +1303,17 @@ func (c *Coordinator) generateComponentModel() (err error) {
 		configInjector = c.monitorMgr.MonitoringConfig
 	}
 
+	var existingCompState = make(map[string]uint64, len(c.state.Components))
+	for _, comp := range c.state.Components {
+		existingCompState[comp.Component.ID] = comp.State.Pid
+	}
+
 	comps, err := c.specs.ToComponents(
 		cfg,
 		configInjector,
 		c.state.LogLevel,
 		c.agentInfo,
+		existingCompState,
 	)
 	if err != nil {
 		return fmt.Errorf("failed to render components: %w", err)

@@ -135,16 +135,25 @@ func (c *Coordinator) refreshState() {
 // Coordinator state and sets stateNeedsRefresh.
 // Must be called on the main Coordinator goroutine.
 func (c *Coordinator) applyComponentState(state runtime.ComponentComponentState) {
+
+	// check for any component updates to the known PID, so we can update the component monitoring
 	found := false
+	pidRequiresUpdate := false
 	for i, other := range c.state.Components {
 		if other.Component.ID == state.Component.ID {
+			if other.State.Pid != state.State.Pid {
+				pidRequiresUpdate = true
+			}
 			c.state.Components[i] = state
 			found = true
 			break
 		}
 	}
 	if !found {
 		c.state.Components = append(c.state.Components, state)
+		if state.State.Pid != 0 {
+			pidRequiresUpdate = true
+		}
 	}
 
 	// In the case that the component has stopped, it is now removed.
@@ -160,6 +169,10 @@ func (c *Coordinator) applyComponentState(state runtime.ComponentComponentState)
 	}
 
 	c.stateNeedsRefresh = true
+
+	if pidRequiresUpdate {
+		c.componentPidRequiresUpdate.Store(true)
+	}
 }
 
 // generateReportableState aggregates the internal state of the Coordinator

@@ -587,7 +587,6 @@ func TestCoordinator_StateSubscribeIsolatedUnits(t *testing.T) {
 				resultChan <- ctx.Err()
 				return
 			case state := <-subChan:
-				t.Logf("%+v", state)
 				if len(state.Components) == 3 {
 					compState0 := getComponentState(state.Components, "fake-isolated-units-default-fake-isolated-units-0")
 					compState1 := getComponentState(state.Components, "fake-isolated-units-default-fake-isolated-units-1")
@@ -599,6 +598,11 @@ func TestCoordinator_StateSubscribeIsolatedUnits(t *testing.T) {
 								(unit1.State == client.UnitStateHealthy && unit1.Message == "Healthy From Fake Isolated Units 1 Config") {
 								resultChan <- nil
 								return
+							} else if unit0.State == client.UnitStateFailed && unit1.State == client.UnitStateFailed {
+								// if you get a really strange failed state, check to make sure the mock binaries in
+								// elastic-agent/pkg/component/fake/ are updated
+								t.Fail()
+								t.Logf("got units with failed state: %#v / %#v", unit1, unit0)
 							}
 						}
 					}
@@ -1004,7 +1008,7 @@ func (*testMonitoringManager) Prepare(_ string) error
 func (*testMonitoringManager) Cleanup(string) error                                  { return nil }
 func (*testMonitoringManager) Enabled() bool                                         { return false }
 func (*testMonitoringManager) Reload(rawConfig *config.Config) error                 { return nil }
-func (*testMonitoringManager) MonitoringConfig(_ map[string]interface{}, _ []component.Component, _ map[string]string) (map[string]interface{}, error) {
+func (*testMonitoringManager) MonitoringConfig(_ map[string]interface{}, _ []component.Component, _ map[string]string, _ map[string]uint64) (map[string]interface{}, error) {
 	return nil, nil
 }
 

@@ -59,6 +59,7 @@ func TestVarsManagerError(t *testing.T) {
 		managerChans: managerChans{
 			varsManagerError: varsErrorChan,
 		},
+		componentPIDTicker: time.NewTicker(time.Second * 30),
 	}
 	// Send an error via the vars manager channel, and let Coordinator update
 	const errorStr = "force error"
@@ -110,6 +111,7 @@ func TestCoordinatorReportsUnhealthyComponents(t *testing.T) {
 		managerChans: managerChans{
 			runtimeManagerUpdate: runtimeChan,
 		},
+		componentPIDTicker: time.NewTicker(time.Second * 30),
 	}
 
 	unhealthyComponent := runtime.ComponentComponentState{
@@ -186,6 +188,7 @@ func TestCoordinatorComponentStatesAreSeparate(t *testing.T) {
 		managerChans: managerChans{
 			runtimeManagerUpdate: runtimeChan,
 		},
+		componentPIDTicker: time.NewTicker(time.Second * 30),
 	}
 
 	comp1 := runtime.ComponentComponentState{
@@ -256,6 +259,7 @@ func TestCoordinatorReportsUnhealthyUnits(t *testing.T) {
 		managerChans: managerChans{
 			runtimeManagerUpdate: runtimeChan,
 		},
+		componentPIDTicker: time.NewTicker(time.Second * 30),
 	}
 
 	// Create a healthy component with healthy input and output units
@@ -375,8 +379,9 @@ func TestCoordinatorReportsInvalidPolicy(t *testing.T) {
 		runtimeMgr: &fakeRuntimeManager{},
 
 		// Set valid but empty initial values for ast and vars
-		vars: emptyVars(t),
-		ast:  emptyAST(t),
+		vars:               emptyVars(t),
+		ast:                emptyAST(t),
+		componentPIDTicker: time.NewTicker(time.Second * 30),
 	}
 
 	// Send an invalid config update and confirm that Coordinator reports
@@ -389,7 +394,6 @@ agent.download.sourceURI:
 	cfgChange := &configChange{cfg: cfg}
 	configChan <- cfgChange
 	coord.runLoopIteration(ctx)
-
 	assert.True(t, cfgChange.failed, "Policy with invalid field should have reported failed config change")
 	require.ErrorContainsf(t,
 		cfgChange.err,
@@ -420,6 +424,7 @@ agent.download.sourceURI:
 	// (This check is based on a previous bug in which a vars update could
 	// discard active policy errors.)
 	varsChan <- emptyVars(t)
+	t.Logf("after emptyVars statement")
 	coord.runLoopIteration(ctx)
 
 	assert.Error(t, coord.configErr, "Vars update shouldn't affect configErr")
@@ -489,8 +494,9 @@ func TestCoordinatorReportsComponentModelError(t *testing.T) {
 		runtimeMgr: &fakeRuntimeManager{},
 
 		// Set valid but empty initial values for ast and vars
-		vars: emptyVars(t),
-		ast:  emptyAST(t),
+		vars:               emptyVars(t),
+		ast:                emptyAST(t),
+		componentPIDTicker: time.NewTicker(time.Second * 30),
 	}
 
 	// This configuration produces a valid AST but its EQL condition is
@@ -583,8 +589,9 @@ func TestCoordinatorPolicyChangeUpdatesMonitorReloader(t *testing.T) {
 		managerChans: managerChans{
 			configManagerUpdate: configChan,
 		},
-		runtimeMgr: runtimeManager,
-		vars:       emptyVars(t),
+		runtimeMgr:         runtimeManager,
+		vars:               emptyVars(t),
+		componentPIDTicker: time.NewTicker(time.Second * 30),
 	}
 	coord.RegisterMonitoringServer(monitoringReloader)
 
@@ -711,8 +718,9 @@ func TestCoordinatorPolicyChangeUpdatesRuntimeManager(t *testing.T) {
 		managerChans: managerChans{
 			configManagerUpdate: configChan,
 		},
-		runtimeMgr: runtimeManager,
-		vars:       emptyVars(t),
+		runtimeMgr:         runtimeManager,
+		vars:               emptyVars(t),
+		componentPIDTicker: time.NewTicker(time.Second * 30),
 	}
 
 	// Create a policy with one input and one output
@@ -798,8 +806,9 @@ func TestCoordinatorReportsRuntimeManagerUpdateFailure(t *testing.T) {
 			// manager, so it receives the update result.
 			runtimeManagerError: updateErrChan,
 		},
-		runtimeMgr: runtimeManager,
-		vars:       emptyVars(t),
+		runtimeMgr:         runtimeManager,
+		vars:               emptyVars(t),
+		componentPIDTicker: time.NewTicker(time.Second * 30),
 	}
 
 	// Send an empty policy which should forward an empty component model to
@@ -860,8 +869,9 @@ func TestCoordinatorAppliesVarsToPolicy(t *testing.T) {
 			configManagerUpdate: configChan,
 			varsManagerUpdate:   varsChan,
 		},
-		runtimeMgr: runtimeManager,
-		vars:       emptyVars(t),
+		runtimeMgr:         runtimeManager,
+		vars:               emptyVars(t),
+		componentPIDTicker: time.NewTicker(time.Second * 30),
 	}
 
 	// Create a policy with one input and one output
@@ -936,7 +946,8 @@ func TestCoordinatorReportsOverrideState(t *testing.T) {
 		stateBroadcaster: &broadcaster.Broadcaster[State]{
 			InputChan: stateChan,
 		},
-		overrideStateChan: overrideStateChan,
+		overrideStateChan:  overrideStateChan,
+		componentPIDTicker: time.NewTicker(time.Second * 30),
 	}
 	// Send an error via the vars manager channel, and let Coordinator update
 	overrideStateChan <- &coordinatorOverrideState{

@@ -488,6 +488,7 @@ log_level: "warning"
 components:
   - id: "comp-1"
     state:
+      pid: 0
       state: 3
       message: "degraded message"
       features_idx: 0
@@ -570,6 +571,7 @@ log_level: "warning"
 components:
   - id: "comp-1"
     state:
+      pid: 0
       state: 3
       message: "degraded message"
       features_idx: 0