diff --git a/.chloggen/fix_supervisor-reports-error-nop-config.yaml b/.chloggen/fix_supervisor-reports-error-nop-config.yaml new file mode 100644 index 000000000000..0c3891fbf273 --- /dev/null +++ b/.chloggen/fix_supervisor-reports-error-nop-config.yaml @@ -0,0 +1,27 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: bug_fix + +# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) +component: opampsupervisor + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Report an 'Applied' remote config status when an empty config is received. + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. +issues: [36682] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: + +# If your change doesn't affect end users or the exported elements of any package, +# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. +# Optional: The change log or logs in which this entry should be included. +# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. +# Default: '[user]' +change_logs: [] diff --git a/cmd/opampsupervisor/e2e_test.go b/cmd/opampsupervisor/e2e_test.go index 0ce4f8880991..96935b4bd0d0 100644 --- a/cmd/opampsupervisor/e2e_test.go +++ b/cmd/opampsupervisor/e2e_test.go @@ -1562,6 +1562,23 @@ func TestSupervisorRemoteConfigApplyStatus(t *testing.T) { status, ok := remoteConfigStatus.Load().(*protobufs.RemoteConfigStatus) return ok && status.Status == protobufs.RemoteConfigStatuses_RemoteConfigStatuses_FAILED }, 15*time.Second, 100*time.Millisecond, "Remote config status was not set to FAILED for bad config") + + // Test with nop configuration + emptyHash := sha256.Sum256([]byte{}) + server.sendToSupervisor(&protobufs.ServerToAgent{ + RemoteConfig: &protobufs.AgentRemoteConfig{ + Config: &protobufs.AgentConfigMap{ + ConfigMap: map[string]*protobufs.AgentConfigFile{}, + }, + ConfigHash: emptyHash[:], + }, + }) + + // Check that the status is set to APPLIED + require.Eventually(t, func() bool { + status, ok := remoteConfigStatus.Load().(*protobufs.RemoteConfigStatus) + return ok && status.Status == protobufs.RemoteConfigStatuses_RemoteConfigStatuses_APPLIED + }, 5*time.Second, 10*time.Millisecond, "Remote config status was not set to APPLIED for empty config") } func TestSupervisorOpAmpServerPort(t *testing.T) { diff --git a/cmd/opampsupervisor/supervisor/supervisor.go b/cmd/opampsupervisor/supervisor/supervisor.go index e446805efb45..4607aa6d5958 100644 --- a/cmd/opampsupervisor/supervisor/supervisor.go +++ b/cmd/opampsupervisor/supervisor/supervisor.go @@ -78,6 +78,13 @@ func (c *configState) equal(other *configState) bool { return other.mergedConfig == c.mergedConfig && other.configMapIsEmpty == c.configMapIsEmpty } +type agentStartStatus string + +var ( + agentStarting agentStartStatus = "starting" + agentNotStarting agentStartStatus = "notStarting" +) + // Supervisor implements supervising of OpenTelemetry Collector and uses OpAMPClient // to work with an OpAMP Server. type Supervisor struct { @@ -1000,22 +1007,27 @@ func (s *Supervisor) handleRestartCommand() error { return err } -func (s *Supervisor) startAgent() { +func (s *Supervisor) startAgent() (agentStartStatus, error) { if s.cfgState.Load().(*configState).configMapIsEmpty { // Don't start the agent if there is no config to run s.logger.Info("No config present, not starting agent.") - return + // need to manually trigger updating effective config + err := s.opampClient.UpdateEffectiveConfig(context.Background()) + if err != nil { + s.logger.Error("The OpAMP client failed to update the effective config", zap.Error(err)) + } + return agentNotStarting, nil } err := s.commander.Start(context.Background()) if err != nil { s.logger.Error("Cannot start the agent", zap.Error(err)) - err = s.opampClient.SetHealth(&protobufs.ComponentHealth{Healthy: false, LastError: fmt.Sprintf("Cannot start the agent: %v", err)}) + startErr := fmt.Errorf("Cannot start the agent: %w", err) + err = s.opampClient.SetHealth(&protobufs.ComponentHealth{Healthy: false, LastError: startErr.Error()}) if err != nil { s.logger.Error("Failed to report OpAMP client health", zap.Error(err)) } - - return + return "", startErr } s.agentHasStarted = false @@ -1024,6 +1036,7 @@ func (s *Supervisor) startAgent() { s.startHealthCheckTicker() s.healthChecker = healthchecker.NewHTTPHealthChecker(fmt.Sprintf("http://%s", s.agentHealthCheckEndpoint)) + return agentStarting, nil } func (s *Supervisor) startHealthCheckTicker() { @@ -1090,7 +1103,11 @@ func (s *Supervisor) runAgentProcess() { if _, err := os.Stat(s.agentConfigFilePath()); err == nil { // We have an effective config file saved previously. Use it to start the agent. s.logger.Debug("Effective config found, starting agent initial time") - s.startAgent() + _, err := s.startAgent() + if err != nil { + s.logger.Error("starting agent failed", zap.Error(err)) + s.reportConfigStatus(protobufs.RemoteConfigStatuses_RemoteConfigStatuses_FAILED, err.Error()) + } } restartTimer := time.NewTimer(0) @@ -1114,7 +1131,17 @@ func (s *Supervisor) runAgentProcess() { s.logger.Debug("Restarting agent due to new config") restartTimer.Stop() s.stopAgentApplyConfig() - s.startAgent() + status, err := s.startAgent() + if err != nil { + s.logger.Error("starting agent with new config failed", zap.Error(err)) + s.reportConfigStatus(protobufs.RemoteConfigStatuses_RemoteConfigStatuses_FAILED, err.Error()) + } + + if status == agentNotStarting { + // not starting agent because of nop config, clear timer + configApplyTimeoutTimer.Stop() + s.reportConfigStatus(protobufs.RemoteConfigStatuses_RemoteConfigStatuses_APPLIED, "") + } case <-s.commander.Exited(): // the agent process exit is expected for restart command and will not attempt to restart @@ -1146,7 +1173,11 @@ func (s *Supervisor) runAgentProcess() { case <-restartTimer.C: s.logger.Debug("Agent starting after start backoff") - s.startAgent() + _, err := s.startAgent() + if err != nil { + s.logger.Error("restarting agent failed", zap.Error(err)) + s.reportConfigStatus(protobufs.RemoteConfigStatuses_RemoteConfigStatuses_FAILED, err.Error()) + } case <-configApplyTimeoutTimer.C: if s.lastHealthFromClient == nil || !s.lastHealthFromClient.Healthy { @@ -1242,6 +1273,9 @@ func (s *Supervisor) saveLastReceivedOwnTelemetrySettings(set *protobufs.Telemet } func (s *Supervisor) reportConfigStatus(status protobufs.RemoteConfigStatuses, errorMessage string) { + if !s.config.Capabilities.ReportsRemoteConfig { + s.logger.Debug("supervisor is not configured to report remote config status") + } err := s.opampClient.SetRemoteConfigStatus(&protobufs.RemoteConfigStatus{ LastRemoteConfigHash: s.remoteConfig.GetConfigHash(), Status: status,