diff --git a/_meta/config/common.p2.yml.tmpl b/_meta/config/common.p2.yml.tmpl index 99305a43ce1..784b538ab2d 100644 --- a/_meta/config/common.p2.yml.tmpl +++ b/_meta/config/common.p2.yml.tmpl @@ -60,6 +60,8 @@ inputs: # logs: true # # enables metrics monitoring # metrics: true +# # metrics_period defines how frequent we should sample monitoring metrics. Default is 60 seconds. +# metrics_period: 60s # # exposes /debug/pprof/ endpoints # # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost # pprof.enabled: false @@ -77,7 +79,7 @@ inputs: # # The possible values for `failon` are: # # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive. # # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive. -# # `heartbeat`: return an error only if the agent coordinator is unresponsive. +# # `heartbeat`: return an error only if the agent coordinator is unresponsive. # # If no `failon` parameter is provided, the default behavior is `failon=heartbeat` # http: # # enables http endpoint diff --git a/_meta/config/common.reference.p2.yml.tmpl b/_meta/config/common.reference.p2.yml.tmpl index 317c1126375..9a2e73fe31c 100644 --- a/_meta/config/common.reference.p2.yml.tmpl +++ b/_meta/config/common.reference.p2.yml.tmpl @@ -139,6 +139,8 @@ inputs: # logs: false # # enables metrics monitoring # metrics: false +# # metrics_period defines how frequent we should sample monitoring metrics. Default is 60 seconds. +# metrics_period: 60s # # exposes /debug/pprof/ endpoints # # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost # pprof.enabled: false @@ -156,7 +158,7 @@ inputs: # # The possible values for `failon` are: # # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive. # # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive. -# # `heartbeat`: return an error only if the agent coordinator is unresponsive. +# # `heartbeat`: return an error only if the agent coordinator is unresponsive. # # If no `failon` parameter is provided, the default behavior is `failon=heartbeat` # http: # # enables http endpoint diff --git a/_meta/config/elastic-agent.docker.yml.tmpl b/_meta/config/elastic-agent.docker.yml.tmpl index bc8ebee12f0..10f9c4147c8 100644 --- a/_meta/config/elastic-agent.docker.yml.tmpl +++ b/_meta/config/elastic-agent.docker.yml.tmpl @@ -18,17 +18,17 @@ inputs: data_stream.namespace: default use_output: default streams: - - metricsets: + - metricsets: - cpu # Dataset name must conform to the naming conventions for Elasticsearch indices, cannot contain dashes (-), and cannot exceed 100 bytes data_stream.dataset: system.cpu - - metricsets: + - metricsets: - memory data_stream.dataset: system.memory - - metricsets: + - metricsets: - network data_stream.dataset: system.network - - metricsets: + - metricsets: - filesystem data_stream.dataset: system.filesystem @@ -112,6 +112,8 @@ inputs: # logs: false # # enables metrics monitoring # metrics: false +# # metrics_period defines how frequent we should sample monitoring metrics. Default is 60 seconds. +# metrics_period: 60s # # exposes /debug/pprof/ endpoints # # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost # pprof.enabled: false @@ -127,7 +129,7 @@ inputs: # # The possible values for `failon` are: # # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive. # # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive. -# # `heartbeat`: return an error only if the agent coordinator is unresponsive. +# # `heartbeat`: return an error only if the agent coordinator is unresponsive. # # If no `failon` parameter is provided, the default behavior is `failon=heartbeat` # http: # # enables http endpoint diff --git a/changelog/fragments/1718818524-support-monitoring-metrics-interval.yaml b/changelog/fragments/1718818524-support-monitoring-metrics-interval.yaml new file mode 100644 index 00000000000..dd7c7c3557e --- /dev/null +++ b/changelog/fragments/1718818524-support-monitoring-metrics-interval.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: feature + +# Change summary; a 80ish characters long description of the change. +summary: Allow configuring `agent.monitoring.metrics_period`. + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: elastic-agent + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/elastic-agent/pull/4961 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +#issue: https://github.com/owner/repo/1234 diff --git a/elastic-agent.docker.yml b/elastic-agent.docker.yml index eac46becfd8..a0a6d5da8f6 100644 --- a/elastic-agent.docker.yml +++ b/elastic-agent.docker.yml @@ -18,17 +18,17 @@ inputs: data_stream.namespace: default use_output: default streams: - - metricsets: + - metricsets: - cpu # Dataset name must conform to the naming conventions for Elasticsearch indices, cannot contain dashes (-), and cannot exceed 100 bytes data_stream.dataset: system.cpu - - metricsets: + - metricsets: - memory data_stream.dataset: system.memory - - metricsets: + - metricsets: - network data_stream.dataset: system.network - - metricsets: + - metricsets: - filesystem data_stream.dataset: system.filesystem @@ -112,6 +112,8 @@ inputs: # logs: false # # enables metrics monitoring # metrics: false +# # metrics_period defines how frequent we should sample monitoring metrics. Default is 60 seconds. +# metrics_period: 60s # # exposes /debug/pprof/ endpoints # # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost # pprof.enabled: false @@ -127,7 +129,7 @@ inputs: # # The possible values for `failon` are: # # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive. # # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive. -# # `heartbeat`: return an error only if the agent coordinator is unresponsive. +# # `heartbeat`: return an error only if the agent coordinator is unresponsive. # # If no `failon` parameter is provided, the default behavior is `failon=heartbeat` # http: # # enables http endpoint diff --git a/elastic-agent.reference.yml b/elastic-agent.reference.yml index 7565fe2f76a..fa3f2429fe5 100644 --- a/elastic-agent.reference.yml +++ b/elastic-agent.reference.yml @@ -145,6 +145,8 @@ inputs: # logs: false # # enables metrics monitoring # metrics: false +# # metrics_period defines how frequent we should sample monitoring metrics. Default is 60 seconds. +# metrics_period: 60s # # exposes /debug/pprof/ endpoints # # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost # pprof.enabled: false @@ -162,7 +164,7 @@ inputs: # # The possible values for `failon` are: # # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive. # # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive. -# # `heartbeat`: return an error only if the agent coordinator is unresponsive. +# # `heartbeat`: return an error only if the agent coordinator is unresponsive. # # If no `failon` parameter is provided, the default behavior is `failon=heartbeat` # http: # # enables http endpoint diff --git a/elastic-agent.yml b/elastic-agent.yml index 0b4e2d38e3f..ae70df26213 100644 --- a/elastic-agent.yml +++ b/elastic-agent.yml @@ -66,6 +66,8 @@ inputs: # logs: true # # enables metrics monitoring # metrics: true +# # metrics_period defines how frequent we should sample monitoring metrics. Default is 60 seconds. +# metrics_period: 60s # # exposes /debug/pprof/ endpoints # # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost # pprof.enabled: false @@ -83,7 +85,7 @@ inputs: # # The possible values for `failon` are: # # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive. # # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive. -# # `heartbeat`: return an error only if the agent coordinator is unresponsive. +# # `heartbeat`: return an error only if the agent coordinator is unresponsive. # # If no `failon` parameter is provided, the default behavior is `failon=heartbeat` # http: # # enables http endpoint diff --git a/internal/pkg/agent/application/coordinator/diagnostics_test.go b/internal/pkg/agent/application/coordinator/diagnostics_test.go index 8d0b65e0df1..3dc45615bfa 100644 --- a/internal/pkg/agent/application/coordinator/diagnostics_test.go +++ b/internal/pkg/agent/application/coordinator/diagnostics_test.go @@ -97,6 +97,7 @@ agent: http: null logs: false metrics: false + metrics_period: "" namespace: "" pprof: null traces: true diff --git a/internal/pkg/agent/application/monitoring/v1_monitor.go b/internal/pkg/agent/application/monitoring/v1_monitor.go index ce6bda03fcc..d2947d84521 100644 --- a/internal/pkg/agent/application/monitoring/v1_monitor.go +++ b/internal/pkg/agent/application/monitoring/v1_monitor.go @@ -47,6 +47,7 @@ const ( agentKey = "agent" monitoringKey = "monitoring" useOutputKey = "use_output" + monitoringMetricsPeriodKey = "metrics_period" monitoringOutput = "monitoring" defaultMonitoringNamespace = "default" agentName = "elastic-agent" @@ -58,7 +59,7 @@ const ( // metricset execution period used for the monitoring metrics inputs // we set this to 60s to reduce the load/data volume on the monitoring cluster - metricsCollectionInterval = 60 * time.Second + defaultMetricsCollectionInterval = 60 * time.Second ) var ( @@ -129,6 +130,7 @@ func (b *BeatsMonitor) MonitoringConfig( cfg := make(map[string]interface{}) monitoringOutputName := defaultOutputName + metricsCollectionIntervalString := b.config.C.MetricsPeriod if agentCfg, found := policy[agentKey]; found { // The agent section is required for feature flags cfg[agentKey] = agentCfg @@ -143,6 +145,12 @@ func (b *BeatsMonitor) MonitoringConfig( monitoringOutputName = useStr } } + + if metricsPeriod, found := monitoringMap[monitoringMetricsPeriodKey]; found { + if metricsPeriodStr, ok := metricsPeriod.(string); ok { + metricsCollectionIntervalString = metricsPeriodStr + } + } } } } @@ -165,7 +173,7 @@ func (b *BeatsMonitor) MonitoringConfig( } if b.config.C.MonitorMetrics { - if err := b.injectMetricsInput(cfg, componentIDToBinary, components, componentIDPidMap); err != nil { + if err := b.injectMetricsInput(cfg, componentIDToBinary, components, componentIDPidMap, metricsCollectionIntervalString); err != nil { return nil, errors.New(err, "failed to inject monitoring output") } } @@ -542,8 +550,16 @@ func (b *BeatsMonitor) monitoringNamespace() string { } // injectMetricsInput injects monitoring config for agent monitoring to the `cfg` object. -func (b *BeatsMonitor) injectMetricsInput(cfg map[string]interface{}, componentIDToBinary map[string]string, componentList []component.Component, existingStateServicePids map[string]uint64) error { - metricsCollectionIntervalString := metricsCollectionInterval.String() +func (b *BeatsMonitor) injectMetricsInput( + cfg map[string]interface{}, + componentIDToBinary map[string]string, + componentList []component.Component, + existingStateServicePids map[string]uint64, + metricsCollectionIntervalString string, +) error { + if metricsCollectionIntervalString == "" { + metricsCollectionIntervalString = defaultMetricsCollectionInterval.String() + } monitoringNamespace := b.monitoringNamespace() fixedAgentName := strings.ReplaceAll(agentName, "-", "_") beatsStreams := make([]interface{}, 0, len(componentIDToBinary)) diff --git a/internal/pkg/agent/application/monitoring/v1_monitor_test.go b/internal/pkg/agent/application/monitoring/v1_monitor_test.go index b3b9a633aa4..210271dc3d7 100644 --- a/internal/pkg/agent/application/monitoring/v1_monitor_test.go +++ b/internal/pkg/agent/application/monitoring/v1_monitor_test.go @@ -116,75 +116,145 @@ func TestMonitoringConfigMetricsInterval(t *testing.T) { agentInfo, err := info.NewAgentInfo(context.Background(), false) require.NoError(t, err, "Error creating agent info") - mCfg := &monitoringConfig{ - C: &monitoringcfg.MonitoringConfig{ - Enabled: true, - MonitorMetrics: true, - HTTP: &monitoringcfg.MonitoringHTTPConfig{ - Enabled: false, + tcs := []struct { + name string + monitoringCfg *monitoringConfig + policy map[string]any + expectedInterval time.Duration + }{ + { + name: "default metrics interval", + monitoringCfg: &monitoringConfig{ + C: &monitoringcfg.MonitoringConfig{ + Enabled: true, + MonitorMetrics: true, + HTTP: &monitoringcfg.MonitoringHTTPConfig{ + Enabled: false, + }, + }, + }, + policy: map[string]any{ + "agent": map[string]any{ + "monitoring": map[string]any{ + "metrics": true, + "http": map[string]any{ + "enabled": false, + }, + }, + }, + "outputs": map[string]any{ + "default": map[string]any{}, + }, }, + expectedInterval: defaultMetricsCollectionInterval, }, - } - - policy := map[string]any{ - "agent": map[string]any{ - "monitoring": map[string]any{ - "metrics": true, - "http": map[string]any{ - "enabled": false, + { + name: "agent config metrics interval", + monitoringCfg: &monitoringConfig{ + C: &monitoringcfg.MonitoringConfig{ + Enabled: true, + MonitorMetrics: true, + HTTP: &monitoringcfg.MonitoringHTTPConfig{ + Enabled: false, + }, + MetricsPeriod: "20s", + }, + }, + policy: map[string]any{ + "agent": map[string]any{ + "monitoring": map[string]any{ + "metrics": true, + "http": map[string]any{ + "enabled": false, + }, + }, + }, + "outputs": map[string]any{ + "default": map[string]any{}, }, }, + expectedInterval: 20 * time.Second, }, - "outputs": map[string]any{ - "default": map[string]any{}, + { + name: "policy metrics interval", + monitoringCfg: &monitoringConfig{ + C: &monitoringcfg.MonitoringConfig{ + Enabled: true, + MonitorMetrics: true, + HTTP: &monitoringcfg.MonitoringHTTPConfig{ + Enabled: false, + }, + MetricsPeriod: "20s", + }, + }, + policy: map[string]any{ + "agent": map[string]any{ + "monitoring": map[string]any{ + "metrics": true, + "http": map[string]any{ + "enabled": false, + }, + "metrics_period": "10s", + }, + }, + "outputs": map[string]any{ + "default": map[string]any{}, + }, + }, + expectedInterval: 10 * time.Second, }, } - b := &BeatsMonitor{ - enabled: true, - config: mCfg, - operatingSystem: runtime.GOOS, - agentInfo: agentInfo, - } - got, err := b.MonitoringConfig(policy, nil, map[string]string{"foobeat": "filebeat"}, map[string]uint64{}) // put a componentID/binary mapping to have something in the beats monitoring input - assert.NoError(t, err) - - rawInputs, ok := got["inputs"] - require.True(t, ok, "monitoring config contains no input") - inputs, ok := rawInputs.([]any) - require.True(t, ok, "monitoring inputs are not a list") - marshaledInputs, err := yaml.Marshal(inputs) - if assert.NoError(t, err, "error marshaling monitoring inputs") { - t.Logf("marshaled monitoring inputs:\n%s\n", marshaledInputs) - } - // loop over the created inputs - for _, i := range inputs { - input, ok := i.(map[string]any) - if assert.Truef(t, ok, "input is not represented as a map: %v", i) { - inputID := input["id"] - t.Logf("input %q", inputID) - // check the streams created for the input, should be a list of objects - if assert.Contains(t, input, "streams", "input %q does not contain any stream", inputID) && - assert.IsTypef(t, []any{}, input["streams"], "streams for input %q are not a list of objects", inputID) { - // loop over streams and cast to map[string]any to access keys - for _, rawStream := range input["streams"].([]any) { - if assert.IsTypef(t, map[string]any{}, rawStream, "stream %v for input %q is not a map", rawStream, inputID) { - stream := rawStream.(map[string]any) - // check period and assert its value - streamID := stream["id"] - if assert.Containsf(t, stream, "period", "stream %q for input %q does not contain a period", streamID, inputID) && - assert.IsType(t, "", stream["period"], "period for stream %q of input %q is not represented as a string", streamID, inputID) { - periodString := stream["period"].(string) - duration, err := time.ParseDuration(periodString) - if assert.NoErrorf(t, err, "Unparseable period duration %s for stream %q of input %q", periodString, streamID, inputID) { - assert.Equalf(t, duration, 60*time.Second, "unexpected duration for stream %q of input %q", streamID, inputID) + for _, tc := range tcs { + + t.Run(tc.name, func(t *testing.T) { + b := &BeatsMonitor{ + enabled: true, + config: tc.monitoringCfg, + operatingSystem: runtime.GOOS, + agentInfo: agentInfo, + } + got, err := b.MonitoringConfig(tc.policy, nil, map[string]string{"foobeat": "filebeat"}, map[string]uint64{}) // put a componentID/binary mapping to have something in the beats monitoring input + assert.NoError(t, err) + + rawInputs, ok := got["inputs"] + require.True(t, ok, "monitoring config contains no input") + inputs, ok := rawInputs.([]any) + require.True(t, ok, "monitoring inputs are not a list") + marshaledInputs, err := yaml.Marshal(inputs) + if assert.NoError(t, err, "error marshaling monitoring inputs") { + t.Logf("marshaled monitoring inputs:\n%s\n", marshaledInputs) + } + + // loop over the created inputs + for _, i := range inputs { + input, ok := i.(map[string]any) + if assert.Truef(t, ok, "input is not represented as a map: %v", i) { + inputID := input["id"] + t.Logf("input %q", inputID) + // check the streams created for the input, should be a list of objects + if assert.Contains(t, input, "streams", "input %q does not contain any stream", inputID) && + assert.IsTypef(t, []any{}, input["streams"], "streams for input %q are not a list of objects", inputID) { + // loop over streams and cast to map[string]any to access keys + for _, rawStream := range input["streams"].([]any) { + if assert.IsTypef(t, map[string]any{}, rawStream, "stream %v for input %q is not a map", rawStream, inputID) { + stream := rawStream.(map[string]any) + // check period and assert its value + streamID := stream["id"] + if assert.Containsf(t, stream, "period", "stream %q for input %q does not contain a period", streamID, inputID) && + assert.IsType(t, "", stream["period"], "period for stream %q of input %q is not represented as a string", streamID, inputID) { + periodString := stream["period"].(string) + duration, err := time.ParseDuration(periodString) + if assert.NoErrorf(t, err, "Unparseable period duration %s for stream %q of input %q", periodString, streamID, inputID) { + assert.Equalf(t, duration, tc.expectedInterval, "unexpected duration for stream %q of input %q", streamID, inputID) + } + } } } } } } - } - + }) } } diff --git a/internal/pkg/core/monitoring/config/config.go b/internal/pkg/core/monitoring/config/config.go index 3fb31918c08..20b2434be83 100644 --- a/internal/pkg/core/monitoring/config/config.go +++ b/internal/pkg/core/monitoring/config/config.go @@ -24,6 +24,7 @@ type MonitoringConfig struct { Enabled bool `yaml:"enabled" config:"enabled"` MonitorLogs bool `yaml:"logs" config:"logs"` MonitorMetrics bool `yaml:"metrics" config:"metrics"` + MetricsPeriod string `yaml:"metrics_period" config:"metrics_period"` LogMetrics bool `yaml:"-" config:"-"` HTTP *MonitoringHTTPConfig `yaml:"http" config:"http"` Namespace string `yaml:"namespace" config:"namespace"`