From fce8a9c6015e0021d235fbc6ce405e43707dfc7c Mon Sep 17 00:00:00 2001 From: "mergify[bot]" <37929162+mergify[bot]@users.noreply.github.com> Date: Wed, 26 Jun 2024 09:19:02 -0400 Subject: [PATCH] [8.14](backport #4961) Introduce agent.monitoring.metrics_period (#5003) * Introduce agent.monitoring.metrics_period (#4961) * feat: introduce agent.monitoring.metrics_period * doc: add changelog/fragments * fix: TestDiagnosticLocalConfig unit-test * doc: reword summary in changelog fragment (cherry picked from commit 6a452564dd42ad192e68224d7177377d9ab25cc7) # Conflicts: # _meta/config/common.p2.yml.tmpl # _meta/config/common.reference.p2.yml.tmpl # _meta/config/elastic-agent.docker.yml.tmpl # elastic-agent.docker.yml # elastic-agent.reference.yml # elastic-agent.yml # internal/pkg/agent/application/monitoring/v1_monitor.go # internal/pkg/agent/application/monitoring/v1_monitor_test.go * fix conflicts --------- Co-authored-by: Panos Koutsovasilis --- _meta/config/common.p2.yml.tmpl | 2 + _meta/config/common.reference.p2.yml.tmpl | 2 + _meta/config/elastic-agent.docker.yml.tmpl | 10 +- ...4-support-monitoring-metrics-interval.yaml | 32 +++ elastic-agent.docker.yml | 10 +- elastic-agent.reference.yml | 2 + elastic-agent.yml | 2 + .../coordinator/diagnostics_test.go | 1 + .../application/monitoring/v1_monitor.go | 20 +- .../application/monitoring/v1_monitor_test.go | 182 ++++++++++++------ internal/pkg/core/monitoring/config/config.go | 1 + 11 files changed, 196 insertions(+), 68 deletions(-) create mode 100644 changelog/fragments/1718818524-support-monitoring-metrics-interval.yaml diff --git a/_meta/config/common.p2.yml.tmpl b/_meta/config/common.p2.yml.tmpl index 7ca36f155a1..53fb5bbd16e 100644 --- a/_meta/config/common.p2.yml.tmpl +++ b/_meta/config/common.p2.yml.tmpl @@ -60,6 +60,8 @@ inputs: # logs: true # # enables metrics monitoring # metrics: true +# # metrics_period defines how frequent we should sample monitoring metrics. Default is 60 seconds. +# metrics_period: 60s # # exposes /debug/pprof/ endpoints # # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost # pprof.enabled: false diff --git a/_meta/config/common.reference.p2.yml.tmpl b/_meta/config/common.reference.p2.yml.tmpl index 325cd13ab18..381df860859 100644 --- a/_meta/config/common.reference.p2.yml.tmpl +++ b/_meta/config/common.reference.p2.yml.tmpl @@ -139,6 +139,8 @@ inputs: # logs: false # # enables metrics monitoring # metrics: false +# # metrics_period defines how frequent we should sample monitoring metrics. Default is 60 seconds. +# metrics_period: 60s # # exposes /debug/pprof/ endpoints # # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost # pprof.enabled: false diff --git a/_meta/config/elastic-agent.docker.yml.tmpl b/_meta/config/elastic-agent.docker.yml.tmpl index 134aecf3249..4cad4db7a67 100644 --- a/_meta/config/elastic-agent.docker.yml.tmpl +++ b/_meta/config/elastic-agent.docker.yml.tmpl @@ -18,17 +18,17 @@ inputs: data_stream.namespace: default use_output: default streams: - - metricsets: + - metricsets: - cpu # Dataset name must conform to the naming conventions for Elasticsearch indices, cannot contain dashes (-), and cannot exceed 100 bytes data_stream.dataset: system.cpu - - metricsets: + - metricsets: - memory data_stream.dataset: system.memory - - metricsets: + - metricsets: - network data_stream.dataset: system.network - - metricsets: + - metricsets: - filesystem data_stream.dataset: system.filesystem @@ -112,6 +112,8 @@ inputs: # logs: false # # enables metrics monitoring # metrics: false +# # metrics_period defines how frequent we should sample monitoring metrics. Default is 60 seconds. +# metrics_period: 60s # # exposes /debug/pprof/ endpoints # # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost # pprof.enabled: false diff --git a/changelog/fragments/1718818524-support-monitoring-metrics-interval.yaml b/changelog/fragments/1718818524-support-monitoring-metrics-interval.yaml new file mode 100644 index 00000000000..dd7c7c3557e --- /dev/null +++ b/changelog/fragments/1718818524-support-monitoring-metrics-interval.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: feature + +# Change summary; a 80ish characters long description of the change. +summary: Allow configuring `agent.monitoring.metrics_period`. + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: elastic-agent + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/elastic-agent/pull/4961 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +#issue: https://github.com/owner/repo/1234 diff --git a/elastic-agent.docker.yml b/elastic-agent.docker.yml index 87aa2cdd0cd..07bbedabaa5 100644 --- a/elastic-agent.docker.yml +++ b/elastic-agent.docker.yml @@ -18,17 +18,17 @@ inputs: data_stream.namespace: default use_output: default streams: - - metricsets: + - metricsets: - cpu # Dataset name must conform to the naming conventions for Elasticsearch indices, cannot contain dashes (-), and cannot exceed 100 bytes data_stream.dataset: system.cpu - - metricsets: + - metricsets: - memory data_stream.dataset: system.memory - - metricsets: + - metricsets: - network data_stream.dataset: system.network - - metricsets: + - metricsets: - filesystem data_stream.dataset: system.filesystem @@ -112,6 +112,8 @@ inputs: # logs: false # # enables metrics monitoring # metrics: false +# # metrics_period defines how frequent we should sample monitoring metrics. Default is 60 seconds. +# metrics_period: 60s # # exposes /debug/pprof/ endpoints # # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost # pprof.enabled: false diff --git a/elastic-agent.reference.yml b/elastic-agent.reference.yml index 7ee19406674..03232c45a07 100644 --- a/elastic-agent.reference.yml +++ b/elastic-agent.reference.yml @@ -145,6 +145,8 @@ inputs: # logs: false # # enables metrics monitoring # metrics: false +# # metrics_period defines how frequent we should sample monitoring metrics. Default is 60 seconds. +# metrics_period: 60s # # exposes /debug/pprof/ endpoints # # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost # pprof.enabled: false diff --git a/elastic-agent.yml b/elastic-agent.yml index 342fd50f432..23255367709 100644 --- a/elastic-agent.yml +++ b/elastic-agent.yml @@ -66,6 +66,8 @@ inputs: # logs: true # # enables metrics monitoring # metrics: true +# # metrics_period defines how frequent we should sample monitoring metrics. Default is 60 seconds. +# metrics_period: 60s # # exposes /debug/pprof/ endpoints # # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost # pprof.enabled: false diff --git a/internal/pkg/agent/application/coordinator/diagnostics_test.go b/internal/pkg/agent/application/coordinator/diagnostics_test.go index cd2b6e9dcae..4fa389304b6 100644 --- a/internal/pkg/agent/application/coordinator/diagnostics_test.go +++ b/internal/pkg/agent/application/coordinator/diagnostics_test.go @@ -97,6 +97,7 @@ agent: http: null logs: false metrics: false + metrics_period: "" namespace: "" pprof: null traces: true diff --git a/internal/pkg/agent/application/monitoring/v1_monitor.go b/internal/pkg/agent/application/monitoring/v1_monitor.go index 82fce6ff29b..fb382c21cdf 100644 --- a/internal/pkg/agent/application/monitoring/v1_monitor.go +++ b/internal/pkg/agent/application/monitoring/v1_monitor.go @@ -47,6 +47,7 @@ const ( agentKey = "agent" monitoringKey = "monitoring" useOutputKey = "use_output" + monitoringMetricsPeriodKey = "metrics_period" monitoringOutput = "monitoring" defaultMonitoringNamespace = "default" agentName = "elastic-agent" @@ -58,7 +59,7 @@ const ( // metricset execution period used for the monitoring metrics inputs // we set this to 60s to reduce the load/data volume on the monitoring cluster - metricsCollectionInterval = 60 * time.Second + defaultMetricsCollectionInterval = 60 * time.Second ) var ( @@ -122,6 +123,7 @@ func (b *BeatsMonitor) MonitoringConfig( cfg := make(map[string]interface{}) monitoringOutputName := defaultOutputName + metricsCollectionIntervalString := b.config.C.MetricsPeriod if agentCfg, found := policy[agentKey]; found { // The agent section is required for feature flags cfg[agentKey] = agentCfg @@ -136,6 +138,12 @@ func (b *BeatsMonitor) MonitoringConfig( monitoringOutputName = useStr } } + + if metricsPeriod, found := monitoringMap[monitoringMetricsPeriodKey]; found { + if metricsPeriodStr, ok := metricsPeriod.(string); ok { + metricsCollectionIntervalString = metricsPeriodStr + } + } } } } @@ -158,7 +166,7 @@ func (b *BeatsMonitor) MonitoringConfig( } if b.config.C.MonitorMetrics { - if err := b.injectMetricsInput(cfg, componentIDToBinary, monitoringOutput, components); err != nil { + if err := b.injectMetricsInput(cfg, componentIDToBinary, components, metricsCollectionIntervalString); err != nil { return nil, errors.New(err, "failed to inject monitoring output") } } @@ -534,8 +542,12 @@ func (b *BeatsMonitor) monitoringNamespace() string { return defaultMonitoringNamespace } -func (b *BeatsMonitor) injectMetricsInput(cfg map[string]interface{}, componentIDToBinary map[string]string, monitoringOutputName string, componentList []component.Component) error { - metricsCollectionIntervalString := metricsCollectionInterval.String() +// injectMetricsInput injects monitoring config for agent monitoring to the `cfg` object. +func (b *BeatsMonitor) injectMetricsInput(cfg map[string]interface{}, componentIDToBinary map[string]string, componentList []component.Component, metricsCollectionIntervalString string) error { + if metricsCollectionIntervalString == "" { + metricsCollectionIntervalString = defaultMetricsCollectionInterval.String() + } + monitoringNamespace := b.monitoringNamespace() fixedAgentName := strings.ReplaceAll(agentName, "-", "_") beatsStreams := make([]interface{}, 0, len(componentIDToBinary)) diff --git a/internal/pkg/agent/application/monitoring/v1_monitor_test.go b/internal/pkg/agent/application/monitoring/v1_monitor_test.go index f18a459b4cc..d380381b42a 100644 --- a/internal/pkg/agent/application/monitoring/v1_monitor_test.go +++ b/internal/pkg/agent/application/monitoring/v1_monitor_test.go @@ -27,75 +27,145 @@ func TestMonitoringConfigMetricsInterval(t *testing.T) { agentInfo, err := info.NewAgentInfo(context.Background(), false) require.NoError(t, err, "Error creating agent info") - mCfg := &monitoringConfig{ - C: &monitoringcfg.MonitoringConfig{ - Enabled: true, - MonitorMetrics: true, - HTTP: &monitoringcfg.MonitoringHTTPConfig{ - Enabled: false, + tcs := []struct { + name string + monitoringCfg *monitoringConfig + policy map[string]any + expectedInterval time.Duration + }{ + { + name: "default metrics interval", + monitoringCfg: &monitoringConfig{ + C: &monitoringcfg.MonitoringConfig{ + Enabled: true, + MonitorMetrics: true, + HTTP: &monitoringcfg.MonitoringHTTPConfig{ + Enabled: false, + }, + }, + }, + policy: map[string]any{ + "agent": map[string]any{ + "monitoring": map[string]any{ + "metrics": true, + "http": map[string]any{ + "enabled": false, + }, + }, + }, + "outputs": map[string]any{ + "default": map[string]any{}, + }, }, + expectedInterval: defaultMetricsCollectionInterval, }, - } - - policy := map[string]any{ - "agent": map[string]any{ - "monitoring": map[string]any{ - "metrics": true, - "http": map[string]any{ - "enabled": false, + { + name: "agent config metrics interval", + monitoringCfg: &monitoringConfig{ + C: &monitoringcfg.MonitoringConfig{ + Enabled: true, + MonitorMetrics: true, + HTTP: &monitoringcfg.MonitoringHTTPConfig{ + Enabled: false, + }, + MetricsPeriod: "20s", }, }, + policy: map[string]any{ + "agent": map[string]any{ + "monitoring": map[string]any{ + "metrics": true, + "http": map[string]any{ + "enabled": false, + }, + }, + }, + "outputs": map[string]any{ + "default": map[string]any{}, + }, + }, + expectedInterval: 20 * time.Second, }, - "outputs": map[string]any{ - "default": map[string]any{}, + { + name: "policy metrics interval", + monitoringCfg: &monitoringConfig{ + C: &monitoringcfg.MonitoringConfig{ + Enabled: true, + MonitorMetrics: true, + HTTP: &monitoringcfg.MonitoringHTTPConfig{ + Enabled: false, + }, + MetricsPeriod: "20s", + }, + }, + policy: map[string]any{ + "agent": map[string]any{ + "monitoring": map[string]any{ + "metrics": true, + "http": map[string]any{ + "enabled": false, + }, + "metrics_period": "10s", + }, + }, + "outputs": map[string]any{ + "default": map[string]any{}, + }, + }, + expectedInterval: 10 * time.Second, }, } - b := &BeatsMonitor{ - enabled: true, - config: mCfg, - operatingSystem: runtime.GOOS, - agentInfo: agentInfo, - } - got, err := b.MonitoringConfig(policy, nil, map[string]string{"foobeat": "filebeat"}) // put a componentID/binary mapping to have something in the beats monitoring input - assert.NoError(t, err) - - rawInputs, ok := got["inputs"] - require.True(t, ok, "monitoring config contains no input") - inputs, ok := rawInputs.([]any) - require.True(t, ok, "monitoring inputs are not a list") - marshaledInputs, err := yaml.Marshal(inputs) - if assert.NoError(t, err, "error marshaling monitoring inputs") { - t.Logf("marshaled monitoring inputs:\n%s\n", marshaledInputs) - } - // loop over the created inputs - for _, i := range inputs { - input, ok := i.(map[string]any) - if assert.Truef(t, ok, "input is not represented as a map: %v", i) { - inputID := input["id"] - t.Logf("input %q", inputID) - // check the streams created for the input, should be a list of objects - if assert.Contains(t, input, "streams", "input %q does not contain any stream", inputID) && - assert.IsTypef(t, []any{}, input["streams"], "streams for input %q are not a list of objects", inputID) { - // loop over streams and cast to map[string]any to access keys - for _, rawStream := range input["streams"].([]any) { - if assert.IsTypef(t, map[string]any{}, rawStream, "stream %v for input %q is not a map", rawStream, inputID) { - stream := rawStream.(map[string]any) - // check period and assert its value - streamID := stream["id"] - if assert.Containsf(t, stream, "period", "stream %q for input %q does not contain a period", streamID, inputID) && - assert.IsType(t, "", stream["period"], "period for stream %q of input %q is not represented as a string", streamID, inputID) { - periodString := stream["period"].(string) - duration, err := time.ParseDuration(periodString) - if assert.NoErrorf(t, err, "Unparseable period duration %s for stream %q of input %q", periodString, streamID, inputID) { - assert.Equalf(t, duration, 60*time.Second, "unexpected duration for stream %q of input %q", streamID, inputID) + for _, tc := range tcs { + + t.Run(tc.name, func(t *testing.T) { + b := &BeatsMonitor{ + enabled: true, + config: tc.monitoringCfg, + operatingSystem: runtime.GOOS, + agentInfo: agentInfo, + } + got, err := b.MonitoringConfig(tc.policy, nil, map[string]string{"foobeat": "filebeat"}) // put a componentID/binary mapping to have something in the beats monitoring input + assert.NoError(t, err) + + rawInputs, ok := got["inputs"] + require.True(t, ok, "monitoring config contains no input") + inputs, ok := rawInputs.([]any) + require.True(t, ok, "monitoring inputs are not a list") + marshaledInputs, err := yaml.Marshal(inputs) + if assert.NoError(t, err, "error marshaling monitoring inputs") { + t.Logf("marshaled monitoring inputs:\n%s\n", marshaledInputs) + } + + // loop over the created inputs + for _, i := range inputs { + input, ok := i.(map[string]any) + if assert.Truef(t, ok, "input is not represented as a map: %v", i) { + inputID := input["id"] + t.Logf("input %q", inputID) + // check the streams created for the input, should be a list of objects + if assert.Contains(t, input, "streams", "input %q does not contain any stream", inputID) && + assert.IsTypef(t, []any{}, input["streams"], "streams for input %q are not a list of objects", inputID) { + // loop over streams and cast to map[string]any to access keys + for _, rawStream := range input["streams"].([]any) { + if assert.IsTypef(t, map[string]any{}, rawStream, "stream %v for input %q is not a map", rawStream, inputID) { + stream := rawStream.(map[string]any) + // check period and assert its value + streamID := stream["id"] + if assert.Containsf(t, stream, "period", "stream %q for input %q does not contain a period", streamID, inputID) && + assert.IsType(t, "", stream["period"], "period for stream %q of input %q is not represented as a string", streamID, inputID) { + periodString := stream["period"].(string) + duration, err := time.ParseDuration(periodString) + if assert.NoErrorf(t, err, "Unparseable period duration %s for stream %q of input %q", periodString, streamID, inputID) { + assert.Equalf(t, duration, tc.expectedInterval, "unexpected duration for stream %q of input %q", streamID, inputID) + } + } } } } } } - } - + }) } } diff --git a/internal/pkg/core/monitoring/config/config.go b/internal/pkg/core/monitoring/config/config.go index cc6f345e123..2f026c00a64 100644 --- a/internal/pkg/core/monitoring/config/config.go +++ b/internal/pkg/core/monitoring/config/config.go @@ -24,6 +24,7 @@ type MonitoringConfig struct { Enabled bool `yaml:"enabled" config:"enabled"` MonitorLogs bool `yaml:"logs" config:"logs"` MonitorMetrics bool `yaml:"metrics" config:"metrics"` + MetricsPeriod string `yaml:"metrics_period" config:"metrics_period"` LogMetrics bool `yaml:"-" config:"-"` HTTP *MonitoringHTTPConfig `yaml:"http" config:"http"` Namespace string `yaml:"namespace" config:"namespace"`