Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce agent.monitoring.metrics_period #4961

Merged
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion _meta/config/common.p2.yml.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ inputs:
# logs: true
# # enables metrics monitoring
# metrics: true
# # metrics_period defines how frequently we should sample monitoring metrics. Default is 60 seconds.
# metrics_period: 60s
# # exposes /debug/pprof/ endpoints
# # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
# pprof.enabled: false
Expand All @@ -77,7 +79,7 @@ inputs:
# # The possible values for `failon` are:
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
# http:
# # enables http endpoint
Expand Down
4 changes: 3 additions & 1 deletion _meta/config/common.reference.p2.yml.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ inputs:
# logs: false
# # enables metrics monitoring
# metrics: false
# # metrics_period defines how frequently we should sample monitoring metrics. Default is 60 seconds.
# metrics_period: 60s
# # exposes /debug/pprof/ endpoints
# # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
# pprof.enabled: false
Expand All @@ -156,7 +158,7 @@ inputs:
# # The possible values for `failon` are:
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
# http:
# # enables http endpoint
Expand Down
12 changes: 7 additions & 5 deletions _meta/config/elastic-agent.docker.yml.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,17 @@ inputs:
data_stream.namespace: default
use_output: default
streams:
- metricsets:
- metricsets:
- cpu
# Dataset name must conform to the naming conventions for Elasticsearch indices, cannot contain dashes (-), and cannot exceed 100 bytes
data_stream.dataset: system.cpu
- metricsets:
- metricsets:
- memory
data_stream.dataset: system.memory
- metricsets:
- metricsets:
- network
data_stream.dataset: system.network
- metricsets:
- metricsets:
- filesystem
data_stream.dataset: system.filesystem

Expand Down Expand Up @@ -112,6 +112,8 @@ inputs:
# logs: false
# # enables metrics monitoring
# metrics: false
# # metrics_period defines how frequently we should sample monitoring metrics. Default is 60 seconds.
# metrics_period: 60s
# # exposes /debug/pprof/ endpoints
# # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
# pprof.enabled: false
Expand All @@ -127,7 +129,7 @@ inputs:
# # The possible values for `failon` are:
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
# http:
# # enables http endpoint
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: feature

# Change summary; an 80ish-character description of the change.
summary: support agent monitoring metrics interval
pkoutsovasilis marked this conversation as resolved.
Show resolved Hide resolved

# Long description; in case the summary is not enough to describe the change,
# this field accommodates a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
#description:

# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
component: elastic-agent

# PR URL; optional; the PR number that added the changeset.
# If not present, it is automatically filled in by the tooling, which finds the PR where this changelog fragment was added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
pr: https://github.com/elastic/elastic-agent/pull/4961

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present, it is automatically filled in by the tooling with the issue linked to the PR number.
#issue: https://github.com/owner/repo/1234
12 changes: 7 additions & 5 deletions elastic-agent.docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,17 @@ inputs:
data_stream.namespace: default
use_output: default
streams:
- metricsets:
- metricsets:
- cpu
# Dataset name must conform to the naming conventions for Elasticsearch indices, cannot contain dashes (-), and cannot exceed 100 bytes
data_stream.dataset: system.cpu
- metricsets:
- metricsets:
- memory
data_stream.dataset: system.memory
- metricsets:
- metricsets:
- network
data_stream.dataset: system.network
- metricsets:
- metricsets:
- filesystem
data_stream.dataset: system.filesystem

Expand Down Expand Up @@ -112,6 +112,8 @@ inputs:
# logs: false
# # enables metrics monitoring
# metrics: false
# # metrics_period defines how frequently we should sample monitoring metrics. Default is 60 seconds.
# metrics_period: 60s
# # exposes /debug/pprof/ endpoints
# # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
# pprof.enabled: false
Expand All @@ -127,7 +129,7 @@ inputs:
# # The possible values for `failon` are:
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
# http:
# # enables http endpoint
Expand Down
4 changes: 3 additions & 1 deletion elastic-agent.reference.yml
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ inputs:
# logs: false
# # enables metrics monitoring
# metrics: false
# # metrics_period defines how frequently we should sample monitoring metrics. Default is 60 seconds.
# metrics_period: 60s
# # exposes /debug/pprof/ endpoints
# # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
# pprof.enabled: false
Expand All @@ -162,7 +164,7 @@ inputs:
# # The possible values for `failon` are:
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
# http:
# # enables http endpoint
Expand Down
4 changes: 3 additions & 1 deletion elastic-agent.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ inputs:
# logs: true
# # enables metrics monitoring
# metrics: true
# # metrics_period defines how frequently we should sample monitoring metrics. Default is 60 seconds.
# metrics_period: 60s
# # exposes /debug/pprof/ endpoints
# # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
# pprof.enabled: false
Expand All @@ -83,7 +85,7 @@ inputs:
# # The possible values for `failon` are:
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
# http:
# # enables http endpoint
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ agent:
http: null
logs: false
metrics: false
metrics_period: ""
namespace: ""
pprof: null
traces: true
Expand Down
24 changes: 20 additions & 4 deletions internal/pkg/agent/application/monitoring/v1_monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ const (
agentKey = "agent"
monitoringKey = "monitoring"
useOutputKey = "use_output"
monitoringMetricsPeriodKey = "metrics_period"
monitoringOutput = "monitoring"
defaultMonitoringNamespace = "default"
agentName = "elastic-agent"
Expand All @@ -58,7 +59,7 @@ const (

// default metricset execution period used for the monitoring metrics inputs when no metrics_period is configured
// we default to 60s to reduce the load/data volume on the monitoring cluster
metricsCollectionInterval = 60 * time.Second
defaultMetricsCollectionInterval = 60 * time.Second
)

var (
Expand Down Expand Up @@ -129,6 +130,7 @@ func (b *BeatsMonitor) MonitoringConfig(
cfg := make(map[string]interface{})

monitoringOutputName := defaultOutputName
metricsCollectionIntervalString := b.config.C.MetricsPeriod
if agentCfg, found := policy[agentKey]; found {
// The agent section is required for feature flags
cfg[agentKey] = agentCfg
Expand All @@ -143,6 +145,12 @@ func (b *BeatsMonitor) MonitoringConfig(
monitoringOutputName = useStr
}
}

if metricsPeriod, found := monitoringMap[monitoringMetricsPeriodKey]; found {
if metricsPeriodStr, ok := metricsPeriod.(string); ok {
metricsCollectionIntervalString = metricsPeriodStr
}
}
}
}
}
Expand All @@ -165,7 +173,7 @@ func (b *BeatsMonitor) MonitoringConfig(
}

if b.config.C.MonitorMetrics {
if err := b.injectMetricsInput(cfg, componentIDToBinary, components, componentIDPidMap); err != nil {
if err := b.injectMetricsInput(cfg, componentIDToBinary, components, componentIDPidMap, metricsCollectionIntervalString); err != nil {
return nil, errors.New(err, "failed to inject monitoring output")
}
}
Expand Down Expand Up @@ -542,8 +550,16 @@ func (b *BeatsMonitor) monitoringNamespace() string {
}

// injectMetricsInput injects monitoring config for agent monitoring to the `cfg` object.
func (b *BeatsMonitor) injectMetricsInput(cfg map[string]interface{}, componentIDToBinary map[string]string, componentList []component.Component, existingStateServicePids map[string]uint64) error {
metricsCollectionIntervalString := metricsCollectionInterval.String()
func (b *BeatsMonitor) injectMetricsInput(
cfg map[string]interface{},
componentIDToBinary map[string]string,
componentList []component.Component,
existingStateServicePids map[string]uint64,
metricsCollectionIntervalString string,
) error {
if metricsCollectionIntervalString == "" {
metricsCollectionIntervalString = defaultMetricsCollectionInterval.String()
}
monitoringNamespace := b.monitoringNamespace()
fixedAgentName := strings.ReplaceAll(agentName, "-", "_")
beatsStreams := make([]interface{}, 0, len(componentIDToBinary))
Expand Down
Loading