Skip to content

Commit

Permalink
feat: introduce agent.monitoring.metrics_period
Browse files Browse the repository at this point in the history
  • Loading branch information
pkoutsovasilis committed Jun 19, 2024
1 parent 4ccdd09 commit 709f03e
Show file tree
Hide file tree
Showing 9 changed files with 173 additions and 74 deletions.
4 changes: 3 additions & 1 deletion _meta/config/common.p2.yml.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ inputs:
# logs: true
# # enables metrics monitoring
# metrics: true
# # metrics_period defines how frequent we should sample monitoring metrics. Default is 60 seconds.
# metrics_period: 60s
# # exposes /debug/pprof/ endpoints
# # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
# pprof.enabled: false
Expand All @@ -77,7 +79,7 @@ inputs:
# # The possible values for `failon` are:
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
# http:
# # enables http endpoint
Expand Down
4 changes: 3 additions & 1 deletion _meta/config/common.reference.p2.yml.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ inputs:
# logs: false
# # enables metrics monitoring
# metrics: false
# # metrics_period defines how frequent we should sample monitoring metrics. Default is 60 seconds.
# metrics_period: 60s
# # exposes /debug/pprof/ endpoints
# # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
# pprof.enabled: false
Expand All @@ -156,7 +158,7 @@ inputs:
# # The possible values for `failon` are:
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
# http:
# # enables http endpoint
Expand Down
12 changes: 7 additions & 5 deletions _meta/config/elastic-agent.docker.yml.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,17 @@ inputs:
data_stream.namespace: default
use_output: default
streams:
- metricsets:
- metricsets:
- cpu
# Dataset name must conform to the naming conventions for Elasticsearch indices, cannot contain dashes (-), and cannot exceed 100 bytes
data_stream.dataset: system.cpu
- metricsets:
- metricsets:
- memory
data_stream.dataset: system.memory
- metricsets:
- metricsets:
- network
data_stream.dataset: system.network
- metricsets:
- metricsets:
- filesystem
data_stream.dataset: system.filesystem

Expand Down Expand Up @@ -112,6 +112,8 @@ inputs:
# logs: false
# # enables metrics monitoring
# metrics: false
# # metrics_period defines how frequent we should sample monitoring metrics. Default is 60 seconds.
# metrics_period: 60s
# # exposes /debug/pprof/ endpoints
# # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
# pprof.enabled: false
Expand All @@ -127,7 +129,7 @@ inputs:
# # The possible values for `failon` are:
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
# http:
# # enables http endpoint
Expand Down
12 changes: 7 additions & 5 deletions elastic-agent.docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,17 @@ inputs:
data_stream.namespace: default
use_output: default
streams:
- metricsets:
- metricsets:
- cpu
# Dataset name must conform to the naming conventions for Elasticsearch indices, cannot contain dashes (-), and cannot exceed 100 bytes
data_stream.dataset: system.cpu
- metricsets:
- metricsets:
- memory
data_stream.dataset: system.memory
- metricsets:
- metricsets:
- network
data_stream.dataset: system.network
- metricsets:
- metricsets:
- filesystem
data_stream.dataset: system.filesystem

Expand Down Expand Up @@ -112,6 +112,8 @@ inputs:
# logs: false
# # enables metrics monitoring
# metrics: false
# # metrics_period defines how frequent we should sample monitoring metrics. Default is 60 seconds.
# metrics_period: 60s
# # exposes /debug/pprof/ endpoints
# # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
# pprof.enabled: false
Expand All @@ -127,7 +129,7 @@ inputs:
# # The possible values for `failon` are:
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
# http:
# # enables http endpoint
Expand Down
4 changes: 3 additions & 1 deletion elastic-agent.reference.yml
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ inputs:
# logs: false
# # enables metrics monitoring
# metrics: false
# # metrics_period defines how frequent we should sample monitoring metrics. Default is 60 seconds.
# metrics_period: 60s
# # exposes /debug/pprof/ endpoints
# # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
# pprof.enabled: false
Expand All @@ -162,7 +164,7 @@ inputs:
# # The possible values for `failon` are:
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
# http:
# # enables http endpoint
Expand Down
4 changes: 3 additions & 1 deletion elastic-agent.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ inputs:
# logs: true
# # enables metrics monitoring
# metrics: true
# # metrics_period defines how frequent we should sample monitoring metrics. Default is 60 seconds.
# metrics_period: 60s
# # exposes /debug/pprof/ endpoints
# # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
# pprof.enabled: false
Expand All @@ -83,7 +85,7 @@ inputs:
# # The possible values for `failon` are:
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
# http:
# # enables http endpoint
Expand Down
24 changes: 20 additions & 4 deletions internal/pkg/agent/application/monitoring/v1_monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ const (
agentKey = "agent"
monitoringKey = "monitoring"
useOutputKey = "use_output"
monitoringMetricsPeriodKey = "metrics_period"
monitoringOutput = "monitoring"
defaultMonitoringNamespace = "default"
agentName = "elastic-agent"
Expand All @@ -58,7 +59,7 @@ const (

// metricset execution period used for the monitoring metrics inputs
// we set this to 60s to reduce the load/data volume on the monitoring cluster
metricsCollectionInterval = 60 * time.Second
defaultMetricsCollectionInterval = 60 * time.Second
)

var (
Expand Down Expand Up @@ -129,6 +130,7 @@ func (b *BeatsMonitor) MonitoringConfig(
cfg := make(map[string]interface{})

monitoringOutputName := defaultOutputName
metricsCollectionIntervalString := b.config.C.MetricsPeriod
if agentCfg, found := policy[agentKey]; found {
// The agent section is required for feature flags
cfg[agentKey] = agentCfg
Expand All @@ -143,6 +145,12 @@ func (b *BeatsMonitor) MonitoringConfig(
monitoringOutputName = useStr
}
}

if metricsPeriod, found := monitoringMap[monitoringMetricsPeriodKey]; found {
if metricsPeriodStr, ok := metricsPeriod.(string); ok {
metricsCollectionIntervalString = metricsPeriodStr
}
}
}
}
}
Expand All @@ -165,7 +173,7 @@ func (b *BeatsMonitor) MonitoringConfig(
}

if b.config.C.MonitorMetrics {
if err := b.injectMetricsInput(cfg, componentIDToBinary, components, componentIDPidMap); err != nil {
if err := b.injectMetricsInput(cfg, componentIDToBinary, components, componentIDPidMap, metricsCollectionIntervalString); err != nil {
return nil, errors.New(err, "failed to inject monitoring output")
}
}
Expand Down Expand Up @@ -542,8 +550,16 @@ func (b *BeatsMonitor) monitoringNamespace() string {
}

// injectMetricsInput injects monitoring config for agent monitoring to the `cfg` object.
func (b *BeatsMonitor) injectMetricsInput(cfg map[string]interface{}, componentIDToBinary map[string]string, componentList []component.Component, existingStateServicePids map[string]uint64) error {
metricsCollectionIntervalString := metricsCollectionInterval.String()
func (b *BeatsMonitor) injectMetricsInput(
cfg map[string]interface{},
componentIDToBinary map[string]string,
componentList []component.Component,
existingStateServicePids map[string]uint64,
metricsCollectionIntervalString string,
) error {
if metricsCollectionIntervalString == "" {
metricsCollectionIntervalString = defaultMetricsCollectionInterval.String()
}
monitoringNamespace := b.monitoringNamespace()
fixedAgentName := strings.ReplaceAll(agentName, "-", "_")
beatsStreams := make([]interface{}, 0, len(componentIDToBinary))
Expand Down
Loading

0 comments on commit 709f03e

Please sign in to comment.