Metricbeat: add configurable failure threshold before reporting streams as degraded (elastic#41570)

* Metricbeat: add configurable failure threshold before reporting streams as degraded

With this change it is possible to configure a threshold for the number of consecutive errors that may occur while fetching metrics for a given stream before the stream is marked as DEGRADED.
To configure such a threshold, add "failure_threshold": <n> to a module configuration block.
Depending on the value of <n>, the threshold behaves as follows:

    n == 0: status reporting for the stream is disabled; the stream will never become DEGRADED, no matter how many errors are encountered while fetching metrics
    n == 1 or failure_threshold not specified: backward-compatible behavior; the stream becomes DEGRADED at the first error encountered
    n > 1: the stream becomes DEGRADED once n consecutive errors have been encountered

When a fetch operation completes without errors, the consecutive-error counter is reset and the stream is set back to HEALTHY.
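
As an illustration, here is a minimal, self-contained Go sketch of the decision rule described above; the helper name shouldDegrade is made up for this example and is not part of the Beats code, but the comparison mirrors the threshold semantics (n == 0 disables reporting, otherwise the stream degrades once n consecutive errors accumulate):

    package main

    import "fmt"

    // shouldDegrade reports whether a stream should be marked DEGRADED, given the
    // number of consecutive fetch errors and the configured failure_threshold.
    // A threshold of 0 disables degraded reporting entirely.
    func shouldDegrade(consecutiveFailures, failureThreshold uint) bool {
        return failureThreshold > 0 && consecutiveFailures >= failureThreshold
    }

    func main() {
        fmt.Println(shouldDegrade(7, 0)) // false: reporting disabled (n == 0)
        fmt.Println(shouldDegrade(1, 1)) // true: backward-compatible default (n == 1)
        fmt.Println(shouldDegrade(2, 3)) // false: still below the threshold
        fmt.Println(shouldDegrade(3, 3)) // true: n consecutive errors reached
    }

Because a successful fetch resets the consecutive-error counter, only an uninterrupted run of errors can cross the threshold.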
pchila authored Nov 19, 2024
1 parent b219763 commit f84c05b
Showing 2 changed files with 636 additions and 44 deletions.
113 changes: 69 additions & 44 deletions metricbeat/mb/module/wrapper.go
@@ -36,11 +36,15 @@ import (
 	"github.com/elastic/elastic-agent-libs/testing"
 )
 
-// Expvar metric names.
 const (
-	successesKey = "success"
-	failuresKey  = "failures"
-	eventsKey    = "events"
+	// Expvar metric names.
+	successesKey           = "success"
+	failuresKey            = "failures"
+	eventsKey              = "events"
+	consecutiveFailuresKey = "consecutive_failures"
+
+	// Failure threshold config key
+	failureThresholdKey = "failure_threshold"
 )
 
 var (
@@ -70,16 +74,18 @@ type metricSetWrapper struct {
 	module *Wrapper // Parent Module.
 	stats  *stats   // stats for this MetricSet.
 
-	periodic bool // Set to true if this metricset is a periodic fetcher
+	periodic         bool // Set to true if this metricset is a periodic fetcher
+	failureThreshold uint // threshold of consecutive errors needed to set the stream as degraded
 }
 
 // stats bundles common metricset stats.
 type stats struct {
-	key      string          // full stats key
-	ref      uint32          // number of modules/metricsets reusing stats instance
-	success  *monitoring.Int // Total success events.
-	failures *monitoring.Int // Total error events.
-	events   *monitoring.Int // Total events published.
+	key                 string           // full stats key
+	ref                 uint32           // number of modules/metricsets reusing stats instance
+	success             *monitoring.Int  // Total success events.
+	failures            *monitoring.Int  // Total error events.
+	events              *monitoring.Int  // Total events published.
+	consecutiveFailures *monitoring.Uint // Consecutive failures fetching this metricset
 }
 
 // NewWrapper creates a new module and its associated metricsets based on the given configuration.
@@ -106,11 +112,28 @@ func createWrapper(module mb.Module, metricSets []mb.MetricSet, options ...Optio
 		applyOption(wrapper)
 	}
 
+	failureThreshold := uint(1)
+
+	var streamHealthSettings struct {
+		FailureThreshold *uint `config:"failure_threshold"`
+	}
+
+	err := module.UnpackConfig(&streamHealthSettings)
+
+	if err != nil {
+		return nil, fmt.Errorf("unpacking raw config: %w", err)
+	}
+
+	if streamHealthSettings.FailureThreshold != nil {
+		failureThreshold = *streamHealthSettings.FailureThreshold
+	}
+
 	for i, metricSet := range metricSets {
 		wrapper.metricSets[i] = &metricSetWrapper{
-			MetricSet: metricSet,
-			module:    wrapper,
-			stats:     getMetricSetStats(wrapper.Name(), metricSet.Name()),
+			MetricSet:        metricSet,
+			module:           wrapper,
+			stats:            getMetricSetStats(wrapper.Name(), metricSet.Name()),
+			failureThreshold: failureThreshold,
 		}
 	}
 	return wrapper, nil
@@ -254,35 +277,11 @@ func (msw *metricSetWrapper) fetch(ctx context.Context, reporter reporter) {
 	case mb.ReportingMetricSetV2Error:
 		reporter.StartFetchTimer()
 		err := fetcher.Fetch(reporter.V2())
-		if err != nil {
-			reporter.V2().Error(err)
-			if errors.As(err, &mb.PartialMetricsError{}) {
-				// mark module as running if metrics are partially available and display the error message
-				msw.module.UpdateStatus(status.Running, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
-			} else {
-				// mark it as degraded for any other issue encountered
-				msw.module.UpdateStatus(status.Degraded, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
-			}
-			logp.Err("Error fetching data for metricset %s.%s: %s", msw.module.Name(), msw.Name(), err)
-		} else {
-			msw.module.UpdateStatus(status.Running, "")
-		}
+		msw.handleFetchError(err, reporter.V2())
 	case mb.ReportingMetricSetV2WithContext:
 		reporter.StartFetchTimer()
 		err := fetcher.Fetch(ctx, reporter.V2())
-		if err != nil {
-			reporter.V2().Error(err)
-			if errors.As(err, &mb.PartialMetricsError{}) {
-				// mark module as running if metrics are partially available and display the error message
-				msw.module.UpdateStatus(status.Running, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
-			} else {
-				// mark it as degraded for any other issue encountered
-				msw.module.UpdateStatus(status.Degraded, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
-			}
-			logp.Err("Error fetching data for metricset %s.%s: %s", msw.module.Name(), msw.Name(), err)
-		} else {
-			msw.module.UpdateStatus(status.Running, "")
-		}
+		msw.handleFetchError(err, reporter.V2())
 	default:
 		panic(fmt.Sprintf("unexpected fetcher type for %v", msw))
 	}
@@ -311,6 +310,31 @@ func (msw *metricSetWrapper) Test(d testing.Driver) {
 	})
 }
 
+func (msw *metricSetWrapper) handleFetchError(err error, reporter mb.PushReporterV2) {
+	switch {
+	case err == nil:
+		msw.stats.consecutiveFailures.Set(0)
+		msw.module.UpdateStatus(status.Running, "")
+
+	case errors.As(err, &mb.PartialMetricsError{}):
+		reporter.Error(err)
+		msw.stats.consecutiveFailures.Set(0)
+		// mark module as running if metrics are partially available and display the error message
+		msw.module.UpdateStatus(status.Running, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
+		logp.Err("Error fetching data for metricset %s.%s: %s", msw.module.Name(), msw.Name(), err)
+
+	default:
+		reporter.Error(err)
+		msw.stats.consecutiveFailures.Inc()
+		if msw.failureThreshold > 0 && msw.stats.consecutiveFailures != nil && uint(msw.stats.consecutiveFailures.Get()) >= msw.failureThreshold {
+			// mark it as degraded for any other issue encountered
+			msw.module.UpdateStatus(status.Degraded, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
+		}
+		logp.Err("Error fetching data for metricset %s.%s: %s", msw.module.Name(), msw.Name(), err)
+
+	}
+}
+
 type reporter interface {
 	StartFetchTimer()
 	V1() mb.PushReporter //nolint:staticcheck // PushReporter is deprecated but not removed
@@ -437,11 +461,12 @@ func getMetricSetStats(module, name string) *stats {
 
 	reg := monitoring.Default.NewRegistry(key)
 	s := &stats{
-		key:      key,
-		ref:      1,
-		success:  monitoring.NewInt(reg, successesKey),
-		failures: monitoring.NewInt(reg, failuresKey),
-		events:   monitoring.NewInt(reg, eventsKey),
+		key:                 key,
+		ref:                 1,
+		success:             monitoring.NewInt(reg, successesKey),
+		failures:            monitoring.NewInt(reg, failuresKey),
+		events:              monitoring.NewInt(reg, eventsKey),
+		consecutiveFailures: monitoring.NewUint(reg, consecutiveFailuresKey),
 	}
 
 	fetches[key] = s
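One detail worth noting in the createWrapper hunk above: failure_threshold is unpacked into a *uint so that an absent key (which falls back to the legacy default of 1) can be distinguished from an explicit 0 (which disables degraded reporting). Below is a minimal standalone sketch of that pointer-default pattern; it uses encoding/json purely as a stand-in for module.UnpackConfig, so the json tags and helper names are illustrative only:

    package main

    import (
        "encoding/json"
        "fmt"
    )

    // streamHealthSettings mirrors the anonymous struct in createWrapper: a *uint
    // lets us tell "key not present" (nil) apart from an explicit 0.
    type streamHealthSettings struct {
        FailureThreshold *uint `json:"failure_threshold"`
    }

    // resolveThreshold applies the same defaulting rule as createWrapper:
    // missing key -> 1 (legacy behavior), otherwise use the configured value.
    func resolveThreshold(raw []byte) (uint, error) {
        threshold := uint(1)
        var s streamHealthSettings
        if err := json.Unmarshal(raw, &s); err != nil {
            return 0, fmt.Errorf("unpacking raw config: %w", err)
        }
        if s.FailureThreshold != nil {
            threshold = *s.FailureThreshold
        }
        return threshold, nil
    }

    func main() {
        for _, cfg := range []string{`{}`, `{"failure_threshold": 0}`, `{"failure_threshold": 5}`} {
            t, err := resolveThreshold([]byte(cfg))
            fmt.Println(cfg, "->", t, err)
        }
    }

Running the sketch prints 1 for {}, 0 for an explicit zero, and 5 for an explicit five, matching the three behaviors listed in the commit message.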