client: emit optional telemetry from prerun and prestart hooks. (#24556)

The Nomad client can now optionally emit telemetry data from the prerun and prestart hooks. This allows operators to monitor and alert on failures and time taken to complete. The new datapoints are: - nomad.client.alloc_hook.prerun.success (counter) - nomad.client.alloc_hook.prerun.failed (counter) - nomad.client.alloc_hook.prerun.elapsed (sample) - nomad.client.task_hook.prestart.success (counter) - nomad.client.task_hook.prestart.failed (counter) - nomad.client.task_hook.prestart.elapsed (sample) The hook execution time is useful to Nomad engineering and will help optimize code where possible and understand job specification impacts on hook performance. Currently only the PreRun and PreStart hooks have telemetry enabled, so we limit the number of new metrics being produced.
hashicorp · Dec 12, 2024 · 7d48aa2 · 7d48aa2
1 parent 86bc7ed
commit 7d48aa2
Show file tree

Hide file tree

Showing 23 changed files with 515 additions and 57 deletions.
diff --git a/.changelog/24556.txt b/.changelog/24556.txt
@@ -0,0 +1,3 @@
+```release-note:improvement
+client: Emit telemetry from prerun and prestart hooks for monitoring and alerting
+```
diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go
@@ -10,9 +10,11 @@ import (
 	"sync"
 	"time"
 
+	"github.com/armon/go-metrics"
 	log "github.com/hashicorp/go-hclog"
 	multierror "github.com/hashicorp/go-multierror"
 	"github.com/hashicorp/nomad/client/allocdir"
+	"github.com/hashicorp/nomad/client/allocrunner/hookstats"
 	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
 	"github.com/hashicorp/nomad/client/allocrunner/state"
 	"github.com/hashicorp/nomad/client/allocrunner/tasklifecycle"
@@ -44,6 +46,7 @@ import (
 
 // allocRunner is used to run all the tasks in a given allocation
 type allocRunner struct {
+
 	// id is the ID of the allocation. Can be accessed without a lock
 	id string
 
@@ -53,6 +56,15 @@ type allocRunner struct {
 	// clientConfig is the client configuration block.
 	clientConfig *config.Config
 
+	// clientBaseLabels are the base metric labels generated by the client.
+	// These can be used by processes which emit metrics that want to include
+	// these labels that include node_id, node_class, and node_pool.
+	//
+	// They could be generated using the clientConfig, but the kv pairs will
+	// not change unless the client process is fully restarted, so passing them
+	// along avoids extra work.
+	clientBaseLabels []metrics.Label
+
 	// stateUpdater is used to emit updated alloc state
 	stateUpdater cinterfaces.AllocStateHandler
 
@@ -87,6 +99,10 @@ type allocRunner struct {
 	// vaultClientFunc is used to get the client used to manage Vault tokens
 	vaultClientFunc vaultclient.VaultClientFunc
 
+	// hookStatsHandler is used by certain hooks to emit telemetry data, if the
+	// operator has not disabled this functionality.
+	hookStatsHandler interfaces.HookStatsHandler
+
 	// waitCh is closed when the Run loop has exited
 	waitCh chan struct{}
 
@@ -230,6 +246,7 @@ func NewAllocRunner(config *config.AllocRunnerConfig) (interfaces.AllocRunner, e
 		id:                       alloc.ID,
 		alloc:                    alloc,
 		clientConfig:             config.ClientConfig,
+		clientBaseLabels:         config.BaseLabels,
 		consulServicesHandler:    config.ConsulServices,
 		consulProxiesClientFunc:  config.ConsulProxiesFunc,
 		sidsClient:               config.ConsulSI,
@@ -269,6 +286,8 @@ func NewAllocRunner(config *config.AllocRunnerConfig) (interfaces.AllocRunner, e
 	// Create alloc broadcaster
 	ar.allocBroadcaster = cstructs.NewAllocBroadcaster(ar.logger)
 
+	ar.setHookStatsHandler(alloc.Namespace)
+
 	// Create alloc dir
 	ar.allocDir = allocdir.NewAllocDir(
 		ar.logger,
@@ -315,6 +334,7 @@ func (ar *allocRunner) initTaskRunners(tasks []*structs.Task) error {
 		trConfig := &taskrunner.Config{
 			Alloc:               ar.alloc,
 			ClientConfig:        ar.clientConfig,
+			ClientBaseLabels:    ar.clientBaseLabels,
 			Task:                task,
 			TaskDir:             ar.allocDir.NewTaskDir(task),
 			Logger:              ar.logger,
@@ -1549,3 +1569,23 @@ func (ar *allocRunner) GetCSIVolumes() (map[string]*state.CSIVolumeStub, error)
 	}
 	return allocVols.CSIVolumes, nil
 }
+
+// setHookStatsHandler builds the hook stats handler based on whether the
+// operator has disabled this or not.
+//
+// The non-noop implementation is built using the base client labels and
+// the allocation namespace. The caller will add "hook_name". It would be
+// possible to add more labels, however, this would cause the metric
+// cardinality to increase dramatically without much benefit.
+//
+// This could be inline within the NewAllocRunner function, but having a
+// separate function makes testing easier.
+func (ar *allocRunner) setHookStatsHandler(ns string) {
+	if ar.clientConfig.DisableAllocationHookMetrics {
+		ar.hookStatsHandler = hookstats.NewNoOpHandler()
+	} else {
+		labels := ar.clientBaseLabels
+		labels = append(labels, metrics.Label{Name: "namespace", Value: ns})
+		ar.hookStatsHandler = hookstats.NewHandler(labels, "alloc_hook")
+	}
+}
diff --git a/client/allocrunner/alloc_runner_hooks.go b/client/allocrunner/alloc_runner_hooks.go
@@ -189,7 +189,17 @@ func (ar *allocRunner) prerun() error {
 			ar.logger.Trace("running pre-run hook", "name", name, "start", start)
 		}
 
-		if err := pre.Prerun(); err != nil {
+		// If the operator has disabled hook metrics, then don't call the time
+		// function to save 30ns per hook.
+		var hookExecutionStart time.Time
+
+		if !ar.clientConfig.DisableAllocationHookMetrics {
+			hookExecutionStart = time.Now()
+		}
+
+		err := pre.Prerun()
+		ar.hookStatsHandler.Emit(hookExecutionStart, name, "prerun", err)
+		if err != nil {
 			return fmt.Errorf("pre-run hook %q failed: %v", name, err)
 		}
 

diff --git a/client/allocrunner/alloc_runner_test.go b/client/allocrunner/alloc_runner_test.go
@@ -12,15 +12,18 @@ import (
 	"testing"
 	"time"
 
+	"github.com/armon/go-metrics"
 	"github.com/hashicorp/consul/api"
 	multierror "github.com/hashicorp/go-multierror"
 	"github.com/hashicorp/nomad/ci"
 	"github.com/hashicorp/nomad/client/allochealth"
+	"github.com/hashicorp/nomad/client/allocrunner/hookstats"
 	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
 	arstate "github.com/hashicorp/nomad/client/allocrunner/state"
 	"github.com/hashicorp/nomad/client/allocrunner/tasklifecycle"
 	"github.com/hashicorp/nomad/client/allocrunner/taskrunner"
 	"github.com/hashicorp/nomad/client/allocwatcher"
+	client "github.com/hashicorp/nomad/client/config"
 	"github.com/hashicorp/nomad/client/lib/proclib"
 	"github.com/hashicorp/nomad/client/serviceregistration"
 	regMock "github.com/hashicorp/nomad/client/serviceregistration/mock"
@@ -2531,3 +2534,31 @@ func TestAllocRunner_GetUpdatePriority(t *testing.T) {
 	calloc = ar.clientAlloc(map[string]*structs.TaskState{})
 	must.Eq(t, cstructs.AllocUpdatePriorityUrgent, ar.GetUpdatePriority(calloc))
 }
+
+func TestAllocRunner_setHookStatsHandler(t *testing.T) {
+	ci.Parallel(t)
+
+	// Create an allocation runner that doesn't have any configuration, which
+	// means the operator has not disabled hook metrics.
+	baseAllocRunner := &allocRunner{
+		clientConfig:     &client.Config{},
+		clientBaseLabels: []metrics.Label{},
+	}
+
+	baseAllocRunner.setHookStatsHandler("platform")
+	handler, ok := baseAllocRunner.hookStatsHandler.(*hookstats.Handler)
+	must.True(t, ok)
+	must.NotNil(t, handler)
+
+	// Create a new allocation runner but explicitly disable hook metrics
+	// collection.
+	baseAllocRunner = &allocRunner{
+		clientConfig:     &client.Config{DisableAllocationHookMetrics: true},
+		clientBaseLabels: []metrics.Label{},
+	}
+
+	baseAllocRunner.setHookStatsHandler("platform")
+	noopHandler, ok := baseAllocRunner.hookStatsHandler.(*hookstats.NoOpHandler)
+	must.True(t, ok)
+	must.NotNil(t, noopHandler)
+}
diff --git a/client/allocrunner/hookstats/hookstats.go b/client/allocrunner/hookstats/hookstats.go
@@ -0,0 +1,53 @@
+// Copyright (c) HashiCorp, Inc.
+// SPDX-License-Identifier: BUSL-1.1
+
+package hookstats
+
+import (
+	"time"
+
+	"github.com/armon/go-metrics"
+	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
+)
+
+// Handler implements interfaces.HookStatsHandler and is used when the operator
+// has not disabled hook metrics.
+type Handler struct {
+	baseLabels []metrics.Label
+	runnerType string
+}
+
+// NewHandler creates a new hook stats handler to be used for emitting hook
+// stats for operator alerting and performance identification. The base labels
+// should be passed from the client set of labels and the runner type indicates
+// if the hooks are run from the alloc or task runner.
+func NewHandler(base []metrics.Label, runnerType string) interfaces.HookStatsHandler {
+	return &Handler{
+		baseLabels: base,
+		runnerType: runnerType,
+	}
+}
+
+func (h *Handler) Emit(start time.Time, hookName, hookType string, err error) {
+
+	// Add the hook name to the base labels array, so we have a complete set to
+	// add to the metrics. Operators do not want this as part of the metric
+	// name due to cardinality control.
+	labels := h.baseLabels
+	labels = append(labels, metrics.Label{Name: "hook_name", Value: hookName})
+
+	metrics.MeasureSinceWithLabels([]string{"client", h.runnerType, hookType, "elapsed"}, start, labels)
+	if err != nil {
+		metrics.IncrCounterWithLabels([]string{"client", h.runnerType, hookType, "failed"}, 1, labels)
+	} else {
+		metrics.IncrCounterWithLabels([]string{"client", h.runnerType, hookType, "success"}, 1, labels)
+	}
+}
+
+// NoOpHandler implements interfaces.HookStatsHandler and is used when the
+// operator has disabled hook metrics.
+type NoOpHandler struct{}
+
+func NewNoOpHandler() interfaces.HookStatsHandler { return &NoOpHandler{} }
+
+func (n *NoOpHandler) Emit(_ time.Time, _, _ string, _ error) {}
diff --git a/client/allocrunner/hookstats/hookstats_test.go b/client/allocrunner/hookstats/hookstats_test.go
@@ -0,0 +1,113 @@
+// Copyright (c) HashiCorp, Inc.
+// SPDX-License-Identifier: BUSL-1.1
+
+package hookstats
+
+import (
+	"errors"
+	"testing"
+	"time"
+
+	"github.com/armon/go-metrics"
+	"github.com/hashicorp/nomad/ci"
+	"github.com/shoenig/test/must"
+)
+
+func TestHandler(t *testing.T) {
+	ci.Parallel(t)
+
+	// Generate base labels that represent what an operator would see and then
+	// create out new handler to interact with.
+	baseLabels := []metrics.Label{
+		{Name: "datacenter", Value: "dc1"},
+		{Name: "node_class", Value: "none"},
+		{Name: "node_pool", Value: "default"},
+		{Name: "namespace", Value: "default"},
+		{Name: "host", Value: "client-5d3c"},
+		{Name: "node_id", Value: "35db24e7-0a55-80d2-2279-e022c37cc591"},
+	}
+	newHandler := NewHandler(baseLabels, "test_hook")
+
+	// The data stored is within the in-memory sink as map entries, so we need
+	// to know the key names to pull this out correctly. Build those now.
+	var metricKeySuffix, sampleName, counterSuccessName, counterFailureName string
+
+	for _, label := range baseLabels {
+		metricKeySuffix += ";" + label.Name + "=" + label.Value
+	}
+
+	metricKeySuffix += ";" + "hook_name=test_hook_name"
+	sampleName = "nomad_test.client.test_hook.prerun.elapsed" + metricKeySuffix
+	counterSuccessName = "nomad_test.client.test_hook.prerun.success" + metricKeySuffix
+	counterFailureName = "nomad_test.client.test_hook.prerun.failed" + metricKeySuffix
+
+	// Create an in-memory sink and global, so we can actually look at and test
+	// the metrics we emit.
+	inMemorySink := metrics.NewInmemSink(10*time.Millisecond, 50*time.Millisecond)
+
+	_, err := metrics.NewGlobal(metrics.DefaultConfig("nomad_test"), inMemorySink)
+	must.NoError(t, err)
+
+	// Emit hook related metrics where the supplied error is nil and check that
+	// the data is as expected.
+	newHandler.Emit(time.Now(), "test_hook_name", "prerun", nil)
+
+	sinkData := inMemorySink.Data()
+	must.Len(t, 1, sinkData)
+	must.MapContainsKey(t, sinkData[0].Counters, counterSuccessName)
+	must.MapContainsKey(t, sinkData[0].Samples, sampleName)
+
+	successCounter := sinkData[0].Counters[counterSuccessName]
+	must.Eq(t, 1, successCounter.Count)
+	must.Eq(t, 1, successCounter.Sum)
+
+	sample1 := sinkData[0].Samples[sampleName]
+	must.Eq(t, 1, sample1.Count)
+	must.True(t, sample1.Sum > 0)
+
+	// Create a new in-memory sink and global collector to ensure we don't have
+	// leftovers from the previous test.
+	inMemorySink = metrics.NewInmemSink(10*time.Millisecond, 50*time.Millisecond)
+
+	_, err = metrics.NewGlobal(metrics.DefaultConfig("nomad_test"), inMemorySink)
+	must.NoError(t, err)
+
+	// Emit a hook related metrics where the supplied error is non-nil and
+	// check that the data is as expected.
+	newHandler.Emit(time.Now(), "test_hook_name", "prerun", errors.New("test error"))
+
+	sinkData = inMemorySink.Data()
+	must.Len(t, 1, sinkData)
+	must.MapContainsKey(t, sinkData[0].Counters, counterFailureName)
+	must.MapContainsKey(t, sinkData[0].Samples, sampleName)
+
+	failureCounter := sinkData[0].Counters[counterFailureName]
+	must.Eq(t, 1, failureCounter.Count)
+	must.Eq(t, 1, failureCounter.Sum)
+
+	sample2 := sinkData[0].Samples[sampleName]
+	must.Eq(t, 1, sample2.Count)
+	must.True(t, sample2.Sum > 0)
+}
+
+func TestNoOpHandler(t *testing.T) {
+	ci.Parallel(t)
+
+	newHandler := NewNoOpHandler()
+
+	// Create a new in-memory sink and global collector, so we can test that no
+	// metrics are emitted.
+	inMemorySink := metrics.NewInmemSink(10*time.Millisecond, 50*time.Millisecond)
+
+	_, err := metrics.NewGlobal(metrics.DefaultConfig("nomad_test"), inMemorySink)
+	must.NoError(t, err)
+
+	// Call the function with a non-nil error and check the results of the
+	// in-memory sink.
+	newHandler.Emit(time.Now(), "test_hook_name", "prerun", errors.New("test error"))
+
+	sinkData := inMemorySink.Data()
+	must.Len(t, 1, sinkData)
+	must.MapLen(t, 0, sinkData[0].Counters)
+	must.MapLen(t, 0, sinkData[0].Samples)
+}
diff --git a/client/allocrunner/interfaces/runner.go b/client/allocrunner/interfaces/runner.go
@@ -4,6 +4,8 @@
 package interfaces
 
 import (
+	"time"
+
 	"github.com/hashicorp/nomad/client/allocdir"
 	"github.com/hashicorp/nomad/client/allocrunner/state"
 	"github.com/hashicorp/nomad/client/pluginmanager/csimanager"
@@ -71,3 +73,25 @@ type HookResourceSetter interface {
 	SetCSIMounts(map[string]*csimanager.MountInfo)
 	GetCSIMounts(map[string]*csimanager.MountInfo)
 }
+
+// HookStatsHandler defines the interface used to emit metrics for the alloc
+// and task runner hooks.
+type HookStatsHandler interface {
+
+	// Emit is called once the hook has run, indicating the desired metrics
+	// should be emitted, if configured.
+	//
+	// Args:
+	//  start: The time logged as the hook was triggered. This is used for the
+	//    elapsed time metric.
+	//
+	//  hookName: The name of the hook that was run, as returned typically by
+	//    the Name() hook function.
+	//
+	//  hookType: The type of hook such as "prerun" or "postrun".
+	//
+	//  err: The error returned from the hook execution. The caller should not
+	//    need to check whether this is nil or not before called this function.
+	//
+	Emit(start time.Time, hookName, hookType string, err error)
+}