From 554ebd647b2b6b12e1ea358831d97826efd605b3 Mon Sep 17 00:00:00 2001
From: Yuriy Tseretyan <yuriy.tseretyan@grafana.com>
Date: Tue, 12 Jul 2022 16:51:32 -0400
Subject: [PATCH] Alerting: Refactor Evaluator (#51673)

* AlertRule to return condition
* update ConditionEval to not return an error because it's always nil
* make getExprRequest private
* refactor executeCondition to just converter and move execution to the ConditionEval as this makes code more readable.
* log error if results have errors
* change signature of evaluate function to not return an error
---
 pkg/services/ngalert/api/api_testing.go      |  5 +-
 pkg/services/ngalert/api/api_testing_test.go |  4 +-
 pkg/services/ngalert/eval/eval.go            | 48 ++++++++++----------
 pkg/services/ngalert/eval/evaluator_mock.go  | 23 ++++------
 pkg/services/ngalert/models/alert_rule.go    |  8 ++++
 pkg/services/ngalert/schedule/schedule.go    | 22 ++++-----
 6 files changed, 51 insertions(+), 59 deletions(-)

diff --git a/pkg/services/ngalert/api/api_testing.go b/pkg/services/ngalert/api/api_testing.go
index 32fd5e4e1a92d..5ff65e3731c78 100644
--- a/pkg/services/ngalert/api/api_testing.go
+++ b/pkg/services/ngalert/api/api_testing.go
@@ -55,10 +55,7 @@ func (srv TestingApiSrv) RouteTestGrafanaRuleConfig(c *models.ReqContext, body a
 		now = timeNow()
 	}
 
-	evalResults, err := srv.evaluator.ConditionEval(&evalCond, now)
-	if err != nil {
-		return ErrResp(http.StatusBadRequest, err, "Failed to evaluate conditions")
-	}
+	evalResults := srv.evaluator.ConditionEval(evalCond, now)
 
 	frame := evalResults.AsDataFrame()
 	return response.JSONStreaming(http.StatusOK, util.DynMap{
diff --git a/pkg/services/ngalert/api/api_testing_test.go b/pkg/services/ngalert/api/api_testing_test.go
index ecebe2eedab12..f2f33984bcbb5 100644
--- a/pkg/services/ngalert/api/api_testing_test.go
+++ b/pkg/services/ngalert/api/api_testing_test.go
@@ -69,7 +69,7 @@ func TestRouteTestGrafanaRuleConfig(t *testing.T) {
 
 			evaluator := &eval.FakeEvaluator{}
 			var result []eval.Result
-			evaluator.EXPECT().ConditionEval(mock.Anything, mock.Anything).Return(result, nil)
+			evaluator.EXPECT().ConditionEval(mock.Anything, mock.Anything).Return(result)
 
 			srv := createTestingApiSrv(ds, ac, evaluator)
 
@@ -109,7 +109,7 @@ func TestRouteTestGrafanaRuleConfig(t *testing.T) {
 
 			evaluator := &eval.FakeEvaluator{}
 			var result []eval.Result
-			evaluator.EXPECT().ConditionEval(mock.Anything, mock.Anything).Return(result, nil)
+			evaluator.EXPECT().ConditionEval(mock.Anything, mock.Anything).Return(result)
 
 			srv := createTestingApiSrv(ds, ac, evaluator)
 
diff --git a/pkg/services/ngalert/eval/eval.go b/pkg/services/ngalert/eval/eval.go
index c98abf8ff09c9..c5c942a207a7c 100644
--- a/pkg/services/ngalert/eval/eval.go
+++ b/pkg/services/ngalert/eval/eval.go
@@ -28,7 +28,7 @@ import (
 //go:generate mockery --name Evaluator --structname FakeEvaluator --inpackage --filename evaluator_mock.go --with-expecter
 type Evaluator interface {
 	// ConditionEval executes conditions and evaluates the result.
-	ConditionEval(condition *models.Condition, now time.Time) (Results, error)
+	ConditionEval(condition models.Condition, now time.Time) Results
 	// QueriesAndExpressionsEval executes queries and expressions and returns the result.
 	QueriesAndExpressionsEval(orgID int64, data []models.AlertQuery, now time.Time) (*backend.QueryDataResponse, error)
 }
@@ -89,6 +89,15 @@ type ExecutionResults struct {
 // Results is a slice of evaluated alert instances states.
 type Results []Result
 
+func (evalResults Results) HasErrors() bool {
+	for _, r := range evalResults {
+		if r.State == Error {
+			return true
+		}
+	}
+	return false
+}
+
 // Result contains the evaluated State of an alert instance
 // identified by its labels.
 type Result struct {
@@ -153,8 +162,8 @@ type AlertExecCtx struct {
 	Ctx context.Context
 }
 
-// GetExprRequest validates the condition, gets the datasource information and creates an expr.Request from it.
-func GetExprRequest(ctx AlertExecCtx, data []models.AlertQuery, now time.Time, dsCacheService datasources.CacheService, secretsService secrets.Service) (*expr.Request, error) {
+// getExprRequest validates the condition, gets the datasource information and creates an expr.Request from it.
+func getExprRequest(ctx AlertExecCtx, data []models.AlertQuery, now time.Time, dsCacheService datasources.CacheService, secretsService secrets.Service) (*expr.Request, error) {
 	req := &expr.Request{
 		OrgId: ctx.OrgID,
 		Headers: map[string]string{
@@ -166,8 +175,7 @@ func GetExprRequest(ctx AlertExecCtx, data []models.AlertQuery, now time.Time, d
 
 	datasources := make(map[string]*datasources.DataSource, len(data))
 
-	for i := range data {
-		q := data[i]
+	for _, q := range data {
 		model, err := q.GetModel()
 		if err != nil {
 			return nil, fmt.Errorf("failed to get query model: %w", err)
@@ -259,12 +267,7 @@ type NumberValueCapture struct {
 	Value  *float64
 }
 
-func executeCondition(ctx AlertExecCtx, c *models.Condition, now time.Time, exprService *expr.Service, dsCacheService datasources.CacheService, secretsService secrets.Service) ExecutionResults {
-	execResp, err := executeQueriesAndExpressions(ctx, c.Data, now, exprService, dsCacheService, secretsService)
-	if err != nil {
-		return ExecutionResults{Error: err}
-	}
-
+func queryDataResponseToExecutionResults(c models.Condition, execResp *backend.QueryDataResponse) ExecutionResults {
 	// eval captures for the '__value_string__' annotation and the Value property of the API response.
 	captures := make([]NumberValueCapture, 0, len(execResp.Responses))
 	captureVal := func(refID string, labels data.Labels, value *float64) {
@@ -356,7 +359,7 @@ func executeQueriesAndExpressions(ctx AlertExecCtx, data []models.AlertQuery, no
 		}
 	}()
 
-	queryDataReq, err := GetExprRequest(ctx, data, now, dsCacheService, secretsService)
+	queryDataReq, err := getExprRequest(ctx, data, now, dsCacheService, secretsService)
 	if err != nil {
 		return nil, err
 	}
@@ -564,8 +567,6 @@ func (evalResults Results) AsDataFrame() data.Frame {
 		labelColumns = append(labelColumns, k)
 	}
 
-	labelColumns = sort.StringSlice(labelColumns)
-
 	frame := data.NewFrame("evaluation results")
 	for _, lKey := range labelColumns {
 		frame.Fields = append(frame.Fields, data.NewField(lKey, nil, make([]string, fieldLen)))
@@ -591,16 +592,15 @@ func (evalResults Results) AsDataFrame() data.Frame {
 }
 
 // ConditionEval executes conditions and evaluates the result.
-func (e *evaluatorImpl) ConditionEval(condition *models.Condition, now time.Time) (Results, error) {
-	alertCtx, cancelFn := context.WithTimeout(context.Background(), e.cfg.UnifiedAlerting.EvaluationTimeout)
-	defer cancelFn()
-
-	alertExecCtx := AlertExecCtx{OrgID: condition.OrgID, Ctx: alertCtx, ExpressionsEnabled: e.cfg.ExpressionsEnabled, Log: e.log}
-
-	execResult := executeCondition(alertExecCtx, condition, now, e.expressionService, e.dataSourceCache, e.secretsService)
-
-	evalResults := evaluateExecutionResult(execResult, now)
-	return evalResults, nil
+func (e *evaluatorImpl) ConditionEval(condition models.Condition, now time.Time) Results {
+	execResp, err := e.QueriesAndExpressionsEval(condition.OrgID, condition.Data, now)
+	var execResults ExecutionResults
+	if err != nil {
+		execResults = ExecutionResults{Error: err}
+	} else {
+		execResults = queryDataResponseToExecutionResults(condition, execResp)
+	}
+	return evaluateExecutionResult(execResults, now)
 }
 
 // QueriesAndExpressionsEval executes queries and expressions and returns the result.
diff --git a/pkg/services/ngalert/eval/evaluator_mock.go b/pkg/services/ngalert/eval/evaluator_mock.go
index ed3f46b7f2a4a..94e6064d11b07 100644
--- a/pkg/services/ngalert/eval/evaluator_mock.go
+++ b/pkg/services/ngalert/eval/evaluator_mock.go
@@ -25,11 +25,11 @@ func (_m *FakeEvaluator) EXPECT() *FakeEvaluator_Expecter {
 }
 
 // ConditionEval provides a mock function with given fields: condition, now
-func (_m *FakeEvaluator) ConditionEval(condition *models.Condition, now time.Time) (Results, error) {
+func (_m *FakeEvaluator) ConditionEval(condition models.Condition, now time.Time) Results {
 	ret := _m.Called(condition, now)
 
 	var r0 Results
-	if rf, ok := ret.Get(0).(func(*models.Condition, time.Time) Results); ok {
+	if rf, ok := ret.Get(0).(func(models.Condition, time.Time) Results); ok {
 		r0 = rf(condition, now)
 	} else {
 		if ret.Get(0) != nil {
@@ -37,14 +37,7 @@ func (_m *FakeEvaluator) ConditionEval(condition *models.Condition, now time.Tim
 		}
 	}
 
-	var r1 error
-	if rf, ok := ret.Get(1).(func(*models.Condition, time.Time) error); ok {
-		r1 = rf(condition, now)
-	} else {
-		r1 = ret.Error(1)
-	}
-
-	return r0, r1
+	return r0
 }
 
 // FakeEvaluator_ConditionEval_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'ConditionEval'
@@ -53,21 +46,21 @@ type FakeEvaluator_ConditionEval_Call struct {
 }
 
 // ConditionEval is a helper method to define mock.On call
-//  - condition *models.Condition
+//  - condition models.Condition
 //  - now time.Time
 func (_e *FakeEvaluator_Expecter) ConditionEval(condition interface{}, now interface{}) *FakeEvaluator_ConditionEval_Call {
 	return &FakeEvaluator_ConditionEval_Call{Call: _e.mock.On("ConditionEval", condition, now)}
 }
 
-func (_c *FakeEvaluator_ConditionEval_Call) Run(run func(condition *models.Condition, now time.Time)) *FakeEvaluator_ConditionEval_Call {
+func (_c *FakeEvaluator_ConditionEval_Call) Run(run func(condition models.Condition, now time.Time)) *FakeEvaluator_ConditionEval_Call {
 	_c.Call.Run(func(args mock.Arguments) {
-		run(args[0].(*models.Condition), args[1].(time.Time))
+		run(args[0].(models.Condition), args[1].(time.Time))
 	})
 	return _c
 }
 
-func (_c *FakeEvaluator_ConditionEval_Call) Return(_a0 Results, _a1 error) *FakeEvaluator_ConditionEval_Call {
-	_c.Call.Return(_a0, _a1)
+func (_c *FakeEvaluator_ConditionEval_Call) Return(_a0 Results) *FakeEvaluator_ConditionEval_Call {
+	_c.Call.Return(_a0)
 	return _c
 }
 
diff --git a/pkg/services/ngalert/models/alert_rule.go b/pkg/services/ngalert/models/alert_rule.go
index 92d02b1a0e69d..0bbf8359e09a8 100644
--- a/pkg/services/ngalert/models/alert_rule.go
+++ b/pkg/services/ngalert/models/alert_rule.go
@@ -170,6 +170,14 @@ func (alertRule *AlertRule) GetLabels(opts ...LabelOption) map[string]string {
 	return labels
 }
 
+func (alertRule *AlertRule) GetEvalCondition() Condition {
+	return Condition{
+		Condition: alertRule.Condition,
+		OrgID:     alertRule.OrgID,
+		Data:      alertRule.Data,
+	}
+}
+
 // Diff calculates diff between two alert rules. Returns nil if two rules are equal. Otherwise, returns cmputil.DiffReport
 func (alertRule *AlertRule) Diff(rule *AlertRule, ignore ...string) cmputil.DiffReport {
 	var reporter cmputil.DiffReporter
diff --git a/pkg/services/ngalert/schedule/schedule.go b/pkg/services/ngalert/schedule/schedule.go
index 20ec97aef0bbc..8d4fbf9874384 100644
--- a/pkg/services/ngalert/schedule/schedule.go
+++ b/pkg/services/ngalert/schedule/schedule.go
@@ -388,32 +388,25 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 		return q.Result, nil
 	}
 
-	evaluate := func(ctx context.Context, r *ngmodels.AlertRule, attempt int64, e *evaluation) error {
+	evaluate := func(ctx context.Context, r *ngmodels.AlertRule, attempt int64, e *evaluation) {
 		logger := logger.New("version", r.Version, "attempt", attempt, "now", e.scheduledAt)
 		start := sch.clock.Now()
 
-		condition := ngmodels.Condition{
-			Condition: r.Condition,
-			OrgID:     r.OrgID,
-			Data:      r.Data,
-		}
-		results, err := sch.evaluator.ConditionEval(&condition, e.scheduledAt)
+		results := sch.evaluator.ConditionEval(r.GetEvalCondition(), e.scheduledAt)
 		dur := sch.clock.Now().Sub(start)
 		evalTotal.Inc()
 		evalDuration.Observe(dur.Seconds())
-		if err != nil {
+		if results.HasErrors() {
 			evalTotalFailures.Inc()
-			// consider saving alert instance on error
-			logger.Error("failed to evaluate alert rule", "duration", dur, "err", err)
-			return err
+			logger.Error("failed to evaluate alert rule", "results", results, "duration", dur)
+		} else {
+			logger.Debug("alert rule evaluated", "results", results, "duration", dur)
 		}
-		logger.Debug("alert rule evaluated", "results", results, "duration", dur)
 
 		processedStates := sch.stateManager.ProcessEvalResults(ctx, e.scheduledAt, r, results)
 		sch.saveAlertStates(ctx, processedStates)
 		alerts := FromAlertStateToPostableAlerts(processedStates, sch.stateManager, sch.appURL)
 		sch.alertsSender.Send(key, alerts)
-		return nil
 	}
 
 	retryIfError := func(f func(attempt int64) error) error {
@@ -475,7 +468,8 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 						currentRule = newRule
 						logger.Debug("new alert rule version fetched", "title", newRule.Title, "version", newRule.Version)
 					}
-					return evaluate(grafanaCtx, currentRule, attempt, ctx)
+					evaluate(grafanaCtx, currentRule, attempt, ctx)
+					return nil
 				})
 				if err != nil {
 					logger.Error("evaluation failed after all retries", "err", err)