Alerting: Refactor Evaluator (#51673)

* Add AlertRule.GetEvalCondition to return the rule's evaluation condition.
* Update ConditionEval to not return an error, because it was always nil.
* Make getExprRequest private.
* Refactor executeCondition into a pure converter (queryDataResponseToExecutionResults) and move execution into ConditionEval, as this makes the code more readable.
* Log an error if the results contain errors.
* Change the signature of the evaluate function to not return an error.
2025-02-25 18:55:37 -06:00 · 2022-07-12 16:51:32 -04:00 · 2022-07-12 16:51:32 -04:00 · 554ebd647b
commit 554ebd647b
parent 2d8a91a846
6 changed files with 51 additions and 59 deletions
--- a/pkg/services/ngalert/api/api_testing.go
+++ b/pkg/services/ngalert/api/api_testing.go
@@ -55,10 +55,7 @@ func (srv TestingApiSrv) RouteTestGrafanaRuleConfig(c *models.ReqContext, body a
 		now = timeNow()
 	}

-	evalResults, err := srv.evaluator.ConditionEval(&evalCond, now)
-	if err != nil {
-		return ErrResp(http.StatusBadRequest, err, "Failed to evaluate conditions")
-	}
+	evalResults := srv.evaluator.ConditionEval(evalCond, now)

 	frame := evalResults.AsDataFrame()
 	return response.JSONStreaming(http.StatusOK, util.DynMap{
--- a/pkg/services/ngalert/api/api_testing_test.go
+++ b/pkg/services/ngalert/api/api_testing_test.go
@@ -69,7 +69,7 @@ func TestRouteTestGrafanaRuleConfig(t *testing.T) {

 			evaluator := &eval.FakeEvaluator{}
 			var result []eval.Result
-			evaluator.EXPECT().ConditionEval(mock.Anything, mock.Anything).Return(result, nil)
+			evaluator.EXPECT().ConditionEval(mock.Anything, mock.Anything).Return(result)

 			srv := createTestingApiSrv(ds, ac, evaluator)

@@ -109,7 +109,7 @@ func TestRouteTestGrafanaRuleConfig(t *testing.T) {

 			evaluator := &eval.FakeEvaluator{}
 			var result []eval.Result
-			evaluator.EXPECT().ConditionEval(mock.Anything, mock.Anything).Return(result, nil)
+			evaluator.EXPECT().ConditionEval(mock.Anything, mock.Anything).Return(result)

 			srv := createTestingApiSrv(ds, ac, evaluator)

--- a/pkg/services/ngalert/eval/eval.go
+++ b/pkg/services/ngalert/eval/eval.go
@@ -28,7 +28,7 @@ import (
 //go:generate mockery --name Evaluator --structname FakeEvaluator --inpackage --filename evaluator_mock.go --with-expecter
 type Evaluator interface {
 	// ConditionEval executes conditions and evaluates the result.
-	ConditionEval(condition *models.Condition, now time.Time) (Results, error)
+	ConditionEval(condition models.Condition, now time.Time) Results
 	// QueriesAndExpressionsEval executes queries and expressions and returns the result.
 	QueriesAndExpressionsEval(orgID int64, data []models.AlertQuery, now time.Time) (*backend.QueryDataResponse, error)
 }
@@ -89,6 +89,15 @@ type ExecutionResults struct {
 // Results is a slice of evaluated alert instances states.
 type Results []Result

+func (evalResults Results) HasErrors() bool {
+	for _, r := range evalResults {
+		if r.State == Error {
+			return true
+		}
+	}
+	return false
+}
+
 // Result contains the evaluated State of an alert instance
 // identified by its labels.
 type Result struct {
@@ -153,8 +162,8 @@ type AlertExecCtx struct {
 	Ctx context.Context
 }

-// GetExprRequest validates the condition, gets the datasource information and creates an expr.Request from it.
-func GetExprRequest(ctx AlertExecCtx, data []models.AlertQuery, now time.Time, dsCacheService datasources.CacheService, secretsService secrets.Service) (*expr.Request, error) {
+// getExprRequest validates the condition, gets the datasource information and creates an expr.Request from it.
+func getExprRequest(ctx AlertExecCtx, data []models.AlertQuery, now time.Time, dsCacheService datasources.CacheService, secretsService secrets.Service) (*expr.Request, error) {
 	req := &expr.Request{
 		OrgId: ctx.OrgID,
 		Headers: map[string]string{
@@ -166,8 +175,7 @@ func GetExprRequest(ctx AlertExecCtx, data []models.AlertQuery, now time.Time, d

 	datasources := make(map[string]*datasources.DataSource, len(data))

-	for i := range data {
-		q := data[i]
+	for _, q := range data {
 		model, err := q.GetModel()
 		if err != nil {
 			return nil, fmt.Errorf("failed to get query model: %w", err)
@@ -259,12 +267,7 @@ type NumberValueCapture struct {
 	Value  *float64
 }

-func executeCondition(ctx AlertExecCtx, c *models.Condition, now time.Time, exprService *expr.Service, dsCacheService datasources.CacheService, secretsService secrets.Service) ExecutionResults {
-	execResp, err := executeQueriesAndExpressions(ctx, c.Data, now, exprService, dsCacheService, secretsService)
-	if err != nil {
-		return ExecutionResults{Error: err}
-	}
-
+func queryDataResponseToExecutionResults(c models.Condition, execResp *backend.QueryDataResponse) ExecutionResults {
 	// eval captures for the '__value_string__' annotation and the Value property of the API response.
 	captures := make([]NumberValueCapture, 0, len(execResp.Responses))
 	captureVal := func(refID string, labels data.Labels, value *float64) {
@@ -356,7 +359,7 @@ func executeQueriesAndExpressions(ctx AlertExecCtx, data []models.AlertQuery, no
 		}
 	}()

-	queryDataReq, err := GetExprRequest(ctx, data, now, dsCacheService, secretsService)
+	queryDataReq, err := getExprRequest(ctx, data, now, dsCacheService, secretsService)
 	if err != nil {
 		return nil, err
 	}
@@ -564,8 +567,6 @@ func (evalResults Results) AsDataFrame() data.Frame {
 		labelColumns = append(labelColumns, k)
 	}

-	labelColumns = sort.StringSlice(labelColumns)
-
 	frame := data.NewFrame("evaluation results")
 	for _, lKey := range labelColumns {
 		frame.Fields = append(frame.Fields, data.NewField(lKey, nil, make([]string, fieldLen)))
@@ -591,16 +592,15 @@ func (evalResults Results) AsDataFrame() data.Frame {
 }

 // ConditionEval executes conditions and evaluates the result.
-func (e *evaluatorImpl) ConditionEval(condition *models.Condition, now time.Time) (Results, error) {
-	alertCtx, cancelFn := context.WithTimeout(context.Background(), e.cfg.UnifiedAlerting.EvaluationTimeout)
-	defer cancelFn()
-
-	alertExecCtx := AlertExecCtx{OrgID: condition.OrgID, Ctx: alertCtx, ExpressionsEnabled: e.cfg.ExpressionsEnabled, Log: e.log}
-
-	execResult := executeCondition(alertExecCtx, condition, now, e.expressionService, e.dataSourceCache, e.secretsService)
-
-	evalResults := evaluateExecutionResult(execResult, now)
-	return evalResults, nil
+func (e *evaluatorImpl) ConditionEval(condition models.Condition, now time.Time) Results {
+	execResp, err := e.QueriesAndExpressionsEval(condition.OrgID, condition.Data, now)
+	var execResults ExecutionResults
+	if err != nil {
+		execResults = ExecutionResults{Error: err}
+	} else {
+		execResults = queryDataResponseToExecutionResults(condition, execResp)
+	}
+	return evaluateExecutionResult(execResults, now)
 }

 // QueriesAndExpressionsEval executes queries and expressions and returns the result.
--- a/pkg/services/ngalert/eval/evaluator_mock.go
+++ b/pkg/services/ngalert/eval/evaluator_mock.go
@@ -25,11 +25,11 @@ func (_m *FakeEvaluator) EXPECT() *FakeEvaluator_Expecter {
 }

 // ConditionEval provides a mock function with given fields: condition, now
-func (_m *FakeEvaluator) ConditionEval(condition *models.Condition, now time.Time) (Results, error) {
+func (_m *FakeEvaluator) ConditionEval(condition models.Condition, now time.Time) Results {
 	ret := _m.Called(condition, now)

 	var r0 Results
-	if rf, ok := ret.Get(0).(func(*models.Condition, time.Time) Results); ok {
+	if rf, ok := ret.Get(0).(func(models.Condition, time.Time) Results); ok {
 		r0 = rf(condition, now)
 	} else {
 		if ret.Get(0) != nil {
@@ -37,14 +37,7 @@ func (_m *FakeEvaluator) ConditionEval(condition *models.Condition, now time.Tim
 		}
 	}

-	var r1 error
-	if rf, ok := ret.Get(1).(func(*models.Condition, time.Time) error); ok {
-		r1 = rf(condition, now)
-	} else {
-		r1 = ret.Error(1)
-	}
-
-	return r0, r1
+	return r0
 }

 // FakeEvaluator_ConditionEval_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'ConditionEval'
@@ -53,21 +46,21 @@ type FakeEvaluator_ConditionEval_Call struct {
 }

 // ConditionEval is a helper method to define mock.On call
-//  - condition *models.Condition
+//  - condition models.Condition
 //  - now time.Time
 func (_e *FakeEvaluator_Expecter) ConditionEval(condition interface{}, now interface{}) *FakeEvaluator_ConditionEval_Call {
 	return &FakeEvaluator_ConditionEval_Call{Call: _e.mock.On("ConditionEval", condition, now)}
 }

-func (_c *FakeEvaluator_ConditionEval_Call) Run(run func(condition *models.Condition, now time.Time)) *FakeEvaluator_ConditionEval_Call {
+func (_c *FakeEvaluator_ConditionEval_Call) Run(run func(condition models.Condition, now time.Time)) *FakeEvaluator_ConditionEval_Call {
 	_c.Call.Run(func(args mock.Arguments) {
-		run(args[0].(*models.Condition), args[1].(time.Time))
+		run(args[0].(models.Condition), args[1].(time.Time))
 	})
 	return _c
 }

-func (_c *FakeEvaluator_ConditionEval_Call) Return(_a0 Results, _a1 error) *FakeEvaluator_ConditionEval_Call {
-	_c.Call.Return(_a0, _a1)
+func (_c *FakeEvaluator_ConditionEval_Call) Return(_a0 Results) *FakeEvaluator_ConditionEval_Call {
+	_c.Call.Return(_a0)
 	return _c
 }

--- a/pkg/services/ngalert/models/alert_rule.go
+++ b/pkg/services/ngalert/models/alert_rule.go
@@ -170,6 +170,14 @@ func (alertRule *AlertRule) GetLabels(opts ...LabelOption) map[string]string {
 	return labels
 }

+func (alertRule *AlertRule) GetEvalCondition() Condition {
+	return Condition{
+		Condition: alertRule.Condition,
+		OrgID:     alertRule.OrgID,
+		Data:      alertRule.Data,
+	}
+}
+
 // Diff calculates diff between two alert rules. Returns nil if two rules are equal. Otherwise, returns cmputil.DiffReport
 func (alertRule *AlertRule) Diff(rule *AlertRule, ignore ...string) cmputil.DiffReport {
 	var reporter cmputil.DiffReporter
--- a/pkg/services/ngalert/schedule/schedule.go
+++ b/pkg/services/ngalert/schedule/schedule.go
@@ -388,32 +388,25 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 		return q.Result, nil
 	}

-	evaluate := func(ctx context.Context, r *ngmodels.AlertRule, attempt int64, e *evaluation) error {
+	evaluate := func(ctx context.Context, r *ngmodels.AlertRule, attempt int64, e *evaluation) {
 		logger := logger.New("version", r.Version, "attempt", attempt, "now", e.scheduledAt)
 		start := sch.clock.Now()

-		condition := ngmodels.Condition{
-			Condition: r.Condition,
-			OrgID:     r.OrgID,
-			Data:      r.Data,
-		}
-		results, err := sch.evaluator.ConditionEval(&condition, e.scheduledAt)
+		results := sch.evaluator.ConditionEval(r.GetEvalCondition(), e.scheduledAt)
 		dur := sch.clock.Now().Sub(start)
 		evalTotal.Inc()
 		evalDuration.Observe(dur.Seconds())
-		if err != nil {
+		if results.HasErrors() {
 			evalTotalFailures.Inc()
-			// consider saving alert instance on error
-			logger.Error("failed to evaluate alert rule", "duration", dur, "err", err)
-			return err
+			logger.Error("failed to evaluate alert rule", "results", results, "duration", dur)
+		} else {
+			logger.Debug("alert rule evaluated", "results", results, "duration", dur)
 		}
-		logger.Debug("alert rule evaluated", "results", results, "duration", dur)

 		processedStates := sch.stateManager.ProcessEvalResults(ctx, e.scheduledAt, r, results)
 		sch.saveAlertStates(ctx, processedStates)
 		alerts := FromAlertStateToPostableAlerts(processedStates, sch.stateManager, sch.appURL)
 		sch.alertsSender.Send(key, alerts)
-		return nil
 	}

 	retryIfError := func(f func(attempt int64) error) error {
@@ -475,7 +468,8 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 						currentRule = newRule
 						logger.Debug("new alert rule version fetched", "title", newRule.Title, "version", newRule.Version)
 					}
-					return evaluate(grafanaCtx, currentRule, attempt, ctx)
+					evaluate(grafanaCtx, currentRule, attempt, ctx)
+					return nil
 				})
 				if err != nil {
 					logger.Error("evaluation failed after all retries", "err", err)