Alerting: Track recording rule health and last eval info ephemerally (#90247)

* Track health and last eval info
* Read method for status
* Minor tests
2025-02-25 18:55:37 -06:00 · 2024-07-11 14:05:09 -05:00 · 2024-07-11 14:05:09 -05:00 · ab32183e18
commit ab32183e18
parent 6874202dfa
2 changed files with 70 additions and 18 deletions
--- a/pkg/services/ngalert/schedule/recording_rule.go
+++ b/pkg/services/ngalert/schedule/recording_rule.go
@ -11,6 +11,7 @@ import (
 	"go.opentelemetry.io/otel/attribute"
 	"go.opentelemetry.io/otel/codes"
 	"go.opentelemetry.io/otel/trace"
+	"go.uber.org/atomic"

 	"github.com/grafana/grafana/pkg/cmd/grafana-cli/logger"
 	"github.com/grafana/grafana/pkg/infra/log"
@ -22,18 +23,30 @@ import (
 	"github.com/grafana/grafana/pkg/util"
 )

+type RuleStatus struct {
+	Health              string        // "unknown" until an evaluation finishes, then "ok" or "error"
+	LastError           error         // error from the most recent failed evaluation; nil after a success
+	EvaluationTimestamp time.Time     // clock time at which the most recent evaluation finished; zero before any
+	EvaluationDuration  time.Duration // elapsed time of the most recent evaluation; zero before any
+}
+
 type recordingRule struct {
 	key ngmodels.AlertRuleKey

-	ctx    context.Context
-	evalCh chan *Evaluation
-	stopFn util.CancelCauseFunc
+	ctx                 context.Context
+	evalCh              chan *Evaluation
+	stopFn              util.CancelCauseFunc
+	health              *atomic.String
+	lastError           *atomic.Error
+	evaluationTimestamp *atomic.Time
+	evaluationDuration  *atomic.Duration

 	maxAttempts int64

 	clock          clock.Clock
 	evalFactory    eval.EvaluatorFactory
 	featureToggles featuremgmt.FeatureToggles
+	writer         RecordingWriter

 	// Event hooks that are only used in tests.
 	evalAppliedHook evalAppliedFunc
@ -41,25 +54,36 @@ type recordingRule struct {
 	logger  log.Logger
 	metrics *metrics.Scheduler
 	tracer  tracing.Tracer
-
-	writer RecordingWriter
 }

 func newRecordingRule(parent context.Context, key ngmodels.AlertRuleKey, maxAttempts int64, clock clock.Clock, evalFactory eval.EvaluatorFactory, ft featuremgmt.FeatureToggles, logger log.Logger, metrics *metrics.Scheduler, tracer tracing.Tracer, writer RecordingWriter) *recordingRule {
 	ctx, stop := util.WithCancelCause(ngmodels.WithRuleKey(parent, key))
 	return &recordingRule{
-		key:            key,
-		ctx:            ctx,
-		evalCh:         make(chan *Evaluation),
-		stopFn:         stop,
-		clock:          clock,
-		evalFactory:    evalFactory,
-		featureToggles: ft,
-		maxAttempts:    maxAttempts,
-		logger:         logger.FromContext(ctx),
-		metrics:        metrics,
-		tracer:         tracer,
-		writer:         writer,
+		key:                 key,
+		ctx:                 ctx,
+		evalCh:              make(chan *Evaluation),
+		stopFn:              stop,
+		health:              atomic.NewString("unknown"), // stays "unknown" until doEvaluate stores "ok" or "error"
+		lastError:           atomic.NewError(nil),
+		evaluationTimestamp: atomic.NewTime(time.Time{}),
+		evaluationDuration:  atomic.NewDuration(0),
+		clock:               clock,
+		evalFactory:         evalFactory,
+		featureToggles:      ft,
+		maxAttempts:         maxAttempts,
+		logger:              logger.FromContext(ctx),
+		metrics:             metrics,
+		tracer:              tracer,
+		writer:              writer,
+	}
+}
+
+func (r *recordingRule) Status() RuleStatus {
+	return RuleStatus{ // each field is loaded on its own; the four values are not read as one atomic unit
+		Health:              r.health.Load(),
+		LastError:           r.lastError.Load(),
+		EvaluationTimestamp: r.evaluationTimestamp.Load(),
+		EvaluationDuration:  r.evaluationDuration.Load(),
 	}
 }

@ -127,7 +151,12 @@ func (r *recordingRule) doEvaluate(ctx context.Context, ev *Evaluation) {

 	defer func() {
 		evalTotal.Inc()
-		evalDuration.Observe(r.clock.Now().Sub(evalStart).Seconds())
+		end := r.clock.Now()
+		dur := end.Sub(evalStart)
+		evalDuration.Observe(dur.Seconds())
+		r.evaluationTimestamp.Store(end)
+		r.evaluationDuration.Store(dur)
+
 		r.evaluationDoneTestHook(ev)
 	}()

@ -183,11 +212,15 @@ func (r *recordingRule) doEvaluate(ctx context.Context, ev *Evaluation) {
 		evalTotalFailures.Inc()
 		span.SetStatus(codes.Error, "rule evaluation failed")
 		span.RecordError(latestError)
+		r.lastError.Store(latestError)
+		r.health.Store("error")
 		if r.maxAttempts > 0 {
 			logger.Error("Recording rule evaluation failed after all attempts", "lastError", latestError)
 		}
 	} else {
 		logger.Debug("Recording rule evaluation succeeded")
+		r.lastError.Store(nil)
+		r.health.Store("ok")
 	}
 }

@ -204,6 +237,7 @@ func (r *recordingRule) tryEvaluation(ctx context.Context, ev *Evaluation, logge
 	if err := eval.FindConditionError(result, ev.rule.Record.From); err != nil {
 		return fmt.Errorf("the query failed with an error: %w", err)
 	}
+	// TODO: Add dedicated handling for NoData; when the result is NoData, the write can be skipped.

 	logger.Info("Recording rule evaluated", "results", result, "duration", evalDur)
 	span := trace.SpanFromContext(ctx)
--- a/pkg/services/ngalert/schedule/recording_rule_test.go
+++ b/pkg/services/ngalert/schedule/recording_rule_test.go
@ -171,6 +171,16 @@ func TestRecordingRule_Integration(t *testing.T) {
 	go func() {
 		_ = process.Run()
 	}()
+
+	t.Run("status shows no evaluations", func(t *testing.T) {
+		status := process.(*recordingRule).Status()
+
+		require.Equal(t, "unknown", status.Health)
+		require.Nil(t, status.LastError)
+		require.Zero(t, status.EvaluationTimestamp)
+		require.Zero(t, status.EvaluationDuration)
+	})
+
 	process.Eval(&Evaluation{
 		scheduledAt: now,
 		rule:        rule,
@ -178,6 +188,14 @@ func TestRecordingRule_Integration(t *testing.T) {
 	})
 	_ = waitForTimeChannel(t, evalDoneChan)

+	t.Run("status shows evaluation", func(t *testing.T) {
+		status := process.(*recordingRule).Status()
+
+		// TODO: Because of randomness in the test, the rule may either succeed or fail.
+		// TODO: Solve this in a future PR, and assert something stricter here.
+		require.NotEqual(t, "unknown", status.Health)
+	})
+
 	t.Run("reports basic evaluation metrics", func(t *testing.T) {
 		expectedMetric := fmt.Sprintf(
 			`