Alerting: Track recording rule health and last eval info ephemerally (#90247)

* Track health and last eval info

* Read method for status

* Minor tests
This commit is contained in:
Alexander Weaver 2024-07-11 14:05:09 -05:00 committed by GitHub
parent 6874202dfa
commit ab32183e18
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 70 additions and 18 deletions

View File

@ -11,6 +11,7 @@ import (
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/trace"
"go.uber.org/atomic"
"github.com/grafana/grafana/pkg/cmd/grafana-cli/logger"
"github.com/grafana/grafana/pkg/infra/log"
@ -22,18 +23,30 @@ import (
"github.com/grafana/grafana/pkg/util"
)
// RuleStatus is a point-in-time view of a rule's most recent evaluation,
// as returned by recordingRule.Status.
type RuleStatus struct {
// Health is "unknown" on a freshly created rule, and becomes "ok" after a
// successful evaluation or "error" after a failed one.
Health string
// LastError is the error from the most recent failed evaluation; it is reset
// to nil when an evaluation succeeds.
LastError error
// EvaluationTimestamp is the clock reading taken when the most recent
// evaluation finished. It starts out as the zero time.
EvaluationTimestamp time.Time
// EvaluationDuration is how long the most recent evaluation took, measured
// from its start up to the EvaluationTimestamp reading. It starts out as 0.
EvaluationDuration time.Duration
}
type recordingRule struct {
key ngmodels.AlertRuleKey
ctx context.Context
evalCh chan *Evaluation
stopFn util.CancelCauseFunc
ctx context.Context
evalCh chan *Evaluation
stopFn util.CancelCauseFunc
health *atomic.String
lastError *atomic.Error
evaluationTimestamp *atomic.Time
evaluationDuration *atomic.Duration
maxAttempts int64
clock clock.Clock
evalFactory eval.EvaluatorFactory
featureToggles featuremgmt.FeatureToggles
writer RecordingWriter
// Event hooks that are only used in tests.
evalAppliedHook evalAppliedFunc
@ -41,25 +54,36 @@ type recordingRule struct {
logger log.Logger
metrics *metrics.Scheduler
tracer tracing.Tracer
writer RecordingWriter
}
// newRecordingRule builds a recordingRule for key.
//
// The rule's context is derived from parent with the rule key attached, and
// can be cancelled (with a cause) through the stored stopFn. The logger is
// bound to that same context, and evalCh is created unbuffered.
//
// Status tracking starts out as health "unknown", a nil last error, and a zero
// evaluation timestamp and duration.
func newRecordingRule(parent context.Context, key ngmodels.AlertRuleKey, maxAttempts int64, clock clock.Clock, evalFactory eval.EvaluatorFactory, ft featuremgmt.FeatureToggles, logger log.Logger, metrics *metrics.Scheduler, tracer tracing.Tracer, writer RecordingWriter) *recordingRule {
ctx, stop := util.WithCancelCause(ngmodels.WithRuleKey(parent, key))
return &recordingRule{
key: key,
ctx: ctx,
evalCh: make(chan *Evaluation),
stopFn: stop,
clock: clock,
evalFactory: evalFactory,
featureToggles: ft,
maxAttempts: maxAttempts,
logger: logger.FromContext(ctx),
metrics: metrics,
tracer: tracer,
writer: writer,
key: key,
ctx: ctx,
evalCh: make(chan *Evaluation),
stopFn: stop,
health: atomic.NewString("unknown"),
lastError: atomic.NewError(nil),
evaluationTimestamp: atomic.NewTime(time.Time{}),
evaluationDuration: atomic.NewDuration(0),
clock: clock,
evalFactory: evalFactory,
featureToggles: ft,
maxAttempts: maxAttempts,
logger: logger.FromContext(ctx),
metrics: metrics,
tracer: tracer,
writer: writer,
}
}
// Status reports the rule's current health, last error, and the timestamp and
// duration of its latest evaluation.
//
// Each field is loaded from its own atomic value one after another, so the
// result is not a single atomic snapshot: a call that overlaps an evaluation
// finishing may combine values from two different evaluations.
func (r *recordingRule) Status() RuleStatus {
	var snapshot RuleStatus
	snapshot.Health = r.health.Load()
	snapshot.LastError = r.lastError.Load()
	snapshot.EvaluationTimestamp = r.evaluationTimestamp.Load()
	snapshot.EvaluationDuration = r.evaluationDuration.Load()
	return snapshot
}
@ -127,7 +151,12 @@ func (r *recordingRule) doEvaluate(ctx context.Context, ev *Evaluation) {
defer func() {
evalTotal.Inc()
evalDuration.Observe(r.clock.Now().Sub(evalStart).Seconds())
end := r.clock.Now()
dur := end.Sub(evalStart)
evalDuration.Observe(dur.Seconds())
r.evaluationTimestamp.Store(end)
r.evaluationDuration.Store(dur)
r.evaluationDoneTestHook(ev)
}()
@ -183,11 +212,15 @@ func (r *recordingRule) doEvaluate(ctx context.Context, ev *Evaluation) {
evalTotalFailures.Inc()
span.SetStatus(codes.Error, "rule evaluation failed")
span.RecordError(latestError)
r.lastError.Store(latestError)
r.health.Store("error")
if r.maxAttempts > 0 {
logger.Error("Recording rule evaluation failed after all attempts", "lastError", latestError)
}
} else {
logger.Debug("Recording rule evaluation succeeded")
r.lastError.Store(nil)
r.health.Store("ok")
}
}
@ -204,6 +237,7 @@ func (r *recordingRule) tryEvaluation(ctx context.Context, ev *Evaluation, logge
if err := eval.FindConditionError(result, ev.rule.Record.From); err != nil {
return fmt.Errorf("the query failed with an error: %w", err)
}
// TODO: This is missing dedicated logic for NoData. If NoData we can skip the write.
logger.Info("Recording rule evaluated", "results", result, "duration", evalDur)
span := trace.SpanFromContext(ctx)

View File

@ -171,6 +171,16 @@ func TestRecordingRule_Integration(t *testing.T) {
go func() {
_ = process.Run()
}()
t.Run("status shows no evaluations", func(t *testing.T) {
status := process.(*recordingRule).Status()
require.Equal(t, "unknown", status.Health)
require.Nil(t, status.LastError)
require.Zero(t, status.EvaluationTimestamp)
require.Zero(t, status.EvaluationDuration)
})
process.Eval(&Evaluation{
scheduledAt: now,
rule: rule,
@ -178,6 +188,14 @@ func TestRecordingRule_Integration(t *testing.T) {
})
_ = waitForTimeChannel(t, evalDoneChan)
t.Run("status shows evaluation", func(t *testing.T) {
status := process.(*recordingRule).Status()
// TODO: Due to the randomness in the test, the rule randomly succeeds or fails.
// TODO: Solve this in a future PR, and assert something more strict here.
require.NotEqual(t, "unknown", status.Health)
})
t.Run("reports basic evaluation metrics", func(t *testing.T) {
expectedMetric := fmt.Sprintf(
`