mirror of
https://github.com/grafana/grafana.git
synced 2024-11-29 04:04:00 -06:00
Alerting: Track recording rule health and last eval info ephemerally (#90247)
* Track health and last eval info * Read method for status * Minor tests
This commit is contained in:
parent
6874202dfa
commit
ab32183e18
@ -11,6 +11,7 @@ import (
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/codes"
|
||||
"go.opentelemetry.io/otel/trace"
|
||||
"go.uber.org/atomic"
|
||||
|
||||
"github.com/grafana/grafana/pkg/cmd/grafana-cli/logger"
|
||||
"github.com/grafana/grafana/pkg/infra/log"
|
||||
@ -22,18 +23,30 @@ import (
|
||||
"github.com/grafana/grafana/pkg/util"
|
||||
)
|
||||
|
||||
type RuleStatus struct {
|
||||
Health string
|
||||
LastError error
|
||||
EvaluationTimestamp time.Time
|
||||
EvaluationDuration time.Duration
|
||||
}
|
||||
|
||||
type recordingRule struct {
|
||||
key ngmodels.AlertRuleKey
|
||||
|
||||
ctx context.Context
|
||||
evalCh chan *Evaluation
|
||||
stopFn util.CancelCauseFunc
|
||||
ctx context.Context
|
||||
evalCh chan *Evaluation
|
||||
stopFn util.CancelCauseFunc
|
||||
health *atomic.String
|
||||
lastError *atomic.Error
|
||||
evaluationTimestamp *atomic.Time
|
||||
evaluationDuration *atomic.Duration
|
||||
|
||||
maxAttempts int64
|
||||
|
||||
clock clock.Clock
|
||||
evalFactory eval.EvaluatorFactory
|
||||
featureToggles featuremgmt.FeatureToggles
|
||||
writer RecordingWriter
|
||||
|
||||
// Event hooks that are only used in tests.
|
||||
evalAppliedHook evalAppliedFunc
|
||||
@ -41,25 +54,36 @@ type recordingRule struct {
|
||||
logger log.Logger
|
||||
metrics *metrics.Scheduler
|
||||
tracer tracing.Tracer
|
||||
|
||||
writer RecordingWriter
|
||||
}
|
||||
|
||||
func newRecordingRule(parent context.Context, key ngmodels.AlertRuleKey, maxAttempts int64, clock clock.Clock, evalFactory eval.EvaluatorFactory, ft featuremgmt.FeatureToggles, logger log.Logger, metrics *metrics.Scheduler, tracer tracing.Tracer, writer RecordingWriter) *recordingRule {
|
||||
ctx, stop := util.WithCancelCause(ngmodels.WithRuleKey(parent, key))
|
||||
return &recordingRule{
|
||||
key: key,
|
||||
ctx: ctx,
|
||||
evalCh: make(chan *Evaluation),
|
||||
stopFn: stop,
|
||||
clock: clock,
|
||||
evalFactory: evalFactory,
|
||||
featureToggles: ft,
|
||||
maxAttempts: maxAttempts,
|
||||
logger: logger.FromContext(ctx),
|
||||
metrics: metrics,
|
||||
tracer: tracer,
|
||||
writer: writer,
|
||||
key: key,
|
||||
ctx: ctx,
|
||||
evalCh: make(chan *Evaluation),
|
||||
stopFn: stop,
|
||||
health: atomic.NewString("unknown"),
|
||||
lastError: atomic.NewError(nil),
|
||||
evaluationTimestamp: atomic.NewTime(time.Time{}),
|
||||
evaluationDuration: atomic.NewDuration(0),
|
||||
clock: clock,
|
||||
evalFactory: evalFactory,
|
||||
featureToggles: ft,
|
||||
maxAttempts: maxAttempts,
|
||||
logger: logger.FromContext(ctx),
|
||||
metrics: metrics,
|
||||
tracer: tracer,
|
||||
writer: writer,
|
||||
}
|
||||
}
|
||||
|
||||
func (r *recordingRule) Status() RuleStatus {
|
||||
return RuleStatus{
|
||||
Health: r.health.Load(),
|
||||
LastError: r.lastError.Load(),
|
||||
EvaluationTimestamp: r.evaluationTimestamp.Load(),
|
||||
EvaluationDuration: r.evaluationDuration.Load(),
|
||||
}
|
||||
}
|
||||
|
||||
@ -127,7 +151,12 @@ func (r *recordingRule) doEvaluate(ctx context.Context, ev *Evaluation) {
|
||||
|
||||
defer func() {
|
||||
evalTotal.Inc()
|
||||
evalDuration.Observe(r.clock.Now().Sub(evalStart).Seconds())
|
||||
end := r.clock.Now()
|
||||
dur := end.Sub(evalStart)
|
||||
evalDuration.Observe(dur.Seconds())
|
||||
r.evaluationTimestamp.Store(end)
|
||||
r.evaluationDuration.Store(dur)
|
||||
|
||||
r.evaluationDoneTestHook(ev)
|
||||
}()
|
||||
|
||||
@ -183,11 +212,15 @@ func (r *recordingRule) doEvaluate(ctx context.Context, ev *Evaluation) {
|
||||
evalTotalFailures.Inc()
|
||||
span.SetStatus(codes.Error, "rule evaluation failed")
|
||||
span.RecordError(latestError)
|
||||
r.lastError.Store(latestError)
|
||||
r.health.Store("error")
|
||||
if r.maxAttempts > 0 {
|
||||
logger.Error("Recording rule evaluation failed after all attempts", "lastError", latestError)
|
||||
}
|
||||
} else {
|
||||
logger.Debug("Recording rule evaluation succeeded")
|
||||
r.lastError.Store(nil)
|
||||
r.health.Store("ok")
|
||||
}
|
||||
}
|
||||
|
||||
@ -204,6 +237,7 @@ func (r *recordingRule) tryEvaluation(ctx context.Context, ev *Evaluation, logge
|
||||
if err := eval.FindConditionError(result, ev.rule.Record.From); err != nil {
|
||||
return fmt.Errorf("the query failed with an error: %w", err)
|
||||
}
|
||||
// TODO: This is missing dedicated logic for NoData. If NoData we can skip the write.
|
||||
|
||||
logger.Info("Recording rule evaluated", "results", result, "duration", evalDur)
|
||||
span := trace.SpanFromContext(ctx)
|
||||
|
@ -171,6 +171,16 @@ func TestRecordingRule_Integration(t *testing.T) {
|
||||
go func() {
|
||||
_ = process.Run()
|
||||
}()
|
||||
|
||||
t.Run("status shows no evaluations", func(t *testing.T) {
|
||||
status := process.(*recordingRule).Status()
|
||||
|
||||
require.Equal(t, "unknown", status.Health)
|
||||
require.Nil(t, status.LastError)
|
||||
require.Zero(t, status.EvaluationTimestamp)
|
||||
require.Zero(t, status.EvaluationDuration)
|
||||
})
|
||||
|
||||
process.Eval(&Evaluation{
|
||||
scheduledAt: now,
|
||||
rule: rule,
|
||||
@ -178,6 +188,14 @@ func TestRecordingRule_Integration(t *testing.T) {
|
||||
})
|
||||
_ = waitForTimeChannel(t, evalDoneChan)
|
||||
|
||||
t.Run("status shows evaluation", func(t *testing.T) {
|
||||
status := process.(*recordingRule).Status()
|
||||
|
||||
// TODO: Due to the randomness in the test, the rule randomly succeeds or fails.
|
||||
// TODO: Solve this in a future PR, and assert something more strict here.
|
||||
require.NotEqual(t, "unknown", status.Health)
|
||||
})
|
||||
|
||||
t.Run("reports basic evaluation metrics", func(t *testing.T) {
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`
|
||||
|
Loading…
Reference in New Issue
Block a user