Alerting: Log scheduler maxAttempts, guard against invalid retry counts, log retry errors (#80234)

* Log maxAttempts, add guard, log retry errors

* Fix whitespace

* Initialize evaluator in TestProcessTicks
This commit is contained in:
Alexander Weaver
2024-01-09 13:19:37 -06:00
committed by GitHub
parent 1caaa56de0
commit 542741f748
2 changed files with 24 additions and 10 deletions

View File

@@ -114,6 +114,12 @@ type SchedulerCfg struct {
// NewScheduler returns a new schedule.
func NewScheduler(cfg SchedulerCfg, stateManager *state.Manager) *schedule {
+const minMaxAttempts = int64(1)
+if cfg.MaxAttempts < minMaxAttempts {
+cfg.Log.Warn("Invalid scheduler maxAttempts, using a safe minimum", "configured", cfg.MaxAttempts, "actual", minMaxAttempts)
+cfg.MaxAttempts = minMaxAttempts
+}
sch := schedule{
registry: alertRuleInfoRegistry{alertRuleInfo: make(map[ngmodels.AlertRuleKey]*alertRuleInfo)},
maxAttempts: cfg.MaxAttempts,
@@ -136,7 +142,7 @@ func NewScheduler(cfg SchedulerCfg, stateManager *state.Manager) *schedule {
}
func (sch *schedule) Run(ctx context.Context) error {
-sch.log.Info("Starting scheduler", "tickInterval", sch.baseInterval)
+sch.log.Info("Starting scheduler", "tickInterval", sch.baseInterval, "maxAttempts", sch.maxAttempts)
t := ticker.New(sch.clock, sch.baseInterval, sch.metrics.Ticker)
defer t.Stop()
@@ -383,6 +389,9 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
start := sch.clock.Now()
evalCtx := eval.NewContextWithPreviousResults(ctx, SchedulerUserFor(e.rule.OrgID), sch.newLoadedMetricsReader(e.rule))
+if sch.evaluatorFactory == nil {
+panic("evalfactory nil")
+}
ruleEval, err := sch.evaluatorFactory.Create(evalCtx, e.rule.GetEvalCondition())
var results eval.Results
var dur time.Duration
@@ -551,7 +560,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
return
}
-logger.Error("Failed to evaluate rule", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt)
+logger.Error("Failed to evaluate rule", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt, "error", err)
select {
case <-tracingCtx.Done():
logger.Error("Context has been cancelled while backing off", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt)