Alerting: Log scheduler maxAttempts, guard against invalid retry counts, log retry errors (#80234)

* Log maxAttempts, add guard, log retry errors

* Fix whitespace

* Initialize evaluator in TestProcessTicks
This commit is contained in:
Alexander Weaver 2024-01-09 13:19:37 -06:00 committed by GitHub
parent 1caaa56de0
commit 542741f748
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 24 additions and 10 deletions

View File

@ -114,6 +114,12 @@ type SchedulerCfg struct {
// NewScheduler returns a new schedule.
func NewScheduler(cfg SchedulerCfg, stateManager *state.Manager) *schedule {
const minMaxAttempts = int64(1)
if cfg.MaxAttempts < minMaxAttempts {
cfg.Log.Warn("Invalid scheduler maxAttempts, using a safe minimum", "configured", cfg.MaxAttempts, "actual", minMaxAttempts)
cfg.MaxAttempts = minMaxAttempts
}
sch := schedule{
registry: alertRuleInfoRegistry{alertRuleInfo: make(map[ngmodels.AlertRuleKey]*alertRuleInfo)},
maxAttempts: cfg.MaxAttempts,
@ -136,7 +142,7 @@ func NewScheduler(cfg SchedulerCfg, stateManager *state.Manager) *schedule {
}
func (sch *schedule) Run(ctx context.Context) error {
sch.log.Info("Starting scheduler", "tickInterval", sch.baseInterval)
sch.log.Info("Starting scheduler", "tickInterval", sch.baseInterval, "maxAttempts", sch.maxAttempts)
t := ticker.New(sch.clock, sch.baseInterval, sch.metrics.Ticker)
defer t.Stop()
@ -383,6 +389,9 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
start := sch.clock.Now()
evalCtx := eval.NewContextWithPreviousResults(ctx, SchedulerUserFor(e.rule.OrgID), sch.newLoadedMetricsReader(e.rule))
if sch.evaluatorFactory == nil {
panic("evalfactory nil")
}
ruleEval, err := sch.evaluatorFactory.Create(evalCtx, e.rule.GetEvalCondition())
var results eval.Results
var dur time.Duration
@ -551,7 +560,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
return
}
logger.Error("Failed to evaluate rule", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt)
logger.Error("Failed to evaluate rule", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt, "error", err)
select {
case <-tracingCtx.Done():
logger.Error("Context has been cancelled while backing off", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt)

View File

@ -24,6 +24,7 @@ import (
"github.com/grafana/grafana/pkg/expr"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/tracing"
datasources "github.com/grafana/grafana/pkg/services/datasources/fakes"
"github.com/grafana/grafana/pkg/services/featuremgmt"
"github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
@ -66,15 +67,19 @@ func TestProcessTicks(t *testing.T) {
Host: "localhost",
}
cacheServ := &datasources.FakeCacheService{}
evaluator := eval.NewEvaluatorFactory(setting.UnifiedAlertingSettings{}, cacheServ, expr.ProvideService(&setting.Cfg{ExpressionsEnabled: true}, nil, nil, &featuremgmt.FeatureManager{}, nil, tracing.InitializeTracerForTest()), &pluginstore.FakePluginStore{})
schedCfg := SchedulerCfg{
BaseInterval: cfg.BaseInterval,
C: mockedClock,
AppURL: appUrl,
RuleStore: ruleStore,
Metrics: testMetrics.GetSchedulerMetrics(),
AlertSender: notifier,
Tracer: testTracer,
Log: log.New("ngalert.scheduler"),
BaseInterval: cfg.BaseInterval,
C: mockedClock,
AppURL: appUrl,
EvaluatorFactory: evaluator,
RuleStore: ruleStore,
Metrics: testMetrics.GetSchedulerMetrics(),
AlertSender: notifier,
Tracer: testTracer,
Log: log.New("ngalert.scheduler"),
}
managerCfg := state.ManagerCfg{
Metrics: testMetrics.GetStateMetrics(),