Alerting: Log scheduler maxAttempts, guard against invalid retry counts, log retry errors (#80234)

* Log maxAttempts, add guard, log retry errors

* fix whitespace

* Initialize evaluator in TestProcessTicks
This commit is contained in:
Alexander Weaver 2024-01-09 13:19:37 -06:00 committed by GitHub
parent 1caaa56de0
commit 542741f748
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 24 additions and 10 deletions

View File

@ -114,6 +114,12 @@ type SchedulerCfg struct {
// NewScheduler returns a new schedule. // NewScheduler returns a new schedule.
func NewScheduler(cfg SchedulerCfg, stateManager *state.Manager) *schedule { func NewScheduler(cfg SchedulerCfg, stateManager *state.Manager) *schedule {
const minMaxAttempts = int64(1)
if cfg.MaxAttempts < minMaxAttempts {
cfg.Log.Warn("Invalid scheduler maxAttempts, using a safe minimum", "configured", cfg.MaxAttempts, "actual", minMaxAttempts)
cfg.MaxAttempts = minMaxAttempts
}
sch := schedule{ sch := schedule{
registry: alertRuleInfoRegistry{alertRuleInfo: make(map[ngmodels.AlertRuleKey]*alertRuleInfo)}, registry: alertRuleInfoRegistry{alertRuleInfo: make(map[ngmodels.AlertRuleKey]*alertRuleInfo)},
maxAttempts: cfg.MaxAttempts, maxAttempts: cfg.MaxAttempts,
@ -136,7 +142,7 @@ func NewScheduler(cfg SchedulerCfg, stateManager *state.Manager) *schedule {
} }
func (sch *schedule) Run(ctx context.Context) error { func (sch *schedule) Run(ctx context.Context) error {
sch.log.Info("Starting scheduler", "tickInterval", sch.baseInterval) sch.log.Info("Starting scheduler", "tickInterval", sch.baseInterval, "maxAttempts", sch.maxAttempts)
t := ticker.New(sch.clock, sch.baseInterval, sch.metrics.Ticker) t := ticker.New(sch.clock, sch.baseInterval, sch.metrics.Ticker)
defer t.Stop() defer t.Stop()
@ -383,6 +389,9 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
start := sch.clock.Now() start := sch.clock.Now()
evalCtx := eval.NewContextWithPreviousResults(ctx, SchedulerUserFor(e.rule.OrgID), sch.newLoadedMetricsReader(e.rule)) evalCtx := eval.NewContextWithPreviousResults(ctx, SchedulerUserFor(e.rule.OrgID), sch.newLoadedMetricsReader(e.rule))
if sch.evaluatorFactory == nil {
panic("evalfactory nil")
}
ruleEval, err := sch.evaluatorFactory.Create(evalCtx, e.rule.GetEvalCondition()) ruleEval, err := sch.evaluatorFactory.Create(evalCtx, e.rule.GetEvalCondition())
var results eval.Results var results eval.Results
var dur time.Duration var dur time.Duration
@ -551,7 +560,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
return return
} }
logger.Error("Failed to evaluate rule", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt) logger.Error("Failed to evaluate rule", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt, "error", err)
select { select {
case <-tracingCtx.Done(): case <-tracingCtx.Done():
logger.Error("Context has been cancelled while backing off", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt) logger.Error("Context has been cancelled while backing off", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt)

View File

@ -24,6 +24,7 @@ import (
"github.com/grafana/grafana/pkg/expr" "github.com/grafana/grafana/pkg/expr"
"github.com/grafana/grafana/pkg/infra/log" "github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/tracing" "github.com/grafana/grafana/pkg/infra/tracing"
datasources "github.com/grafana/grafana/pkg/services/datasources/fakes"
"github.com/grafana/grafana/pkg/services/featuremgmt" "github.com/grafana/grafana/pkg/services/featuremgmt"
"github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/grafana/grafana/pkg/services/ngalert/eval" "github.com/grafana/grafana/pkg/services/ngalert/eval"
@ -66,15 +67,19 @@ func TestProcessTicks(t *testing.T) {
Host: "localhost", Host: "localhost",
} }
cacheServ := &datasources.FakeCacheService{}
evaluator := eval.NewEvaluatorFactory(setting.UnifiedAlertingSettings{}, cacheServ, expr.ProvideService(&setting.Cfg{ExpressionsEnabled: true}, nil, nil, &featuremgmt.FeatureManager{}, nil, tracing.InitializeTracerForTest()), &pluginstore.FakePluginStore{})
schedCfg := SchedulerCfg{ schedCfg := SchedulerCfg{
BaseInterval: cfg.BaseInterval, BaseInterval: cfg.BaseInterval,
C: mockedClock, C: mockedClock,
AppURL: appUrl, AppURL: appUrl,
RuleStore: ruleStore, EvaluatorFactory: evaluator,
Metrics: testMetrics.GetSchedulerMetrics(), RuleStore: ruleStore,
AlertSender: notifier, Metrics: testMetrics.GetSchedulerMetrics(),
Tracer: testTracer, AlertSender: notifier,
Log: log.New("ngalert.scheduler"), Tracer: testTracer,
Log: log.New("ngalert.scheduler"),
} }
managerCfg := state.ManagerCfg{ managerCfg := state.ManagerCfg{
Metrics: testMetrics.GetStateMetrics(), Metrics: testMetrics.GetStateMetrics(),