mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Log scheduler maxAttempts, guard against invalid retry counts, log retry errors (#80234)
* Log maxAttempts, add guard, log retry errors
* fix whitespace
* Initialize evaluator in TestProcessTicks
This commit is contained in:
parent
1caaa56de0
commit
542741f748
@ -114,6 +114,12 @@ type SchedulerCfg struct {
|
|||||||
|
|
||||||
// NewScheduler returns a new schedule.
|
// NewScheduler returns a new schedule.
|
||||||
func NewScheduler(cfg SchedulerCfg, stateManager *state.Manager) *schedule {
|
func NewScheduler(cfg SchedulerCfg, stateManager *state.Manager) *schedule {
|
||||||
|
const minMaxAttempts = int64(1)
|
||||||
|
if cfg.MaxAttempts < minMaxAttempts {
|
||||||
|
cfg.Log.Warn("Invalid scheduler maxAttempts, using a safe minimum", "configured", cfg.MaxAttempts, "actual", minMaxAttempts)
|
||||||
|
cfg.MaxAttempts = minMaxAttempts
|
||||||
|
}
|
||||||
|
|
||||||
sch := schedule{
|
sch := schedule{
|
||||||
registry: alertRuleInfoRegistry{alertRuleInfo: make(map[ngmodels.AlertRuleKey]*alertRuleInfo)},
|
registry: alertRuleInfoRegistry{alertRuleInfo: make(map[ngmodels.AlertRuleKey]*alertRuleInfo)},
|
||||||
maxAttempts: cfg.MaxAttempts,
|
maxAttempts: cfg.MaxAttempts,
|
||||||
@ -136,7 +142,7 @@ func NewScheduler(cfg SchedulerCfg, stateManager *state.Manager) *schedule {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (sch *schedule) Run(ctx context.Context) error {
|
func (sch *schedule) Run(ctx context.Context) error {
|
||||||
sch.log.Info("Starting scheduler", "tickInterval", sch.baseInterval)
|
sch.log.Info("Starting scheduler", "tickInterval", sch.baseInterval, "maxAttempts", sch.maxAttempts)
|
||||||
t := ticker.New(sch.clock, sch.baseInterval, sch.metrics.Ticker)
|
t := ticker.New(sch.clock, sch.baseInterval, sch.metrics.Ticker)
|
||||||
defer t.Stop()
|
defer t.Stop()
|
||||||
|
|
||||||
@ -383,6 +389,9 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
|
|||||||
start := sch.clock.Now()
|
start := sch.clock.Now()
|
||||||
|
|
||||||
evalCtx := eval.NewContextWithPreviousResults(ctx, SchedulerUserFor(e.rule.OrgID), sch.newLoadedMetricsReader(e.rule))
|
evalCtx := eval.NewContextWithPreviousResults(ctx, SchedulerUserFor(e.rule.OrgID), sch.newLoadedMetricsReader(e.rule))
|
||||||
|
if sch.evaluatorFactory == nil {
|
||||||
|
panic("evalfactory nil")
|
||||||
|
}
|
||||||
ruleEval, err := sch.evaluatorFactory.Create(evalCtx, e.rule.GetEvalCondition())
|
ruleEval, err := sch.evaluatorFactory.Create(evalCtx, e.rule.GetEvalCondition())
|
||||||
var results eval.Results
|
var results eval.Results
|
||||||
var dur time.Duration
|
var dur time.Duration
|
||||||
@ -551,7 +560,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.Error("Failed to evaluate rule", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt)
|
logger.Error("Failed to evaluate rule", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt, "error", err)
|
||||||
select {
|
select {
|
||||||
case <-tracingCtx.Done():
|
case <-tracingCtx.Done():
|
||||||
logger.Error("Context has been cancelled while backing off", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt)
|
logger.Error("Context has been cancelled while backing off", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt)
|
||||||
|
@ -24,6 +24,7 @@ import (
|
|||||||
"github.com/grafana/grafana/pkg/expr"
|
"github.com/grafana/grafana/pkg/expr"
|
||||||
"github.com/grafana/grafana/pkg/infra/log"
|
"github.com/grafana/grafana/pkg/infra/log"
|
||||||
"github.com/grafana/grafana/pkg/infra/tracing"
|
"github.com/grafana/grafana/pkg/infra/tracing"
|
||||||
|
datasources "github.com/grafana/grafana/pkg/services/datasources/fakes"
|
||||||
"github.com/grafana/grafana/pkg/services/featuremgmt"
|
"github.com/grafana/grafana/pkg/services/featuremgmt"
|
||||||
"github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
|
"github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
|
||||||
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
||||||
@ -66,15 +67,19 @@ func TestProcessTicks(t *testing.T) {
|
|||||||
Host: "localhost",
|
Host: "localhost",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cacheServ := &datasources.FakeCacheService{}
|
||||||
|
evaluator := eval.NewEvaluatorFactory(setting.UnifiedAlertingSettings{}, cacheServ, expr.ProvideService(&setting.Cfg{ExpressionsEnabled: true}, nil, nil, &featuremgmt.FeatureManager{}, nil, tracing.InitializeTracerForTest()), &pluginstore.FakePluginStore{})
|
||||||
|
|
||||||
schedCfg := SchedulerCfg{
|
schedCfg := SchedulerCfg{
|
||||||
BaseInterval: cfg.BaseInterval,
|
BaseInterval: cfg.BaseInterval,
|
||||||
C: mockedClock,
|
C: mockedClock,
|
||||||
AppURL: appUrl,
|
AppURL: appUrl,
|
||||||
RuleStore: ruleStore,
|
EvaluatorFactory: evaluator,
|
||||||
Metrics: testMetrics.GetSchedulerMetrics(),
|
RuleStore: ruleStore,
|
||||||
AlertSender: notifier,
|
Metrics: testMetrics.GetSchedulerMetrics(),
|
||||||
Tracer: testTracer,
|
AlertSender: notifier,
|
||||||
Log: log.New("ngalert.scheduler"),
|
Tracer: testTracer,
|
||||||
|
Log: log.New("ngalert.scheduler"),
|
||||||
}
|
}
|
||||||
managerCfg := state.ManagerCfg{
|
managerCfg := state.ManagerCfg{
|
||||||
Metrics: testMetrics.GetStateMetrics(),
|
Metrics: testMetrics.GetStateMetrics(),
|
||||||
|
Loading…
Reference in New Issue
Block a user