Alerting: Add setting to distribute rule group evaluations over time (#80766)

* Simple, per-base-interval jitter

* Add log just for test purposes

* Add strategy approach, allow choosing between group or rule

* Add flag to jitter rules

* Add second toggle for jittering within a group

* Wire up toggles to strategy

* Slightly improve comment ordering

* Add tests for offset generation

* Rename JitterStrategyFrom

* Improve debug log message

* Use grafana SDK labels rather than prometheus labels
This commit is contained in:
Alexander Weaver
2024-01-18 12:48:11 -06:00
committed by GitHub
parent 94c3be3b49
commit 00a260effa
10 changed files with 216 additions and 1 deletions

View File

@@ -81,6 +81,7 @@ type schedule struct {
appURL *url.URL
disableGrafanaFolder bool
+jitterEvaluations JitterStrategy
metrics *metrics.Scheduler
@@ -104,6 +105,7 @@ type SchedulerCfg struct {
MinRuleInterval time.Duration
DisableGrafanaFolder bool
AppURL *url.URL
+JitterEvaluations JitterStrategy
EvaluatorFactory eval.EvaluatorFactory
RuleStore RulesStore
Metrics *metrics.Scheduler
@@ -131,6 +133,7 @@ func NewScheduler(cfg SchedulerCfg, stateManager *state.Manager) *schedule {
metrics: cfg.Metrics,
appURL: cfg.AppURL,
disableGrafanaFolder: cfg.DisableGrafanaFolder,
+jitterEvaluations: cfg.JitterEvaluations,
stateManager: stateManager,
minRuleInterval: cfg.MinRuleInterval,
schedulableAlertRules: alertRulesRegistry{rules: make(map[ngmodels.AlertRuleKey]*ngmodels.AlertRule)},
@@ -293,7 +296,8 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
}
itemFrequency := item.IntervalSeconds / int64(sch.baseInterval.Seconds())
-isReadyToRun := item.IntervalSeconds != 0 && tickNum%itemFrequency == 0
+offset := jitterOffsetInTicks(item, sch.baseInterval, sch.jitterEvaluations)
+isReadyToRun := item.IntervalSeconds != 0 && (tickNum%itemFrequency)-offset == 0
var folderTitle string
if !sch.disableGrafanaFolder {
@@ -306,6 +310,7 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
}
if isReadyToRun {
+sch.log.Debug("Rule is ready to run on the current tick", "uid", item.UID, "tick", tickNum, "frequency", itemFrequency, "offset", offset)
readyToRun = append(readyToRun, readyToRunItem{ruleInfo: ruleInfo, evaluation: evaluation{
scheduledAt: tick,
rule: item,