Alerting: Add setting to distribute rule group evaluations over time (#80766)

* Simple, per-base-interval jitter

* Add log just for test purposes

* Add strategy approach, allow choosing between group or rule

* Add flag to jitter rules

* Add second toggle for jittering within a group

* Wire up toggles to strategy

* Slightly improve comment ordering

* Add tests for offset generation

* Rename JitterStrategyFrom

* Improve debug log message

* Use grafana SDK labels rather than prometheus labels
This commit is contained in:
Alexander Weaver
2024-01-18 12:48:11 -06:00
committed by GitHub
parent 94c3be3b49
commit 00a260effa
10 changed files with 216 additions and 1 deletions

View File

@@ -186,6 +186,12 @@ func WithInterval(interval time.Duration) AlertRuleMutator {
}
}
// WithIntervalBetween returns a mutator that sets the rule's IntervalSeconds
// to a pseudo-random value in the half-open range [min, max).
//
// If the range is empty or inverted (max <= min), the interval is set to min
// instead of panicking, since rand.Int63n requires a strictly positive argument.
func WithIntervalBetween(min, max int64) AlertRuleMutator {
	return func(rule *AlertRule) {
		span := max - min
		if span <= 0 {
			// Degenerate range: there is nothing to randomize over.
			rule.IntervalSeconds = min
			return
		}
		rule.IntervalSeconds = rand.Int63n(span) + min
	}
}
func WithTitle(title string) AlertRuleMutator {
return func(rule *AlertRule) {
rule.Title = title

View File

@@ -266,6 +266,7 @@ func (ng *AlertNG) init() error {
BaseInterval: ng.Cfg.UnifiedAlerting.BaseInterval,
MinRuleInterval: ng.Cfg.UnifiedAlerting.MinInterval,
DisableGrafanaFolder: ng.Cfg.UnifiedAlerting.ReservedLabels.IsReservedLabelDisabled(models.FolderTitleLabel),
JitterEvaluations: schedule.JitterStrategyFrom(ng.FeatureToggles),
AppURL: appUrl,
EvaluatorFactory: evalFactory,
RuleStore: ng.store,

View File

@@ -0,0 +1,66 @@
package schedule
import (
"fmt"
"time"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/grafana/grafana/pkg/services/featuremgmt"
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
)
// JitterStrategy represents a modifier to alert rule timing that affects how evaluations are distributed.
// It selects which rule attributes feed the hash used to derive a rule's evaluation offset.
type JitterStrategy int
const (
// JitterNever disables jitter; jitterOffsetInTicks returns zero for every rule.
JitterNever JitterStrategy = iota
// JitterByGroup hashes the rule group name, namespace UID and org ID, so every rule
// with the same values for those fields receives the same hash.
JitterByGroup
// JitterByRule additionally hashes the rule UID, so rules within one group can
// receive different offsets.
JitterByRule
)
// JitterStrategyFrom maps the current Grafana feature toggles onto a JitterStrategy.
// The per-rule toggle takes precedence over the per-group toggle; with neither
// enabled the result is JitterNever.
func JitterStrategyFrom(toggles featuremgmt.FeatureToggles) JitterStrategy {
	// Read both toggles up front, in a fixed order, then decide.
	groupJitter := toggles.IsEnabledGlobally(featuremgmt.FlagJitterAlertRules)
	ruleJitter := toggles.IsEnabledGlobally(featuremgmt.FlagJitterAlertRulesWithinGroups)

	switch {
	case ruleJitter:
		return JitterByRule
	case groupJitter:
		return JitterByGroup
	default:
		return JitterNever
	}
}
// jitterOffsetInTicks gives the jitter offset for a rule, in terms of a number of ticks relative to its interval and a base interval.
//
// The result is always in the range [0, r.IntervalSeconds/baseInterval), and is zero when:
//   - strategy is JitterNever,
//   - baseInterval is shorter than one second (it truncates to zero whole seconds), or
//   - the rule's IntervalSeconds is zero, negative, or smaller than baseInterval.
//
// The last two cases describe a malformed rule or configuration. Returning zero there, instead of
// dividing or taking a modulus by zero, keeps callers that compute the offset before validating
// the interval from panicking.
func jitterOffsetInTicks(r *ngmodels.AlertRule, baseInterval time.Duration, strategy JitterStrategy) int64 {
	if strategy == JitterNever {
		return 0
	}

	baseSeconds := int64(baseInterval.Seconds())
	if baseSeconds <= 0 {
		// Integer division by zero would panic; no meaningful tick count exists.
		return 0
	}

	itemFrequency := r.IntervalSeconds / baseSeconds
	if itemFrequency <= 0 {
		// A modulus of zero would panic, and a negative value converted to uint64
		// would become a huge modulus that breaks the range guarantee below.
		return 0
	}

	// itemFrequency is strictly positive here, so:
	//   0 <= offset < itemFrequency <= int64.max
	// and the conversion back to int64 can neither overflow nor go negative.
	offset := jitterHash(r, strategy) % uint64(itemFrequency)
	return int64(offset)
}
// jitterHash fingerprints the attributes of a rule that are relevant to the given strategy.
// The rule group name, namespace UID and org ID are always included; the rule UID
// is added only for JitterByRule.
func jitterHash(r *ngmodels.AlertRule, strategy JitterStrategy) uint64 {
	labels := make(data.Labels, 4)
	labels["name"] = r.RuleGroup
	labels["file"] = r.NamespaceUID
	labels["orgId"] = fmt.Sprint(r.OrgID)

	if strategy == JitterByRule {
		labels["uid"] = r.UID
	}

	fingerprint := labels.Fingerprint()
	return uint64(fingerprint)
}

View File

@@ -0,0 +1,100 @@
package schedule
import (
"testing"
"time"
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/stretchr/testify/require"
)
func TestJitter(t *testing.T) {
t.Run("when strategy is JitterNever", func(t *testing.T) {
t.Run("offset is always zero", func(t *testing.T) {
rules := createTestRules(100, ngmodels.WithIntervalBetween(10, 600))
baseInterval := 10 * time.Second
for _, r := range rules {
offset := jitterOffsetInTicks(r, baseInterval, JitterNever)
require.Zero(t, offset, "unexpected offset, should be zero with jitter disabled; got %d", offset)
}
})
})
t.Run("when strategy is JitterByGroup", func(t *testing.T) {
t.Run("offset is stable for the same rule", func(t *testing.T) {
rule := ngmodels.AlertRuleGen(ngmodels.WithIntervalBetween(10, 600))()
baseInterval := 10 * time.Second
original := jitterOffsetInTicks(rule, baseInterval, JitterByGroup)
for i := 0; i < 100; i++ {
offset := jitterOffsetInTicks(rule, baseInterval, JitterByGroup)
require.Equal(t, original, offset, "jitterOffsetInTicks should return the same value for the same rule")
}
})
t.Run("offset is on the interval [0, interval/baseInterval)", func(t *testing.T) {
baseInterval := 10 * time.Second
rules := createTestRules(1000, ngmodels.WithIntervalBetween(10, 600))
for _, r := range rules {
offset := jitterOffsetInTicks(r, baseInterval, JitterByGroup)
require.GreaterOrEqual(t, offset, int64(0), "offset cannot be negative, got %d for rule with interval %d", offset, r.IntervalSeconds)
upperLimit := r.IntervalSeconds / int64(baseInterval.Seconds())
require.Less(t, offset, upperLimit, "offset cannot be equal to or greater than interval/baseInterval of %d", upperLimit)
}
})
t.Run("offset for any rule in the same group is always the same", func(t *testing.T) {
baseInterval := 10 * time.Second
group1 := ngmodels.AlertRuleGroupKey{}
group2 := ngmodels.AlertRuleGroupKey{}
rules1 := createTestRules(1000, ngmodels.WithInterval(60*time.Second), ngmodels.WithGroupKey(group1))
rules2 := createTestRules(1000, ngmodels.WithInterval(1*time.Hour), ngmodels.WithGroupKey(group2))
group1Offset := jitterOffsetInTicks(rules1[0], baseInterval, JitterByGroup)
for _, r := range rules1 {
offset := jitterOffsetInTicks(r, baseInterval, JitterByGroup)
require.Equal(t, group1Offset, offset)
}
group2Offset := jitterOffsetInTicks(rules2[0], baseInterval, JitterByGroup)
for _, r := range rules2 {
offset := jitterOffsetInTicks(r, baseInterval, JitterByGroup)
require.Equal(t, group2Offset, offset)
}
})
})
t.Run("when strategy is JitterByRule", func(t *testing.T) {
t.Run("offset is stable for the same rule", func(t *testing.T) {
rule := ngmodels.AlertRuleGen(ngmodels.WithIntervalBetween(10, 600))()
baseInterval := 10 * time.Second
original := jitterOffsetInTicks(rule, baseInterval, JitterByRule)
for i := 0; i < 100; i++ {
offset := jitterOffsetInTicks(rule, baseInterval, JitterByRule)
require.Equal(t, original, offset, "jitterOffsetInTicks should return the same value for the same rule")
}
})
t.Run("offset is on the interval [0, interval/baseInterval)", func(t *testing.T) {
baseInterval := 10 * time.Second
rules := createTestRules(1000, ngmodels.WithIntervalBetween(10, 600))
for _, r := range rules {
offset := jitterOffsetInTicks(r, baseInterval, JitterByRule)
require.GreaterOrEqual(t, offset, int64(0), "offset cannot be negative, got %d for rule with interval %d", offset, r.IntervalSeconds)
upperLimit := r.IntervalSeconds / int64(baseInterval.Seconds())
require.Less(t, offset, upperLimit, "offset cannot be equal to or greater than interval/baseInterval of %d", upperLimit)
}
})
})
}
// createTestRules generates n alert rules, each produced by a fresh
// ngmodels.AlertRuleGen call configured with the given mutators.
func createTestRules(n int, mutators ...ngmodels.AlertRuleMutator) []*ngmodels.AlertRule {
	rules := make([]*ngmodels.AlertRule, n)
	for idx := range rules {
		rules[idx] = ngmodels.AlertRuleGen(mutators...)()
	}
	return rules
}

View File

@@ -81,6 +81,7 @@ type schedule struct {
appURL *url.URL
disableGrafanaFolder bool
jitterEvaluations JitterStrategy
metrics *metrics.Scheduler
@@ -104,6 +105,7 @@ type SchedulerCfg struct {
MinRuleInterval time.Duration
DisableGrafanaFolder bool
AppURL *url.URL
JitterEvaluations JitterStrategy
EvaluatorFactory eval.EvaluatorFactory
RuleStore RulesStore
Metrics *metrics.Scheduler
@@ -131,6 +133,7 @@ func NewScheduler(cfg SchedulerCfg, stateManager *state.Manager) *schedule {
metrics: cfg.Metrics,
appURL: cfg.AppURL,
disableGrafanaFolder: cfg.DisableGrafanaFolder,
jitterEvaluations: cfg.JitterEvaluations,
stateManager: stateManager,
minRuleInterval: cfg.MinRuleInterval,
schedulableAlertRules: alertRulesRegistry{rules: make(map[ngmodels.AlertRuleKey]*ngmodels.AlertRule)},
@@ -293,7 +296,8 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
}
itemFrequency := item.IntervalSeconds / int64(sch.baseInterval.Seconds())
isReadyToRun := item.IntervalSeconds != 0 && tickNum%itemFrequency == 0
offset := jitterOffsetInTicks(item, sch.baseInterval, sch.jitterEvaluations)
isReadyToRun := item.IntervalSeconds != 0 && (tickNum%itemFrequency)-offset == 0
var folderTitle string
if !sch.disableGrafanaFolder {
@@ -306,6 +310,7 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
}
if isReadyToRun {
sch.log.Debug("Rule is ready to run on the current tick", "uid", item.UID, "tick", tickNum, "frequency", itemFrequency, "offset", offset)
readyToRun = append(readyToRun, readyToRunItem{ruleInfo: ruleInfo, evaluation: evaluation{
scheduledAt: tick,
rule: item,