mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Add setting to distribute rule group evaluations over time (#80766)
* Simple, per-base-interval jitter
* Add log just for test purposes
* Add strategy approach, allow choosing between group or rule
* Add flag to jitter rules
* Add second toggle for jittering within a group
* Wire up toggles to strategy
* Slightly improve comment ordering
* Add tests for offset generation
* Rename JitterStrategyFrom
* Improve debug log message
* Use grafana SDK labels rather than prometheus labels
This commit is contained in:
@@ -186,6 +186,12 @@ func WithInterval(interval time.Duration) AlertRuleMutator {
|
||||
}
|
||||
}
|
||||
|
||||
func WithIntervalBetween(min, max int64) AlertRuleMutator {
|
||||
return func(rule *AlertRule) {
|
||||
rule.IntervalSeconds = rand.Int63n(max-min) + min
|
||||
}
|
||||
}
|
||||
|
||||
func WithTitle(title string) AlertRuleMutator {
|
||||
return func(rule *AlertRule) {
|
||||
rule.Title = title
|
||||
|
||||
@@ -266,6 +266,7 @@ func (ng *AlertNG) init() error {
|
||||
BaseInterval: ng.Cfg.UnifiedAlerting.BaseInterval,
|
||||
MinRuleInterval: ng.Cfg.UnifiedAlerting.MinInterval,
|
||||
DisableGrafanaFolder: ng.Cfg.UnifiedAlerting.ReservedLabels.IsReservedLabelDisabled(models.FolderTitleLabel),
|
||||
JitterEvaluations: schedule.JitterStrategyFrom(ng.FeatureToggles),
|
||||
AppURL: appUrl,
|
||||
EvaluatorFactory: evalFactory,
|
||||
RuleStore: ng.store,
|
||||
|
||||
66
pkg/services/ngalert/schedule/jitter.go
Normal file
66
pkg/services/ngalert/schedule/jitter.go
Normal file
@@ -0,0 +1,66 @@
|
||||
package schedule
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/grafana/grafana-plugin-sdk-go/data"
|
||||
"github.com/grafana/grafana/pkg/services/featuremgmt"
|
||||
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
)
|
||||
|
||||
// JitterStrategy represents a modifier to alert rule timing that affects how evaluations are distributed.
type JitterStrategy int

const (
	// JitterNever disables jitter: the computed offset is always zero.
	JitterNever JitterStrategy = iota

	// JitterByGroup derives the offset from the rule's group, namespace and org only,
	// so the rule's own UID does not influence it.
	JitterByGroup

	// JitterByRule additionally mixes the rule UID into the offset.
	JitterByRule
)
|
||||
|
||||
// JitterStrategyFrom returns the JitterStrategy indicated by the current Grafana feature toggles.
|
||||
func JitterStrategyFrom(toggles featuremgmt.FeatureToggles) JitterStrategy {
|
||||
strategy := JitterNever
|
||||
if toggles.IsEnabledGlobally(featuremgmt.FlagJitterAlertRules) {
|
||||
strategy = JitterByGroup
|
||||
}
|
||||
if toggles.IsEnabledGlobally(featuremgmt.FlagJitterAlertRulesWithinGroups) {
|
||||
strategy = JitterByRule
|
||||
}
|
||||
return strategy
|
||||
}
|
||||
|
||||
// jitterOffsetInTicks gives the jitter offset for a rule, in terms of a number of ticks relative to its interval and a base interval.
|
||||
// The resulting number of ticks is non-negative. We assume the rule is well-formed and has an IntervalSeconds greater to or equal than baseInterval.
|
||||
func jitterOffsetInTicks(r *ngmodels.AlertRule, baseInterval time.Duration, strategy JitterStrategy) int64 {
|
||||
if strategy == JitterNever {
|
||||
return 0
|
||||
}
|
||||
|
||||
itemFrequency := r.IntervalSeconds / int64(baseInterval.Seconds())
|
||||
offset := jitterHash(r, strategy) % uint64(itemFrequency)
|
||||
// Offset is always nonnegative and less than int64.max, because above we mod by itemFrequency which fits in the positive half of int64.
|
||||
// offset <= itemFrequency <= int64.max
|
||||
// So, this will not overflow and produce a negative offset.
|
||||
res := int64(offset)
|
||||
|
||||
// Regardless, take an absolute value anyway for an extra layer of safety in case the above logic ever changes.
|
||||
// Our contract requires that the result is nonnegative and less than int64.max.
|
||||
if res < 0 {
|
||||
return -res
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
func jitterHash(r *ngmodels.AlertRule, strategy JitterStrategy) uint64 {
|
||||
ls := data.Labels{
|
||||
"name": r.RuleGroup,
|
||||
"file": r.NamespaceUID,
|
||||
"orgId": fmt.Sprint(r.OrgID),
|
||||
}
|
||||
|
||||
if strategy == JitterByRule {
|
||||
ls["uid"] = r.UID
|
||||
}
|
||||
return uint64(ls.Fingerprint())
|
||||
}
|
||||
100
pkg/services/ngalert/schedule/jitter_test.go
Normal file
100
pkg/services/ngalert/schedule/jitter_test.go
Normal file
@@ -0,0 +1,100 @@
|
||||
package schedule
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestJitter(t *testing.T) {
|
||||
t.Run("when strategy is JitterNever", func(t *testing.T) {
|
||||
t.Run("offset is always zero", func(t *testing.T) {
|
||||
rules := createTestRules(100, ngmodels.WithIntervalBetween(10, 600))
|
||||
baseInterval := 10 * time.Second
|
||||
|
||||
for _, r := range rules {
|
||||
offset := jitterOffsetInTicks(r, baseInterval, JitterNever)
|
||||
require.Zero(t, offset, "unexpected offset, should be zero with jitter disabled; got %d", offset)
|
||||
}
|
||||
})
|
||||
})
|
||||
|
||||
t.Run("when strategy is JitterByGroup", func(t *testing.T) {
|
||||
t.Run("offset is stable for the same rule", func(t *testing.T) {
|
||||
rule := ngmodels.AlertRuleGen(ngmodels.WithIntervalBetween(10, 600))()
|
||||
baseInterval := 10 * time.Second
|
||||
original := jitterOffsetInTicks(rule, baseInterval, JitterByGroup)
|
||||
|
||||
for i := 0; i < 100; i++ {
|
||||
offset := jitterOffsetInTicks(rule, baseInterval, JitterByGroup)
|
||||
require.Equal(t, original, offset, "jitterOffsetInTicks should return the same value for the same rule")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("offset is on the interval [0, interval/baseInterval)", func(t *testing.T) {
|
||||
baseInterval := 10 * time.Second
|
||||
rules := createTestRules(1000, ngmodels.WithIntervalBetween(10, 600))
|
||||
|
||||
for _, r := range rules {
|
||||
offset := jitterOffsetInTicks(r, baseInterval, JitterByGroup)
|
||||
require.GreaterOrEqual(t, offset, int64(0), "offset cannot be negative, got %d for rule with interval %d", offset, r.IntervalSeconds)
|
||||
upperLimit := r.IntervalSeconds / int64(baseInterval.Seconds())
|
||||
require.Less(t, offset, upperLimit, "offset cannot be equal to or greater than interval/baseInterval of %d", upperLimit)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("offset for any rule in the same group is always the same", func(t *testing.T) {
|
||||
baseInterval := 10 * time.Second
|
||||
group1 := ngmodels.AlertRuleGroupKey{}
|
||||
group2 := ngmodels.AlertRuleGroupKey{}
|
||||
rules1 := createTestRules(1000, ngmodels.WithInterval(60*time.Second), ngmodels.WithGroupKey(group1))
|
||||
rules2 := createTestRules(1000, ngmodels.WithInterval(1*time.Hour), ngmodels.WithGroupKey(group2))
|
||||
|
||||
group1Offset := jitterOffsetInTicks(rules1[0], baseInterval, JitterByGroup)
|
||||
for _, r := range rules1 {
|
||||
offset := jitterOffsetInTicks(r, baseInterval, JitterByGroup)
|
||||
require.Equal(t, group1Offset, offset)
|
||||
}
|
||||
group2Offset := jitterOffsetInTicks(rules2[0], baseInterval, JitterByGroup)
|
||||
for _, r := range rules2 {
|
||||
offset := jitterOffsetInTicks(r, baseInterval, JitterByGroup)
|
||||
require.Equal(t, group2Offset, offset)
|
||||
}
|
||||
})
|
||||
})
|
||||
|
||||
t.Run("when strategy is JitterByRule", func(t *testing.T) {
|
||||
t.Run("offset is stable for the same rule", func(t *testing.T) {
|
||||
rule := ngmodels.AlertRuleGen(ngmodels.WithIntervalBetween(10, 600))()
|
||||
baseInterval := 10 * time.Second
|
||||
original := jitterOffsetInTicks(rule, baseInterval, JitterByRule)
|
||||
|
||||
for i := 0; i < 100; i++ {
|
||||
offset := jitterOffsetInTicks(rule, baseInterval, JitterByRule)
|
||||
require.Equal(t, original, offset, "jitterOffsetInTicks should return the same value for the same rule")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("offset is on the interval [0, interval/baseInterval)", func(t *testing.T) {
|
||||
baseInterval := 10 * time.Second
|
||||
rules := createTestRules(1000, ngmodels.WithIntervalBetween(10, 600))
|
||||
|
||||
for _, r := range rules {
|
||||
offset := jitterOffsetInTicks(r, baseInterval, JitterByRule)
|
||||
require.GreaterOrEqual(t, offset, int64(0), "offset cannot be negative, got %d for rule with interval %d", offset, r.IntervalSeconds)
|
||||
upperLimit := r.IntervalSeconds / int64(baseInterval.Seconds())
|
||||
require.Less(t, offset, upperLimit, "offset cannot be equal to or greater than interval/baseInterval of %d", upperLimit)
|
||||
}
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
func createTestRules(n int, mutators ...ngmodels.AlertRuleMutator) []*ngmodels.AlertRule {
|
||||
result := make([]*ngmodels.AlertRule, 0, n)
|
||||
for i := 0; i < n; i++ {
|
||||
result = append(result, ngmodels.AlertRuleGen(mutators...)())
|
||||
}
|
||||
return result
|
||||
}
|
||||
@@ -81,6 +81,7 @@ type schedule struct {
|
||||
|
||||
appURL *url.URL
|
||||
disableGrafanaFolder bool
|
||||
jitterEvaluations JitterStrategy
|
||||
|
||||
metrics *metrics.Scheduler
|
||||
|
||||
@@ -104,6 +105,7 @@ type SchedulerCfg struct {
|
||||
MinRuleInterval time.Duration
|
||||
DisableGrafanaFolder bool
|
||||
AppURL *url.URL
|
||||
JitterEvaluations JitterStrategy
|
||||
EvaluatorFactory eval.EvaluatorFactory
|
||||
RuleStore RulesStore
|
||||
Metrics *metrics.Scheduler
|
||||
@@ -131,6 +133,7 @@ func NewScheduler(cfg SchedulerCfg, stateManager *state.Manager) *schedule {
|
||||
metrics: cfg.Metrics,
|
||||
appURL: cfg.AppURL,
|
||||
disableGrafanaFolder: cfg.DisableGrafanaFolder,
|
||||
jitterEvaluations: cfg.JitterEvaluations,
|
||||
stateManager: stateManager,
|
||||
minRuleInterval: cfg.MinRuleInterval,
|
||||
schedulableAlertRules: alertRulesRegistry{rules: make(map[ngmodels.AlertRuleKey]*ngmodels.AlertRule)},
|
||||
@@ -293,7 +296,8 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
|
||||
}
|
||||
|
||||
itemFrequency := item.IntervalSeconds / int64(sch.baseInterval.Seconds())
|
||||
isReadyToRun := item.IntervalSeconds != 0 && tickNum%itemFrequency == 0
|
||||
offset := jitterOffsetInTicks(item, sch.baseInterval, sch.jitterEvaluations)
|
||||
isReadyToRun := item.IntervalSeconds != 0 && (tickNum%itemFrequency)-offset == 0
|
||||
|
||||
var folderTitle string
|
||||
if !sch.disableGrafanaFolder {
|
||||
@@ -306,6 +310,7 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
|
||||
}
|
||||
|
||||
if isReadyToRun {
|
||||
sch.log.Debug("Rule is ready to run on the current tick", "uid", item.UID, "tick", tickNum, "frequency", itemFrequency, "offset", offset)
|
||||
readyToRun = append(readyToRun, readyToRunItem{ruleInfo: ruleInfo, evaluation: evaluation{
|
||||
scheduledAt: tick,
|
||||
rule: item,
|
||||
|
||||
Reference in New Issue
Block a user