From 00a260effab802edc8f72df50bfb6447aac343f0 Mon Sep 17 00:00:00 2001 From: Alexander Weaver Date: Thu, 18 Jan 2024 12:48:11 -0600 Subject: [PATCH] Alerting: Add setting to distribute rule group evaluations over time (#80766) * Simple, per-base-interval jitter * Add log just for test purposes * Add strategy approach, allow choosing between group or rule * Add flag to jitter rules * Add second toggle for jittering within a group * Wire up toggles to strategy * Slightly improve comment ordering * Add tests for offset generation * Rename JitterStrategyFrom * Improve debug log message * Use grafana SDK labels rather than prometheus labels --- .../feature-toggles/index.md | 1 + .../src/types/featureToggles.gen.ts | 2 + pkg/services/featuremgmt/registry.go | 24 +++++ pkg/services/featuremgmt/toggles_gen.csv | 2 + pkg/services/featuremgmt/toggles_gen.go | 8 ++ pkg/services/ngalert/models/testing.go | 6 ++ pkg/services/ngalert/ngalert.go | 1 + pkg/services/ngalert/schedule/jitter.go | 66 ++++++++++++ pkg/services/ngalert/schedule/jitter_test.go | 100 ++++++++++++++++++ pkg/services/ngalert/schedule/schedule.go | 7 +- 10 files changed, 216 insertions(+), 1 deletion(-) create mode 100644 pkg/services/ngalert/schedule/jitter.go create mode 100644 pkg/services/ngalert/schedule/jitter_test.go diff --git a/docs/sources/setup-grafana/configure-grafana/feature-toggles/index.md b/docs/sources/setup-grafana/configure-grafana/feature-toggles/index.md index 1fe442a3761..17fbbb7e79f 100644 --- a/docs/sources/setup-grafana/configure-grafana/feature-toggles/index.md +++ b/docs/sources/setup-grafana/configure-grafana/feature-toggles/index.md @@ -174,6 +174,7 @@ Experimental features might be changed or removed without prior notice. | `kubernetesFeatureToggles` | Use the kubernetes API for feature toggle management in the frontend | | `enablePluginsTracingByDefault` | Enable plugin tracing for all external plugins | | `newFolderPicker` | Enables the nested folder picker without having nested folders enabled | +| `jitterAlertRules` | Distributes alert rule evaluations more evenly over time, by rule group | ## Development feature toggles diff --git a/packages/grafana-data/src/types/featureToggles.gen.ts b/packages/grafana-data/src/types/featureToggles.gen.ts index 30cd52b017a..9891545e5f3 100644 --- a/packages/grafana-data/src/types/featureToggles.gen.ts +++ b/packages/grafana-data/src/types/featureToggles.gen.ts @@ -176,4 +176,6 @@ export interface FeatureToggles { cloudRBACRoles?: boolean; alertingQueryOptimization?: boolean; newFolderPicker?: boolean; + jitterAlertRules?: boolean; + jitterAlertRulesWithinGroups?: boolean; } diff --git a/pkg/services/featuremgmt/registry.go b/pkg/services/featuremgmt/registry.go index 7115a48bd46..7ce43ecce8a 100644 --- a/pkg/services/featuremgmt/registry.go +++ b/pkg/services/featuremgmt/registry.go @@ -1339,5 +1339,29 @@ var ( FrontendOnly: true, Created: time.Date(2024, time.January, 12, 12, 0, 0, 0, time.UTC), }, + { + Name: "jitterAlertRules", + Description: "Distributes alert rule evaluations more evenly over time, by rule group", + FrontendOnly: false, + Stage: FeatureStageExperimental, + Owner: grafanaAlertingSquad, + AllowSelfServe: false, + HideFromDocs: false, + HideFromAdminPage: false, + RequiresRestart: true, + Created: time.Date(2024, time.January, 17, 12, 0, 0, 0, time.UTC), + }, + { + Name: "jitterAlertRulesWithinGroups", + Description: "Distributes alert rule evaluations more evenly over time, including spreading out rules within the same group", + FrontendOnly: false, + Stage: FeatureStageExperimental, + Owner: grafanaAlertingSquad, + AllowSelfServe: false, + HideFromDocs: true, + HideFromAdminPage: false, + RequiresRestart: true, + Created: time.Date(2024, time.January, 17, 12, 0, 0, 0, time.UTC), + }, } ) diff --git a/pkg/services/featuremgmt/toggles_gen.csv b/pkg/services/featuremgmt/toggles_gen.csv index d3a8c88e029..5c16412b32a 100644 --- a/pkg/services/featuremgmt/toggles_gen.csv +++ b/pkg/services/featuremgmt/toggles_gen.csv @@ -157,3 +157,5 @@ enablePluginsTracingByDefault,experimental,@grafana/plugins-platform-backend,202 cloudRBACRoles,experimental,@grafana/identity-access-team,2024-01-10,false,false,true,false alertingQueryOptimization,GA,@grafana/alerting-squad,2024-01-10,false,false,false,false newFolderPicker,experimental,@grafana/grafana-frontend-platform,2024-01-12,false,false,false,true +jitterAlertRules,experimental,@grafana/alerting-squad,2024-01-17,false,false,true,false +jitterAlertRulesWithinGroups,experimental,@grafana/alerting-squad,2024-01-17,false,false,true,false diff --git a/pkg/services/featuremgmt/toggles_gen.go b/pkg/services/featuremgmt/toggles_gen.go index fbd8775a7b4..94df72a85f4 100644 --- a/pkg/services/featuremgmt/toggles_gen.go +++ b/pkg/services/featuremgmt/toggles_gen.go @@ -638,4 +638,12 @@ const ( // FlagNewFolderPicker // Enables the nested folder picker without having nested folders enabled FlagNewFolderPicker = "newFolderPicker" + + // FlagJitterAlertRules + // Distributes alert rule evaluations more evenly over time, by rule group + FlagJitterAlertRules = "jitterAlertRules" + + // FlagJitterAlertRulesWithinGroups + // Distributes alert rule evaluations more evenly over time, including spreading out rules within the same group + FlagJitterAlertRulesWithinGroups = "jitterAlertRulesWithinGroups" ) diff --git a/pkg/services/ngalert/models/testing.go b/pkg/services/ngalert/models/testing.go index f81ee01cc41..50ee3e4f60b 100644 --- a/pkg/services/ngalert/models/testing.go +++ b/pkg/services/ngalert/models/testing.go @@ -186,6 +186,12 @@ func WithInterval(interval time.Duration) AlertRuleMutator { } } +func WithIntervalBetween(min, max int64) AlertRuleMutator { + return func(rule *AlertRule) { + rule.IntervalSeconds = rand.Int63n(max-min) + min + } +} + func WithTitle(title string) AlertRuleMutator { return func(rule *AlertRule) { rule.Title = title diff --git a/pkg/services/ngalert/ngalert.go b/pkg/services/ngalert/ngalert.go index a54db4db1d0..09e8412ff29 100644 --- a/pkg/services/ngalert/ngalert.go +++ b/pkg/services/ngalert/ngalert.go @@ -266,6 +266,7 @@ func (ng *AlertNG) init() error { BaseInterval: ng.Cfg.UnifiedAlerting.BaseInterval, MinRuleInterval: ng.Cfg.UnifiedAlerting.MinInterval, DisableGrafanaFolder: ng.Cfg.UnifiedAlerting.ReservedLabels.IsReservedLabelDisabled(models.FolderTitleLabel), + JitterEvaluations: schedule.JitterStrategyFrom(ng.FeatureToggles), AppURL: appUrl, EvaluatorFactory: evalFactory, RuleStore: ng.store, diff --git a/pkg/services/ngalert/schedule/jitter.go b/pkg/services/ngalert/schedule/jitter.go new file mode 100644 index 00000000000..127272c950a --- /dev/null +++ b/pkg/services/ngalert/schedule/jitter.go @@ -0,0 +1,66 @@ +package schedule + +import ( + "fmt" + "time" + + "github.com/grafana/grafana-plugin-sdk-go/data" + "github.com/grafana/grafana/pkg/services/featuremgmt" + ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" +) + +// JitterStrategy represents a modifier to alert rule timing that affects how evaluations are distributed. +type JitterStrategy int + +const ( + JitterNever JitterStrategy = iota + JitterByGroup + JitterByRule +) + +// JitterStrategyFrom returns the JitterStrategy indicated by the current Grafana feature toggles. +func JitterStrategyFrom(toggles featuremgmt.FeatureToggles) JitterStrategy { + strategy := JitterNever + if toggles.IsEnabledGlobally(featuremgmt.FlagJitterAlertRules) { + strategy = JitterByGroup + } + if toggles.IsEnabledGlobally(featuremgmt.FlagJitterAlertRulesWithinGroups) { + strategy = JitterByRule + } + return strategy +} + +// jitterOffsetInTicks gives the jitter offset for a rule, in terms of a number of ticks relative to its interval and a base interval. +// The resulting number of ticks is non-negative. We assume the rule is well-formed and has an IntervalSeconds greater to or equal than baseInterval. +func jitterOffsetInTicks(r *ngmodels.AlertRule, baseInterval time.Duration, strategy JitterStrategy) int64 { + if strategy == JitterNever { + return 0 + } + + itemFrequency := r.IntervalSeconds / int64(baseInterval.Seconds()) + offset := jitterHash(r, strategy) % uint64(itemFrequency) + // Offset is always nonnegative and less than int64.max, because above we mod by itemFrequency which fits in the positive half of int64. + // offset <= itemFrequency <= int64.max + // So, this will not overflow and produce a negative offset. + res := int64(offset) + + // Regardless, take an absolute value anyway for an extra layer of safety in case the above logic ever changes. + // Our contract requires that the result is nonnegative and less than int64.max. + if res < 0 { + return -res + } + return res +} + +func jitterHash(r *ngmodels.AlertRule, strategy JitterStrategy) uint64 { + ls := data.Labels{ + "name": r.RuleGroup, + "file": r.NamespaceUID, + "orgId": fmt.Sprint(r.OrgID), + } + + if strategy == JitterByRule { + ls["uid"] = r.UID + } + return uint64(ls.Fingerprint()) +} diff --git a/pkg/services/ngalert/schedule/jitter_test.go b/pkg/services/ngalert/schedule/jitter_test.go new file mode 100644 index 00000000000..ae7ead86fe7 --- /dev/null +++ b/pkg/services/ngalert/schedule/jitter_test.go @@ -0,0 +1,100 @@ +package schedule + +import ( + "testing" + "time" + + ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" + "github.com/stretchr/testify/require" +) + +func TestJitter(t *testing.T) { + t.Run("when strategy is JitterNever", func(t *testing.T) { + t.Run("offset is always zero", func(t *testing.T) { + rules := createTestRules(100, ngmodels.WithIntervalBetween(10, 600)) + baseInterval := 10 * time.Second + + for _, r := range rules { + offset := jitterOffsetInTicks(r, baseInterval, JitterNever) + require.Zero(t, offset, "unexpected offset, should be zero with jitter disabled; got %d", offset) + } + }) + }) + + t.Run("when strategy is JitterByGroup", func(t *testing.T) { + t.Run("offset is stable for the same rule", func(t *testing.T) { + rule := ngmodels.AlertRuleGen(ngmodels.WithIntervalBetween(10, 600))() + baseInterval := 10 * time.Second + original := jitterOffsetInTicks(rule, baseInterval, JitterByGroup) + + for i := 0; i < 100; i++ { + offset := jitterOffsetInTicks(rule, baseInterval, JitterByGroup) + require.Equal(t, original, offset, "jitterOffsetInTicks should return the same value for the same rule") + } + }) + + t.Run("offset is on the interval [0, interval/baseInterval)", func(t *testing.T) { + baseInterval := 10 * time.Second + rules := createTestRules(1000, ngmodels.WithIntervalBetween(10, 600)) + + for _, r := range rules { + offset := jitterOffsetInTicks(r, baseInterval, JitterByGroup) + require.GreaterOrEqual(t, offset, int64(0), "offset cannot be negative, got %d for rule with interval %d", offset, r.IntervalSeconds) + upperLimit := r.IntervalSeconds / int64(baseInterval.Seconds()) + require.Less(t, offset, upperLimit, "offset cannot be equal to or greater than interval/baseInterval of %d", upperLimit) + } + }) + + t.Run("offset for any rule in the same group is always the same", func(t *testing.T) { + baseInterval := 10 * time.Second + group1 := ngmodels.AlertRuleGroupKey{} + group2 := ngmodels.AlertRuleGroupKey{} + rules1 := createTestRules(1000, ngmodels.WithInterval(60*time.Second), ngmodels.WithGroupKey(group1)) + rules2 := createTestRules(1000, ngmodels.WithInterval(1*time.Hour), ngmodels.WithGroupKey(group2)) + + group1Offset := jitterOffsetInTicks(rules1[0], baseInterval, JitterByGroup) + for _, r := range rules1 { + offset := jitterOffsetInTicks(r, baseInterval, JitterByGroup) + require.Equal(t, group1Offset, offset) + } + group2Offset := jitterOffsetInTicks(rules2[0], baseInterval, JitterByGroup) + for _, r := range rules2 { + offset := jitterOffsetInTicks(r, baseInterval, JitterByGroup) + require.Equal(t, group2Offset, offset) + } + }) + }) + + t.Run("when strategy is JitterByRule", func(t *testing.T) { + t.Run("offset is stable for the same rule", func(t *testing.T) { + rule := ngmodels.AlertRuleGen(ngmodels.WithIntervalBetween(10, 600))() + baseInterval := 10 * time.Second + original := jitterOffsetInTicks(rule, baseInterval, JitterByRule) + + for i := 0; i < 100; i++ { + offset := jitterOffsetInTicks(rule, baseInterval, JitterByRule) + require.Equal(t, original, offset, "jitterOffsetInTicks should return the same value for the same rule") + } + }) + + t.Run("offset is on the interval [0, interval/baseInterval)", func(t *testing.T) { + baseInterval := 10 * time.Second + rules := createTestRules(1000, ngmodels.WithIntervalBetween(10, 600)) + + for _, r := range rules { + offset := jitterOffsetInTicks(r, baseInterval, JitterByRule) + require.GreaterOrEqual(t, offset, int64(0), "offset cannot be negative, got %d for rule with interval %d", offset, r.IntervalSeconds) + upperLimit := r.IntervalSeconds / int64(baseInterval.Seconds()) + require.Less(t, offset, upperLimit, "offset cannot be equal to or greater than interval/baseInterval of %d", upperLimit) + } + }) + }) +} + +func createTestRules(n int, mutators ...ngmodels.AlertRuleMutator) []*ngmodels.AlertRule { + result := make([]*ngmodels.AlertRule, 0, n) + for i := 0; i < n; i++ { + result = append(result, ngmodels.AlertRuleGen(mutators...)()) + } + return result +} diff --git a/pkg/services/ngalert/schedule/schedule.go b/pkg/services/ngalert/schedule/schedule.go index 0104d3bcdad..98e947b34b9 100644 --- a/pkg/services/ngalert/schedule/schedule.go +++ b/pkg/services/ngalert/schedule/schedule.go @@ -81,6 +81,7 @@ type schedule struct { appURL *url.URL disableGrafanaFolder bool + jitterEvaluations JitterStrategy metrics *metrics.Scheduler @@ -104,6 +105,7 @@ type SchedulerCfg struct { MinRuleInterval time.Duration DisableGrafanaFolder bool AppURL *url.URL + JitterEvaluations JitterStrategy EvaluatorFactory eval.EvaluatorFactory RuleStore RulesStore Metrics *metrics.Scheduler @@ -131,6 +133,7 @@ func NewScheduler(cfg SchedulerCfg, stateManager *state.Manager) *schedule { metrics: cfg.Metrics, appURL: cfg.AppURL, disableGrafanaFolder: cfg.DisableGrafanaFolder, + jitterEvaluations: cfg.JitterEvaluations, stateManager: stateManager, minRuleInterval: cfg.MinRuleInterval, schedulableAlertRules: alertRulesRegistry{rules: make(map[ngmodels.AlertRuleKey]*ngmodels.AlertRule)}, @@ -293,7 +296,8 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup. } itemFrequency := item.IntervalSeconds / int64(sch.baseInterval.Seconds()) - isReadyToRun := item.IntervalSeconds != 0 && tickNum%itemFrequency == 0 + offset := jitterOffsetInTicks(item, sch.baseInterval, sch.jitterEvaluations) + isReadyToRun := item.IntervalSeconds != 0 && (tickNum%itemFrequency)-offset == 0 var folderTitle string if !sch.disableGrafanaFolder { @@ -306,6 +310,7 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup. } if isReadyToRun { + sch.log.Debug("Rule is ready to run on the current tick", "uid", item.UID, "tick", tickNum, "frequency", itemFrequency, "offset", offset) readyToRun = append(readyToRun, readyToRunItem{ruleInfo: ruleInfo, evaluation: evaluation{ scheduledAt: tick, rule: item,