Alerting: Add setting to distribute rule group evaluations over time (#80766)

* Simple, per-base-interval jitter

* Add log just for test purposes

* Add strategy approach, allow choosing between group or rule

* Add flag to jitter rules

* Add second toggle for jittering within a group

* Wire up toggles to strategy

* Slightly improve comment ordering

* Add tests for offset generation

* Rename JitterStrategyFrom

* Improve debug log message

* Use grafana SDK labels rather than prometheus labels
This commit is contained in:
Alexander Weaver 2024-01-18 12:48:11 -06:00 committed by GitHub
parent 94c3be3b49
commit 00a260effa
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 216 additions and 1 deletions

View File

@ -174,6 +174,7 @@ Experimental features might be changed or removed without prior notice.
| `kubernetesFeatureToggles` | Use the kubernetes API for feature toggle management in the frontend |
| `enablePluginsTracingByDefault` | Enable plugin tracing for all external plugins |
| `newFolderPicker` | Enables the nested folder picker without having nested folders enabled |
| `jitterAlertRules` | Distributes alert rule evaluations more evenly over time, by rule group |
## Development feature toggles

View File

@ -176,4 +176,6 @@ export interface FeatureToggles {
cloudRBACRoles?: boolean;
alertingQueryOptimization?: boolean;
newFolderPicker?: boolean;
jitterAlertRules?: boolean;
jitterAlertRulesWithinGroups?: boolean;
}

View File

@ -1339,5 +1339,29 @@ var (
FrontendOnly: true,
Created: time.Date(2024, time.January, 12, 12, 0, 0, 0, time.UTC),
},
{
Name: "jitterAlertRules",
Description: "Distributes alert rule evaluations more evenly over time, by rule group",
FrontendOnly: false,
Stage: FeatureStageExperimental,
Owner: grafanaAlertingSquad,
AllowSelfServe: false,
HideFromDocs: false,
HideFromAdminPage: false,
RequiresRestart: true,
Created: time.Date(2024, time.January, 17, 12, 0, 0, 0, time.UTC),
},
{
Name: "jitterAlertRulesWithinGroups",
Description: "Distributes alert rule evaluations more evenly over time, including spreading out rules within the same group",
FrontendOnly: false,
Stage: FeatureStageExperimental,
Owner: grafanaAlertingSquad,
AllowSelfServe: false,
HideFromDocs: true,
HideFromAdminPage: false,
RequiresRestart: true,
Created: time.Date(2024, time.January, 17, 12, 0, 0, 0, time.UTC),
},
}
)

View File

@ -157,3 +157,5 @@ enablePluginsTracingByDefault,experimental,@grafana/plugins-platform-backend,202
cloudRBACRoles,experimental,@grafana/identity-access-team,2024-01-10,false,false,true,false
alertingQueryOptimization,GA,@grafana/alerting-squad,2024-01-10,false,false,false,false
newFolderPicker,experimental,@grafana/grafana-frontend-platform,2024-01-12,false,false,false,true
jitterAlertRules,experimental,@grafana/alerting-squad,2024-01-17,false,false,true,false
jitterAlertRulesWithinGroups,experimental,@grafana/alerting-squad,2024-01-17,false,false,true,false

1 Name Stage Owner Created requiresDevMode RequiresLicense RequiresRestart FrontendOnly
157 cloudRBACRoles experimental @grafana/identity-access-team 2024-01-10 false false true false
158 alertingQueryOptimization GA @grafana/alerting-squad 2024-01-10 false false false false
159 newFolderPicker experimental @grafana/grafana-frontend-platform 2024-01-12 false false false true
160 jitterAlertRules experimental @grafana/alerting-squad 2024-01-17 false false true false
161 jitterAlertRulesWithinGroups experimental @grafana/alerting-squad 2024-01-17 false false true false

View File

@ -638,4 +638,12 @@ const (
// FlagNewFolderPicker
// Enables the nested folder picker without having nested folders enabled
FlagNewFolderPicker = "newFolderPicker"
// FlagJitterAlertRules
// Distributes alert rule evaluations more evenly over time, by rule group
FlagJitterAlertRules = "jitterAlertRules"
// FlagJitterAlertRulesWithinGroups
// Distributes alert rule evaluations more evenly over time, including spreading out rules within the same group
FlagJitterAlertRulesWithinGroups = "jitterAlertRulesWithinGroups"
)

View File

@ -186,6 +186,12 @@ func WithInterval(interval time.Duration) AlertRuleMutator {
}
}
func WithIntervalBetween(min, max int64) AlertRuleMutator {
return func(rule *AlertRule) {
rule.IntervalSeconds = rand.Int63n(max-min) + min
}
}
func WithTitle(title string) AlertRuleMutator {
return func(rule *AlertRule) {
rule.Title = title

View File

@ -266,6 +266,7 @@ func (ng *AlertNG) init() error {
BaseInterval: ng.Cfg.UnifiedAlerting.BaseInterval,
MinRuleInterval: ng.Cfg.UnifiedAlerting.MinInterval,
DisableGrafanaFolder: ng.Cfg.UnifiedAlerting.ReservedLabels.IsReservedLabelDisabled(models.FolderTitleLabel),
JitterEvaluations: schedule.JitterStrategyFrom(ng.FeatureToggles),
AppURL: appUrl,
EvaluatorFactory: evalFactory,
RuleStore: ng.store,

View File

@ -0,0 +1,66 @@
package schedule
import (
"fmt"
"time"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/grafana/grafana/pkg/services/featuremgmt"
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
)
// JitterStrategy represents a modifier to alert rule timing that affects how evaluations are distributed.
type JitterStrategy int
const (
JitterNever JitterStrategy = iota
JitterByGroup
JitterByRule
)
// JitterStrategyFrom returns the JitterStrategy indicated by the current Grafana feature toggles.
func JitterStrategyFrom(toggles featuremgmt.FeatureToggles) JitterStrategy {
strategy := JitterNever
if toggles.IsEnabledGlobally(featuremgmt.FlagJitterAlertRules) {
strategy = JitterByGroup
}
if toggles.IsEnabledGlobally(featuremgmt.FlagJitterAlertRulesWithinGroups) {
strategy = JitterByRule
}
return strategy
}
// jitterOffsetInTicks gives the jitter offset for a rule, in terms of a number of ticks relative to its interval and a base interval.
// The resulting number of ticks is non-negative. We assume the rule is well-formed and has an IntervalSeconds greater to or equal than baseInterval.
func jitterOffsetInTicks(r *ngmodels.AlertRule, baseInterval time.Duration, strategy JitterStrategy) int64 {
if strategy == JitterNever {
return 0
}
itemFrequency := r.IntervalSeconds / int64(baseInterval.Seconds())
offset := jitterHash(r, strategy) % uint64(itemFrequency)
// Offset is always nonnegative and less than int64.max, because above we mod by itemFrequency which fits in the positive half of int64.
// offset <= itemFrequency <= int64.max
// So, this will not overflow and produce a negative offset.
res := int64(offset)
// Regardless, take an absolute value anyway for an extra layer of safety in case the above logic ever changes.
// Our contract requires that the result is nonnegative and less than int64.max.
if res < 0 {
return -res
}
return res
}
func jitterHash(r *ngmodels.AlertRule, strategy JitterStrategy) uint64 {
ls := data.Labels{
"name": r.RuleGroup,
"file": r.NamespaceUID,
"orgId": fmt.Sprint(r.OrgID),
}
if strategy == JitterByRule {
ls["uid"] = r.UID
}
return uint64(ls.Fingerprint())
}

View File

@ -0,0 +1,100 @@
package schedule
import (
"testing"
"time"
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/stretchr/testify/require"
)
func TestJitter(t *testing.T) {
t.Run("when strategy is JitterNever", func(t *testing.T) {
t.Run("offset is always zero", func(t *testing.T) {
rules := createTestRules(100, ngmodels.WithIntervalBetween(10, 600))
baseInterval := 10 * time.Second
for _, r := range rules {
offset := jitterOffsetInTicks(r, baseInterval, JitterNever)
require.Zero(t, offset, "unexpected offset, should be zero with jitter disabled; got %d", offset)
}
})
})
t.Run("when strategy is JitterByGroup", func(t *testing.T) {
t.Run("offset is stable for the same rule", func(t *testing.T) {
rule := ngmodels.AlertRuleGen(ngmodels.WithIntervalBetween(10, 600))()
baseInterval := 10 * time.Second
original := jitterOffsetInTicks(rule, baseInterval, JitterByGroup)
for i := 0; i < 100; i++ {
offset := jitterOffsetInTicks(rule, baseInterval, JitterByGroup)
require.Equal(t, original, offset, "jitterOffsetInTicks should return the same value for the same rule")
}
})
t.Run("offset is on the interval [0, interval/baseInterval)", func(t *testing.T) {
baseInterval := 10 * time.Second
rules := createTestRules(1000, ngmodels.WithIntervalBetween(10, 600))
for _, r := range rules {
offset := jitterOffsetInTicks(r, baseInterval, JitterByGroup)
require.GreaterOrEqual(t, offset, int64(0), "offset cannot be negative, got %d for rule with interval %d", offset, r.IntervalSeconds)
upperLimit := r.IntervalSeconds / int64(baseInterval.Seconds())
require.Less(t, offset, upperLimit, "offset cannot be equal to or greater than interval/baseInterval of %d", upperLimit)
}
})
t.Run("offset for any rule in the same group is always the same", func(t *testing.T) {
baseInterval := 10 * time.Second
group1 := ngmodels.AlertRuleGroupKey{}
group2 := ngmodels.AlertRuleGroupKey{}
rules1 := createTestRules(1000, ngmodels.WithInterval(60*time.Second), ngmodels.WithGroupKey(group1))
rules2 := createTestRules(1000, ngmodels.WithInterval(1*time.Hour), ngmodels.WithGroupKey(group2))
group1Offset := jitterOffsetInTicks(rules1[0], baseInterval, JitterByGroup)
for _, r := range rules1 {
offset := jitterOffsetInTicks(r, baseInterval, JitterByGroup)
require.Equal(t, group1Offset, offset)
}
group2Offset := jitterOffsetInTicks(rules2[0], baseInterval, JitterByGroup)
for _, r := range rules2 {
offset := jitterOffsetInTicks(r, baseInterval, JitterByGroup)
require.Equal(t, group2Offset, offset)
}
})
})
t.Run("when strategy is JitterByRule", func(t *testing.T) {
t.Run("offset is stable for the same rule", func(t *testing.T) {
rule := ngmodels.AlertRuleGen(ngmodels.WithIntervalBetween(10, 600))()
baseInterval := 10 * time.Second
original := jitterOffsetInTicks(rule, baseInterval, JitterByRule)
for i := 0; i < 100; i++ {
offset := jitterOffsetInTicks(rule, baseInterval, JitterByRule)
require.Equal(t, original, offset, "jitterOffsetInTicks should return the same value for the same rule")
}
})
t.Run("offset is on the interval [0, interval/baseInterval)", func(t *testing.T) {
baseInterval := 10 * time.Second
rules := createTestRules(1000, ngmodels.WithIntervalBetween(10, 600))
for _, r := range rules {
offset := jitterOffsetInTicks(r, baseInterval, JitterByRule)
require.GreaterOrEqual(t, offset, int64(0), "offset cannot be negative, got %d for rule with interval %d", offset, r.IntervalSeconds)
upperLimit := r.IntervalSeconds / int64(baseInterval.Seconds())
require.Less(t, offset, upperLimit, "offset cannot be equal to or greater than interval/baseInterval of %d", upperLimit)
}
})
})
}
func createTestRules(n int, mutators ...ngmodels.AlertRuleMutator) []*ngmodels.AlertRule {
result := make([]*ngmodels.AlertRule, 0, n)
for i := 0; i < n; i++ {
result = append(result, ngmodels.AlertRuleGen(mutators...)())
}
return result
}

View File

@ -81,6 +81,7 @@ type schedule struct {
appURL *url.URL
disableGrafanaFolder bool
jitterEvaluations JitterStrategy
metrics *metrics.Scheduler
@ -104,6 +105,7 @@ type SchedulerCfg struct {
MinRuleInterval time.Duration
DisableGrafanaFolder bool
AppURL *url.URL
JitterEvaluations JitterStrategy
EvaluatorFactory eval.EvaluatorFactory
RuleStore RulesStore
Metrics *metrics.Scheduler
@ -131,6 +133,7 @@ func NewScheduler(cfg SchedulerCfg, stateManager *state.Manager) *schedule {
metrics: cfg.Metrics,
appURL: cfg.AppURL,
disableGrafanaFolder: cfg.DisableGrafanaFolder,
jitterEvaluations: cfg.JitterEvaluations,
stateManager: stateManager,
minRuleInterval: cfg.MinRuleInterval,
schedulableAlertRules: alertRulesRegistry{rules: make(map[ngmodels.AlertRuleKey]*ngmodels.AlertRule)},
@ -293,7 +296,8 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
}
itemFrequency := item.IntervalSeconds / int64(sch.baseInterval.Seconds())
isReadyToRun := item.IntervalSeconds != 0 && tickNum%itemFrequency == 0
offset := jitterOffsetInTicks(item, sch.baseInterval, sch.jitterEvaluations)
isReadyToRun := item.IntervalSeconds != 0 && (tickNum%itemFrequency)-offset == 0
var folderTitle string
if !sch.disableGrafanaFolder {
@ -306,6 +310,7 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
}
if isReadyToRun {
sch.log.Debug("Rule is ready to run on the current tick", "uid", item.UID, "tick", tickNum, "frequency", itemFrequency, "offset", offset)
readyToRun = append(readyToRun, readyToRunItem{ruleInfo: ruleInfo, evaluation: evaluation{
scheduledAt: tick,
rule: item,