Alerting: Add setting to distribute rule group evaluations over time (#80766)
* Simple, per-base-interval jitter
* Add log just for test purposes
* Add strategy approach, allow choosing between group or rule
* Add flag to jitter rules
* Add second toggle for jittering within a group
* Wire up toggles to strategy
* Slightly improve comment ordering
* Add tests for offset generation
* Rename JitterStrategyFrom
* Improve debug log message
* Use grafana SDK labels rather than prometheus labels
parent: 94c3be3b49
commit: 00a260effa
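Before the file-by-file diff, here is a minimal standalone sketch of the idea the commit message describes. The group names and rule values are invented for illustration, and FNV stands in for the SDK's data.Labels.Fingerprint that the real pkg/services/ngalert/schedule/jitter.go below uses: each rule's evaluation is shifted by a deterministic, hash-derived number of base-interval ticks, so evaluations spread across the interval instead of piling up on the same tick.

package main

import (
	"fmt"
	"hash/fnv"
)

// offsetInTicks mimics the jitter computation: hash a stable key, then take it
// modulo the number of base ticks that fit in the rule's evaluation interval.
// FNV is a stand-in for the label fingerprinting used by the real code.
func offsetInTicks(key string, ruleIntervalSeconds, baseIntervalSeconds int64) int64 {
	h := fnv.New64a()
	h.Write([]byte(key))
	ticksPerInterval := ruleIntervalSeconds / baseIntervalSeconds
	return int64(h.Sum64() % uint64(ticksPerInterval))
}

func main() {
	// Three hypothetical rule groups, all evaluated every 60s on a 10s base interval:
	// each gets a deterministic tick out of the 6 available (collisions are
	// possible, but the spread improves on average).
	for _, group := range []string{"cpu-alerts", "disk-alerts", "latency-alerts"} {
		fmt.Printf("%s -> first evaluation on tick %d of 6\n", group, offsetInTicks(group, 60, 10))
	}
}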
@@ -174,6 +174,7 @@ Experimental features might be changed or removed without prior notice.
 | `kubernetesFeatureToggles`      | Use the kubernetes API for feature toggle management in the frontend    |
 | `enablePluginsTracingByDefault` | Enable plugin tracing for all external plugins                           |
 | `newFolderPicker`               | Enables the nested folder picker without having nested folders enabled  |
+| `jitterAlertRules`              | Distributes alert rule evaluations more evenly over time, by rule group |

 ## Development feature toggles
@@ -176,4 +176,6 @@ export interface FeatureToggles {
   cloudRBACRoles?: boolean;
   alertingQueryOptimization?: boolean;
   newFolderPicker?: boolean;
+  jitterAlertRules?: boolean;
+  jitterAlertRulesWithinGroups?: boolean;
 }
@@ -1339,5 +1339,29 @@
 		FrontendOnly: true,
 		Created:      time.Date(2024, time.January, 12, 12, 0, 0, 0, time.UTC),
 	},
+	{
+		Name:              "jitterAlertRules",
+		Description:       "Distributes alert rule evaluations more evenly over time, by rule group",
+		FrontendOnly:      false,
+		Stage:             FeatureStageExperimental,
+		Owner:             grafanaAlertingSquad,
+		AllowSelfServe:    false,
+		HideFromDocs:      false,
+		HideFromAdminPage: false,
+		RequiresRestart:   true,
+		Created:           time.Date(2024, time.January, 17, 12, 0, 0, 0, time.UTC),
+	},
+	{
+		Name:              "jitterAlertRulesWithinGroups",
+		Description:       "Distributes alert rule evaluations more evenly over time, including spreading out rules within the same group",
+		FrontendOnly:      false,
+		Stage:             FeatureStageExperimental,
+		Owner:             grafanaAlertingSquad,
+		AllowSelfServe:    false,
+		HideFromDocs:      true,
+		HideFromAdminPage: false,
+		RequiresRestart:   true,
+		Created:           time.Date(2024, time.January, 17, 12, 0, 0, 0, time.UTC),
+	},
 }
 )
@@ -157,3 +157,5 @@ enablePluginsTracingByDefault,experimental,@grafana/plugins-platform-backend,202
 cloudRBACRoles,experimental,@grafana/identity-access-team,2024-01-10,false,false,true,false
 alertingQueryOptimization,GA,@grafana/alerting-squad,2024-01-10,false,false,false,false
 newFolderPicker,experimental,@grafana/grafana-frontend-platform,2024-01-12,false,false,false,true
+jitterAlertRules,experimental,@grafana/alerting-squad,2024-01-17,false,false,true,false
+jitterAlertRulesWithinGroups,experimental,@grafana/alerting-squad,2024-01-17,false,false,true,false
@@ -638,4 +638,12 @@ const (
	// FlagNewFolderPicker
	// Enables the nested folder picker without having nested folders enabled
	FlagNewFolderPicker = "newFolderPicker"
+
+	// FlagJitterAlertRules
+	// Distributes alert rule evaluations more evenly over time, by rule group
+	FlagJitterAlertRules = "jitterAlertRules"
+
+	// FlagJitterAlertRulesWithinGroups
+	// Distributes alert rule evaluations more evenly over time, including spreading out rules within the same group
+	FlagJitterAlertRulesWithinGroups = "jitterAlertRulesWithinGroups"
 )
@@ -186,6 +186,12 @@ func WithInterval(interval time.Duration) AlertRuleMutator {
 	}
 }

+func WithIntervalBetween(min, max int64) AlertRuleMutator {
+	return func(rule *AlertRule) {
+		rule.IntervalSeconds = rand.Int63n(max-min) + min
+	}
+}
+
 func WithTitle(title string) AlertRuleMutator {
 	return func(rule *AlertRule) {
 		rule.Title = title
@@ -266,6 +266,7 @@ func (ng *AlertNG) init() error {
 		BaseInterval:         ng.Cfg.UnifiedAlerting.BaseInterval,
 		MinRuleInterval:      ng.Cfg.UnifiedAlerting.MinInterval,
 		DisableGrafanaFolder: ng.Cfg.UnifiedAlerting.ReservedLabels.IsReservedLabelDisabled(models.FolderTitleLabel),
+		JitterEvaluations:    schedule.JitterStrategyFrom(ng.FeatureToggles),
 		AppURL:               appUrl,
 		EvaluatorFactory:     evalFactory,
 		RuleStore:            ng.store,
pkg/services/ngalert/schedule/jitter.go (new file, 66 lines)
@@ -0,0 +1,66 @@
package schedule

import (
	"fmt"
	"time"

	"github.com/grafana/grafana-plugin-sdk-go/data"
	"github.com/grafana/grafana/pkg/services/featuremgmt"
	ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
)

// JitterStrategy represents a modifier to alert rule timing that affects how evaluations are distributed.
type JitterStrategy int

const (
	JitterNever JitterStrategy = iota
	JitterByGroup
	JitterByRule
)

// JitterStrategyFrom returns the JitterStrategy indicated by the current Grafana feature toggles.
func JitterStrategyFrom(toggles featuremgmt.FeatureToggles) JitterStrategy {
	strategy := JitterNever
	if toggles.IsEnabledGlobally(featuremgmt.FlagJitterAlertRules) {
		strategy = JitterByGroup
	}
	if toggles.IsEnabledGlobally(featuremgmt.FlagJitterAlertRulesWithinGroups) {
		strategy = JitterByRule
	}
	return strategy
}

// jitterOffsetInTicks gives the jitter offset for a rule, in terms of a number of ticks relative to its interval and a base interval.
// The resulting number of ticks is non-negative. We assume the rule is well-formed and has an IntervalSeconds greater than or equal to baseInterval.
func jitterOffsetInTicks(r *ngmodels.AlertRule, baseInterval time.Duration, strategy JitterStrategy) int64 {
	if strategy == JitterNever {
		return 0
	}

	itemFrequency := r.IntervalSeconds / int64(baseInterval.Seconds())
	offset := jitterHash(r, strategy) % uint64(itemFrequency)
	// Offset is always nonnegative and less than int64.max, because above we mod by itemFrequency, which fits in the positive half of int64:
	// offset < itemFrequency <= int64.max
	// So, this will not overflow and produce a negative offset.
	res := int64(offset)

	// Regardless, take an absolute value anyway for an extra layer of safety in case the above logic ever changes.
	// Our contract requires that the result is nonnegative and less than int64.max.
	if res < 0 {
		return -res
	}
	return res
}

func jitterHash(r *ngmodels.AlertRule, strategy JitterStrategy) uint64 {
	ls := data.Labels{
		"name":  r.RuleGroup,
		"file":  r.NamespaceUID,
		"orgId": fmt.Sprint(r.OrgID),
	}

	if strategy == JitterByRule {
		ls["uid"] = r.UID
	}
	return uint64(ls.Fingerprint())
}
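A worked reading of jitterOffsetInTicks' contract, with an invented fingerprint value standing in for jitterHash:

package main

import "fmt"

func main() {
	// Hypothetical rule: IntervalSeconds = 300 evaluated against a 10s base interval.
	// itemFrequency = 300/10 = 30, so the offset must land in [0, 30).
	fingerprint := uint64(987654321) // invented; stands in for jitterHash(r, strategy)
	itemFrequency := int64(30)
	offset := int64(fingerprint % uint64(itemFrequency))
	fmt.Println(offset) // 987654321 % 30 = 21
}

Note also that because the JitterByRule check in JitterStrategyFrom runs last, enabling both feature toggles results in JitterByRule.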
pkg/services/ngalert/schedule/jitter_test.go (new file, 100 lines)
@@ -0,0 +1,100 @@
package schedule

import (
	"testing"
	"time"

	ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
	"github.com/stretchr/testify/require"
)

func TestJitter(t *testing.T) {
	t.Run("when strategy is JitterNever", func(t *testing.T) {
		t.Run("offset is always zero", func(t *testing.T) {
			rules := createTestRules(100, ngmodels.WithIntervalBetween(10, 600))
			baseInterval := 10 * time.Second

			for _, r := range rules {
				offset := jitterOffsetInTicks(r, baseInterval, JitterNever)
				require.Zero(t, offset, "unexpected offset, should be zero with jitter disabled; got %d", offset)
			}
		})
	})

	t.Run("when strategy is JitterByGroup", func(t *testing.T) {
		t.Run("offset is stable for the same rule", func(t *testing.T) {
			rule := ngmodels.AlertRuleGen(ngmodels.WithIntervalBetween(10, 600))()
			baseInterval := 10 * time.Second
			original := jitterOffsetInTicks(rule, baseInterval, JitterByGroup)

			for i := 0; i < 100; i++ {
				offset := jitterOffsetInTicks(rule, baseInterval, JitterByGroup)
				require.Equal(t, original, offset, "jitterOffsetInTicks should return the same value for the same rule")
			}
		})

		t.Run("offset is on the interval [0, interval/baseInterval)", func(t *testing.T) {
			baseInterval := 10 * time.Second
			rules := createTestRules(1000, ngmodels.WithIntervalBetween(10, 600))

			for _, r := range rules {
				offset := jitterOffsetInTicks(r, baseInterval, JitterByGroup)
				require.GreaterOrEqual(t, offset, int64(0), "offset cannot be negative, got %d for rule with interval %d", offset, r.IntervalSeconds)
				upperLimit := r.IntervalSeconds / int64(baseInterval.Seconds())
				require.Less(t, offset, upperLimit, "offset cannot be equal to or greater than interval/baseInterval of %d", upperLimit)
			}
		})

		t.Run("offset for any rule in the same group is always the same", func(t *testing.T) {
			baseInterval := 10 * time.Second
			group1 := ngmodels.AlertRuleGroupKey{}
			group2 := ngmodels.AlertRuleGroupKey{}
			rules1 := createTestRules(1000, ngmodels.WithInterval(60*time.Second), ngmodels.WithGroupKey(group1))
			rules2 := createTestRules(1000, ngmodels.WithInterval(1*time.Hour), ngmodels.WithGroupKey(group2))

			group1Offset := jitterOffsetInTicks(rules1[0], baseInterval, JitterByGroup)
			for _, r := range rules1 {
				offset := jitterOffsetInTicks(r, baseInterval, JitterByGroup)
				require.Equal(t, group1Offset, offset)
			}
			group2Offset := jitterOffsetInTicks(rules2[0], baseInterval, JitterByGroup)
			for _, r := range rules2 {
				offset := jitterOffsetInTicks(r, baseInterval, JitterByGroup)
				require.Equal(t, group2Offset, offset)
			}
		})
	})

	t.Run("when strategy is JitterByRule", func(t *testing.T) {
		t.Run("offset is stable for the same rule", func(t *testing.T) {
			rule := ngmodels.AlertRuleGen(ngmodels.WithIntervalBetween(10, 600))()
			baseInterval := 10 * time.Second
			original := jitterOffsetInTicks(rule, baseInterval, JitterByRule)

			for i := 0; i < 100; i++ {
				offset := jitterOffsetInTicks(rule, baseInterval, JitterByRule)
				require.Equal(t, original, offset, "jitterOffsetInTicks should return the same value for the same rule")
			}
		})

		t.Run("offset is on the interval [0, interval/baseInterval)", func(t *testing.T) {
			baseInterval := 10 * time.Second
			rules := createTestRules(1000, ngmodels.WithIntervalBetween(10, 600))

			for _, r := range rules {
				offset := jitterOffsetInTicks(r, baseInterval, JitterByRule)
				require.GreaterOrEqual(t, offset, int64(0), "offset cannot be negative, got %d for rule with interval %d", offset, r.IntervalSeconds)
				upperLimit := r.IntervalSeconds / int64(baseInterval.Seconds())
				require.Less(t, offset, upperLimit, "offset cannot be equal to or greater than interval/baseInterval of %d", upperLimit)
			}
		})
	})
}

func createTestRules(n int, mutators ...ngmodels.AlertRuleMutator) []*ngmodels.AlertRule {
	result := make([]*ngmodels.AlertRule, 0, n)
	for i := 0; i < n; i++ {
		result = append(result, ngmodels.AlertRuleGen(mutators...)())
	}
	return result
}
@@ -81,6 +81,7 @@ type schedule struct {
 	appURL               *url.URL
 	disableGrafanaFolder bool
+	jitterEvaluations    JitterStrategy

 	metrics *metrics.Scheduler

@@ -104,6 +105,7 @@ type SchedulerCfg struct {
 	MinRuleInterval      time.Duration
 	DisableGrafanaFolder bool
 	AppURL               *url.URL
+	JitterEvaluations    JitterStrategy
 	EvaluatorFactory     eval.EvaluatorFactory
 	RuleStore            RulesStore
 	Metrics              *metrics.Scheduler

@@ -131,6 +133,7 @@ func NewScheduler(cfg SchedulerCfg, stateManager *state.Manager) *schedule {
 		metrics:               cfg.Metrics,
 		appURL:                cfg.AppURL,
 		disableGrafanaFolder:  cfg.DisableGrafanaFolder,
+		jitterEvaluations:     cfg.JitterEvaluations,
 		stateManager:          stateManager,
 		minRuleInterval:       cfg.MinRuleInterval,
 		schedulableAlertRules: alertRulesRegistry{rules: make(map[ngmodels.AlertRuleKey]*ngmodels.AlertRule)},

@@ -293,7 +296,8 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
 		}

 		itemFrequency := item.IntervalSeconds / int64(sch.baseInterval.Seconds())
-		isReadyToRun := item.IntervalSeconds != 0 && tickNum%itemFrequency == 0
+		offset := jitterOffsetInTicks(item, sch.baseInterval, sch.jitterEvaluations)
+		isReadyToRun := item.IntervalSeconds != 0 && (tickNum%itemFrequency)-offset == 0

 		var folderTitle string
 		if !sch.disableGrafanaFolder {

@@ -306,6 +310,7 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
 		}

 		if isReadyToRun {
+			sch.log.Debug("Rule is ready to run on the current tick", "uid", item.UID, "tick", tickNum, "frequency", itemFrequency, "offset", offset)
 			readyToRun = append(readyToRun, readyToRunItem{ruleInfo: ruleInfo, evaluation: evaluation{
 				scheduledAt: tick,
 				rule:        item,
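To read the new gating condition concretely (values invented for illustration): with a 10-second base interval, a rule evaluated every 60 seconds has itemFrequency = 6, so an offset of 2 makes it fire on ticks 2, 8, 14, and so on; the evaluation cadence is unchanged, only the phase shifts.

package main

import "fmt"

func main() {
	// Mirrors the scheduler's check: item.IntervalSeconds != 0 && (tickNum%itemFrequency)-offset == 0.
	itemFrequency, offset := int64(6), int64(2) // 60s interval on a 10s base tick; offset invented
	for tickNum := int64(0); tickNum < 20; tickNum++ {
		if (tickNum%itemFrequency)-offset == 0 {
			fmt.Println("rule evaluates on tick", tickNum) // prints ticks 2, 8, 14
		}
	}
}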