diff --git a/pkg/services/ngalert/CHANGELOG.md b/pkg/services/ngalert/CHANGELOG.md
index 5e8e88bc156..46e0dfa9033 100644
--- a/pkg/services/ngalert/CHANGELOG.md
+++ b/pkg/services/ngalert/CHANGELOG.md
@@ -45,6 +45,7 @@ Scopes must have an order to ensure consistency and ease of search, this helps u
 
 ## Grafana Alerting - main / unreleased
 
+- [ENHANCEMENT] Scheduler: Adds new metrics to track the alert rules being considered for evaluation by the scheduler.
 - [ENHANCEMENT] Scheduler: Ticker expose new metrics. In legacy, metrics are prefixed with `legacy_` #47828, #48190
   - `grafana_alerting_ticker_last_consumed_tick_timestamp_seconds`
   - `grafana_alerting_ticker_next_tick_timestamp_seconds`
diff --git a/pkg/services/ngalert/metrics/ngalert.go b/pkg/services/ngalert/metrics/ngalert.go
index 916ac41a21f..2cb3f31f6cd 100644
--- a/pkg/services/ngalert/metrics/ngalert.go
+++ b/pkg/services/ngalert/metrics/ngalert.go
@@ -46,14 +46,16 @@ type NGAlert struct {
 }
 
 type Scheduler struct {
-	Registerer               prometheus.Registerer
-	BehindSeconds            prometheus.Gauge
-	EvalTotal                *prometheus.CounterVec
-	EvalFailures             *prometheus.CounterVec
-	EvalDuration             *prometheus.SummaryVec
-	GetAlertRulesDuration    prometheus.Histogram
-	SchedulePeriodicDuration prometheus.Histogram
-	Ticker                   *legacyMetrics.Ticker
+	Registerer                          prometheus.Registerer
+	BehindSeconds                       prometheus.Gauge
+	EvalTotal                           *prometheus.CounterVec
+	EvalFailures                        *prometheus.CounterVec
+	EvalDuration                        *prometheus.SummaryVec
+	SchedulePeriodicDuration            prometheus.Histogram
+	SchedulableAlertRules               prometheus.Gauge
+	SchedulableAlertRulesHash           prometheus.Gauge
+	UpdateSchedulableAlertRulesDuration prometheus.Histogram
+	Ticker                              *legacyMetrics.Ticker
 }
 
 type MultiOrgAlertmanager struct {
@@ -163,15 +165,6 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
 			},
 			[]string{"org"},
 		),
-		GetAlertRulesDuration: promauto.With(r).NewHistogram(
-			prometheus.HistogramOpts{
-				Namespace: Namespace,
-				Subsystem: Subsystem,
-				Name:      "get_alert_rules_duration_seconds",
-				Help:      "The time taken to get all alert rules.",
-				Buckets:   []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
-			},
-		),
 		SchedulePeriodicDuration: promauto.With(r).NewHistogram(
 			prometheus.HistogramOpts{
 				Namespace: Namespace,
 				Subsystem: Subsystem,
@@ -181,6 +174,30 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
 				Buckets:   []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
 			},
 		),
+		SchedulableAlertRules: promauto.With(r).NewGauge(
+			prometheus.GaugeOpts{
+				Namespace: Namespace,
+				Subsystem: Subsystem,
+				Name:      "schedule_alert_rules",
+				Help:      "The number of alert rules being considered for evaluation each tick.",
+			},
+		),
+		SchedulableAlertRulesHash: promauto.With(r).NewGauge(
+			prometheus.GaugeOpts{
+				Namespace: Namespace,
+				Subsystem: Subsystem,
+				Name:      "schedule_alert_rules_hash",
+				Help:      "A fnv64 hash of the UIDs of the alert rules considered for evaluation; the value changes whenever the set of rule UIDs changes.",
+			}),
+		UpdateSchedulableAlertRulesDuration: promauto.With(r).NewHistogram(
+			prometheus.HistogramOpts{
+				Namespace: Namespace,
+				Subsystem: Subsystem,
+				Name:      "schedule_query_alert_rules_duration_seconds",
+				Help:      "The time taken to fetch alert rules from the database.",
+				Buckets:   []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
+			},
+		),
 		Ticker: legacyMetrics.NewTickerMetrics(r),
 	}
 }
diff --git a/pkg/services/ngalert/schedule/fetcher.go b/pkg/services/ngalert/schedule/fetcher.go
index 7a2cf1d55b8..1175ad4b27d 100644
--- a/pkg/services/ngalert/schedule/fetcher.go
+++ b/pkg/services/ngalert/schedule/fetcher.go
@@ -2,24 +2,55 @@ package schedule
 
 import (
 	"context"
+	"fmt"
+	"hash/fnv"
+	"sort"
 	"time"
 
 	"github.com/grafana/grafana/pkg/services/ngalert/models"
 )
 
-func (sch *schedule) getAlertRules(ctx context.Context, disabledOrgs []int64) []*models.SchedulableAlertRule {
+// hashUIDs returns a fnv64 hash of the UIDs for all alert rules.
+// The order of the alert rules does not matter as hashUIDs sorts
+// the UIDs in increasing order.
+func hashUIDs(alertRules []*models.SchedulableAlertRule) uint64 {
+	h := fnv.New64()
+	for _, uid := range sortedUIDs(alertRules) {
+		// We can ignore err as fnv64 does not return an error
+		// nolint:errcheck,gosec
+		h.Write([]byte(uid))
+	}
+	return h.Sum64()
+}
+
+// sortedUIDs returns a slice of sorted UIDs.
+func sortedUIDs(alertRules []*models.SchedulableAlertRule) []string {
+	uids := make([]string, 0, len(alertRules))
+	for _, alertRule := range alertRules {
+		uids = append(uids, alertRule.UID)
+	}
+	sort.Strings(uids)
+	return uids
+}
+
+// updateSchedulableAlertRules updates the alert rules for the scheduler.
+// It returns an error if the database is unavailable or the query returned
+// an error.
+func (sch *schedule) updateSchedulableAlertRules(ctx context.Context, disabledOrgs []int64) error {
 	start := time.Now()
 	defer func() {
-		sch.metrics.GetAlertRulesDuration.Observe(time.Since(start).Seconds())
+		sch.metrics.UpdateSchedulableAlertRulesDuration.Observe(
+			time.Since(start).Seconds())
 	}()
 
 	q := models.GetAlertRulesForSchedulingQuery{
 		ExcludeOrgIDs: disabledOrgs,
 	}
-	err := sch.ruleStore.GetAlertRulesForScheduling(ctx, &q)
-	if err != nil {
-		sch.log.Error("failed to fetch alert definitions", "err", err)
-		return nil
+	if err := sch.ruleStore.GetAlertRulesForScheduling(ctx, &q); err != nil {
+		return fmt.Errorf("failed to get alert rules: %w", err)
 	}
-	return q.Result
+	sch.schedulableAlertRules.set(q.Result)
+	sch.metrics.SchedulableAlertRules.Set(float64(len(q.Result)))
+	sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(q.Result)))
+	return nil
 }
diff --git a/pkg/services/ngalert/schedule/fetcher_test.go b/pkg/services/ngalert/schedule/fetcher_test.go
new file mode 100644
index 00000000000..1a9e597702b
--- /dev/null
+++ b/pkg/services/ngalert/schedule/fetcher_test.go
@@ -0,0 +1,26 @@
+package schedule
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+
+	"github.com/grafana/grafana/pkg/services/ngalert/models"
+)
+
+func TestHashUIDs(t *testing.T) {
+	r := []*models.SchedulableAlertRule{{UID: "foo"}, {UID: "bar"}}
+	assert.Equal(t, uint64(0xade76f55c76a1c48), hashUIDs(r))
+	// expect the same hash irrespective of order
+	r = []*models.SchedulableAlertRule{{UID: "bar"}, {UID: "foo"}}
+	assert.Equal(t, uint64(0xade76f55c76a1c48), hashUIDs(r))
+	// expect a different hash
+	r = []*models.SchedulableAlertRule{{UID: "bar"}}
+	assert.Equal(t, uint64(0xd8d9a5186bad3880), hashUIDs(r))
+	// slice with no items
+	r = []*models.SchedulableAlertRule{}
+	assert.Equal(t, uint64(0xcbf29ce484222325), hashUIDs(r))
+	// a different slice with no items should have the same hash
+	r = []*models.SchedulableAlertRule{}
+	assert.Equal(t, uint64(0xcbf29ce484222325), hashUIDs(r))
+}
diff --git a/pkg/services/ngalert/schedule/registry.go b/pkg/services/ngalert/schedule/registry.go
index 1d550c82738..9a267917dd5 100644
--- a/pkg/services/ngalert/schedule/registry.go
+++ b/pkg/services/ngalert/schedule/registry.go
@@ -9,14 +9,14 @@ import (
 	"github.com/grafana/grafana/pkg/services/ngalert/models"
 )
 
-type alertRuleRegistry struct {
+type alertRuleInfoRegistry struct {
 	mu            sync.Mutex
 	alertRuleInfo map[models.AlertRuleKey]*alertRuleInfo
 }
 
 // getOrCreateInfo gets rule routine information from registry by the key. If it does not exist, it creates a new one.
 // Returns a pointer to the rule routine information and a flag that indicates whether it is a new struct or not.
-func (r *alertRuleRegistry) getOrCreateInfo(context context.Context, key models.AlertRuleKey) (*alertRuleInfo, bool) {
+func (r *alertRuleInfoRegistry) getOrCreateInfo(context context.Context, key models.AlertRuleKey) (*alertRuleInfo, bool) {
 	r.mu.Lock()
 	defer r.mu.Unlock()
 
@@ -30,7 +30,7 @@ func (r *alertRuleRegistry) getOrCreateInfo(context context.Context, key models.
 
 // get returns the channel for the specific alert rule
 // if the key does not exist returns an error
-func (r *alertRuleRegistry) get(key models.AlertRuleKey) (*alertRuleInfo, error) {
+func (r *alertRuleInfoRegistry) get(key models.AlertRuleKey) (*alertRuleInfo, error) {
 	r.mu.Lock()
 	defer r.mu.Unlock()
 
@@ -41,7 +41,7 @@ func (r *alertRuleRegistry) get(key models.AlertRuleKey) (*alertRuleInfo, error)
 	return info, nil
 }
 
-func (r *alertRuleRegistry) exists(key models.AlertRuleKey) bool {
+func (r *alertRuleInfoRegistry) exists(key models.AlertRuleKey) bool {
 	r.mu.Lock()
 	defer r.mu.Unlock()
 
@@ -52,7 +52,7 @@
 // del removes pair that has specific key from alertRuleInfo.
 // Returns 2-tuple where the first element is value of the removed pair
 // and the second element indicates whether element with the specified key existed.
-func (r *alertRuleRegistry) del(key models.AlertRuleKey) (*alertRuleInfo, bool) {
+func (r *alertRuleInfoRegistry) del(key models.AlertRuleKey) (*alertRuleInfo, bool) {
 	r.mu.Lock()
 	defer r.mu.Unlock()
 	info, ok := r.alertRuleInfo[key]
@@ -62,7 +62,7 @@ func (r *alertRuleRegistry) del(key models.AlertRuleKey) (*alertRuleInfo, bool)
 	return info, ok
 }
 
-func (r *alertRuleRegistry) keyMap() map[models.AlertRuleKey]struct{} {
+func (r *alertRuleInfoRegistry) keyMap() map[models.AlertRuleKey]struct{} {
 	r.mu.Lock()
 	defer r.mu.Unlock()
 	definitionsIDs := make(map[models.AlertRuleKey]struct{}, len(r.alertRuleInfo))
@@ -111,3 +111,52 @@ type evaluation struct {
 	scheduledAt time.Time
 	version     int64
 }
+
+type schedulableAlertRulesRegistry struct {
+	rules map[models.AlertRuleKey]*models.SchedulableAlertRule
+	mu    sync.Mutex
+}
+
+// all returns all rules in the registry.
+func (r *schedulableAlertRulesRegistry) all() []*models.SchedulableAlertRule {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	result := make([]*models.SchedulableAlertRule, 0, len(r.rules))
+	for _, rule := range r.rules {
+		result = append(result, rule)
+	}
+	return result
+}
+
+func (r *schedulableAlertRulesRegistry) get(k models.AlertRuleKey) *models.SchedulableAlertRule {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	return r.rules[k]
+}
+
+// set replaces all rules in the registry.
+func (r *schedulableAlertRulesRegistry) set(rules []*models.SchedulableAlertRule) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	r.rules = make(map[models.AlertRuleKey]*models.SchedulableAlertRule)
+	for _, rule := range rules {
+		r.rules[rule.GetKey()] = rule
+	}
+}
+
+// update inserts or replaces a rule in the registry.
+func (r *schedulableAlertRulesRegistry) update(rule *models.SchedulableAlertRule) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	r.rules[rule.GetKey()] = rule
+}
+
+func (r *schedulableAlertRulesRegistry) del(k models.AlertRuleKey) (*models.SchedulableAlertRule, bool) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	rule, ok := r.rules[k]
+	if ok {
+		delete(r.rules, k)
+	}
+	return rule, ok
+}
diff --git a/pkg/services/ngalert/schedule/registry_test.go b/pkg/services/ngalert/schedule/registry_test.go
index f5a2eb991de..8f6ec53ccc7 100644
--- a/pkg/services/ngalert/schedule/registry_test.go
+++ b/pkg/services/ngalert/schedule/registry_test.go
@@ -8,7 +8,10 @@ import (
 	"testing"
 	"time"
 
+	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
+
+	"github.com/grafana/grafana/pkg/services/ngalert/models"
 )
 
 func TestSchedule_alertRuleInfo(t *testing.T) {
@@ -116,3 +119,54 @@
 		wg.Wait()
 	})
 }
+
+func TestSchedulableAlertRulesRegistry(t *testing.T) {
+	r := schedulableAlertRulesRegistry{rules: make(map[models.AlertRuleKey]*models.SchedulableAlertRule)}
+	assert.Len(t, r.all(), 0)
+
+	// replace all rules in the registry with foo
+	r.set([]*models.SchedulableAlertRule{{OrgID: 1, UID: "foo", Version: 1}})
+	assert.Len(t, r.all(), 1)
+	foo := r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"})
+	require.NotNil(t, foo)
+	assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 1}, *foo)
+
+	// update foo to a newer version
+	r.update(&models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 2})
+	assert.Len(t, r.all(), 1)
+	foo = r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"})
+	require.NotNil(t, foo)
+	assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 2}, *foo)
+
+	// update bar which does not exist in the registry
+	r.update(&models.SchedulableAlertRule{OrgID: 1, UID: "bar", Version: 1})
+	assert.Len(t, r.all(), 2)
+	foo = r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"})
+	require.NotNil(t, foo)
+	assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 2}, *foo)
+	bar := r.get(models.AlertRuleKey{OrgID: 1, UID: "bar"})
+	require.NotNil(t, bar)
+	assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "bar", Version: 1}, *bar)
+
+	// replace all rules in the registry with baz
+	r.set([]*models.SchedulableAlertRule{{OrgID: 1, UID: "baz", Version: 1}})
+	assert.Len(t, r.all(), 1)
+	baz := r.get(models.AlertRuleKey{OrgID: 1, UID: "baz"})
+	require.NotNil(t, baz)
+	assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "baz", Version: 1}, *baz)
+	assert.Nil(t, r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"}))
+	assert.Nil(t, r.get(models.AlertRuleKey{OrgID: 1, UID: "bar"}))
+
+	// delete baz
+	deleted, ok := r.del(models.AlertRuleKey{OrgID: 1, UID: "baz"})
+	assert.True(t, ok)
+	require.NotNil(t, deleted)
+	assert.Equal(t, *deleted, *baz)
+	assert.Len(t, r.all(), 0)
+	assert.Nil(t, r.get(models.AlertRuleKey{OrgID: 1, UID: "baz"}))
+
+	// baz cannot be deleted twice
+	deleted, ok = r.del(models.AlertRuleKey{OrgID: 1, UID: "baz"})
+	assert.False(t, ok)
+	assert.Nil(t, deleted)
+}
diff --git a/pkg/services/ngalert/schedule/schedule.go b/pkg/services/ngalert/schedule/schedule.go
index d31ac723eff..b991cfbd14b 100644
--- a/pkg/services/ngalert/schedule/schedule.go
+++ b/pkg/services/ngalert/schedule/schedule.go
@@ -54,7 +54,7 @@ type schedule struct {
 
 	baseInterval time.Duration
 
 	// each alert rule gets its own channel and routine
-	registry alertRuleRegistry
+	registry alertRuleInfoRegistry
 	maxAttempts int64
 
@@ -97,6 +97,12 @@ type schedule struct {
 	adminConfigPollInterval time.Duration
 	disabledOrgs            map[int64]struct{}
 	minRuleInterval         time.Duration
+
+	// schedulableAlertRules contains the alert rules that are considered for
+	// evaluation in the current tick. The evaluation of an alert rule in the
+	// current tick depends on its evaluation interval and when it was
+	// last evaluated.
+	schedulableAlertRules schedulableAlertRulesRegistry
 }
 
 // SchedulerCfg is the scheduler configuration.
@@ -124,7 +130,7 @@ func NewScheduler(cfg SchedulerCfg, expressionService *expr.Service, appURL *url
 	ticker := alerting.NewTicker(cfg.C, cfg.BaseInterval, cfg.Metrics.Ticker)
 
 	sch := schedule{
-		registry:                alertRuleRegistry{alertRuleInfo: make(map[models.AlertRuleKey]*alertRuleInfo)},
+		registry:                alertRuleInfoRegistry{alertRuleInfo: make(map[models.AlertRuleKey]*alertRuleInfo)},
 		maxAttempts:             cfg.MaxAttempts,
 		clock:                   cfg.C,
 		baseInterval:            cfg.BaseInterval,
@@ -148,6 +154,7 @@ func NewScheduler(cfg SchedulerCfg, expressionService *expr.Service, appURL *url
 		adminConfigPollInterval: cfg.AdminConfigPollInterval,
 		disabledOrgs:            cfg.DisabledOrgs,
 		minRuleInterval:         cfg.MinRuleInterval,
+		schedulableAlertRules:   schedulableAlertRulesRegistry{rules: make(map[models.AlertRuleKey]*models.SchedulableAlertRule)},
 	}
 	return &sch
 }
@@ -316,9 +323,17 @@ func (sch *schedule) UpdateAlertRule(key models.AlertRuleKey) {
 
 // DeleteAlertRule stops evaluation of the rule, deletes it from active rules, and cleans up state cache.
 func (sch *schedule) DeleteAlertRule(key models.AlertRuleKey) {
+	// It can happen that the scheduler has deleted the alert rule before the
+	// Ruler API has called DeleteAlertRule. This can happen as requests to
+	// the Ruler API do not hold an exclusive lock over all scheduler operations.
+	if _, ok := sch.schedulableAlertRules.del(key); !ok {
+		sch.log.Info("alert rule cannot be removed from the scheduler as it is not scheduled", "uid", key.UID, "org_id", key.OrgID)
+	}
+
+	// Delete the rule routine
 	ruleInfo, ok := sch.registry.del(key)
 	if !ok {
-		sch.log.Info("unable to delete alert rule routine information by key", "uid", key.UID, "org_id", key.OrgID)
+		sch.log.Info("alert rule cannot be stopped as it is not running", "uid", key.UID, "org_id", key.OrgID)
 		return
 	}
 	// stop rule evaluation
@@ -364,7 +379,11 @@ func (sch *schedule) schedulePeriodic(ctx context.Context) error {
 			disabledOrgs = append(disabledOrgs, disabledOrg)
 		}
 
-		alertRules := sch.getAlertRules(ctx, disabledOrgs)
+		if err := sch.updateSchedulableAlertRules(ctx, disabledOrgs); err != nil {
+			sch.log.Error("scheduler failed to update alert rules", "err", err)
+		}
+		alertRules := sch.schedulableAlertRules.all()
+		sch.log.Debug("alert rules fetched", "count", len(alertRules), "disabled_orgs", disabledOrgs)
 
 		// registeredDefinitions is a map used for finding deleted alert rules
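
For illustration, a standalone sketch (not part of the patch) of the order-independent hashing introduced in fetcher.go above. `hashUIDs` and the `schedule_alert_rules_hash` gauge come from the diff; the reduced plain-string signature and the `main` wrapper are assumptions made for the example. Because the gauge stores the hash as a float64, it is best read as an opaque value: an expression such as `changes(grafana_alerting_schedule_alert_rules_hash[10m]) > 0` (assuming the usual `grafana`/`alerting` namespace and subsystem shown in the CHANGELOG) indicates that the set of scheduled rules changed.

```go
// Standalone sketch: reproduces the hashing behaviour of hashUIDs from
// fetcher.go with plain UID strings instead of SchedulableAlertRule values.
package main

import (
	"fmt"
	"hash/fnv"
	"sort"
)

// hashUIDs hashes the UIDs in increasing order, so the input order is irrelevant.
func hashUIDs(uids []string) uint64 {
	sorted := append([]string(nil), uids...)
	sort.Strings(sorted)
	h := fnv.New64()
	for _, uid := range sorted {
		// fnv64 Write never returns an error.
		h.Write([]byte(uid))
	}
	return h.Sum64()
}

func main() {
	fmt.Println(hashUIDs([]string{"foo", "bar"}) == hashUIDs([]string{"bar", "foo"})) // true: order does not matter
	fmt.Printf("%#x\n", hashUIDs(nil))                                                // 0xcbf29ce484222325, the FNV-64 offset basis
	// The scheduler exposes the hash as a gauge via float64(hashUIDs(...)). The
	// conversion rounds away the low bits of the uint64, but the value is only
	// compared for change, not interpreted.
}
```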
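A second sketch, under the same caveat, of the failure-mode change carried by `updateSchedulableAlertRules` and `schedulePeriodic`: the cached registry is only replaced after a successful fetch, so a failed database query leaves the scheduler evaluating the last known good set of rules rather than dropping them for that tick (the old `getAlertRules` returned nil on error). The `registry`, `rule`, and `refresh` names below are simplified stand-ins, not the PR's types.

```go
// Standalone sketch: a reduced rule registry showing that a failed refresh
// keeps the previous snapshot intact.
package main

import (
	"errors"
	"fmt"
	"sync"
)

type rule struct{ UID string }

type registry struct {
	mu    sync.Mutex
	rules map[string]*rule
}

// set replaces the whole snapshot; it is only called after a successful fetch.
func (r *registry) set(rules []*rule) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.rules = make(map[string]*rule, len(rules))
	for _, ru := range rules {
		r.rules[ru.UID] = ru
	}
}

// all returns the current snapshot.
func (r *registry) all() []*rule {
	r.mu.Lock()
	defer r.mu.Unlock()
	out := make([]*rule, 0, len(r.rules))
	for _, ru := range r.rules {
		out = append(out, ru)
	}
	return out
}

// refresh mirrors the shape of updateSchedulableAlertRules: on error the
// registry is left untouched.
func refresh(r *registry, fetch func() ([]*rule, error)) error {
	rules, err := fetch()
	if err != nil {
		return fmt.Errorf("failed to get alert rules: %w", err)
	}
	r.set(rules)
	return nil
}

func main() {
	r := &registry{}
	_ = refresh(r, func() ([]*rule, error) { return []*rule{{UID: "foo"}}, nil })
	// A later fetch fails, e.g. the database is briefly unavailable.
	if err := refresh(r, func() ([]*rule, error) { return nil, errors.New("db down") }); err != nil {
		fmt.Println("refresh failed:", err)
	}
	fmt.Println("still scheduling", len(r.all()), "rule(s)") // still 1
}
```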