Alerting: Fix database unavailability removing rules from the scheduler (#49874)

George Robinson 2022-06-07 16:20:06 +01:00 committed by GitHub
parent ae449cc823
commit c83f84348c
7 changed files with 231 additions and 34 deletions


@ -45,6 +45,7 @@ Scopes must have an order to ensure consistency and ease of search, this helps u
## Grafana Alerting - main / unreleased
- [ENHANCEMENT] Scheduler: Adds new metrics to track rules that might be scheduled.
- [ENHANCEMENT] Scheduler: Ticker exposes new metrics. In legacy alerting, the metrics are prefixed with `legacy_`. #47828, #48190
- `grafana_alerting_ticker_last_consumed_tick_timestamp_seconds`
- `grafana_alerting_ticker_next_tick_timestamp_seconds`


@ -46,14 +46,16 @@ type NGAlert struct {
}
type Scheduler struct {
Registerer prometheus.Registerer
BehindSeconds prometheus.Gauge
EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec
EvalDuration *prometheus.SummaryVec
GetAlertRulesDuration prometheus.Histogram
SchedulePeriodicDuration prometheus.Histogram
Ticker *legacyMetrics.Ticker
Registerer prometheus.Registerer
BehindSeconds prometheus.Gauge
EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec
EvalDuration *prometheus.SummaryVec
SchedulePeriodicDuration prometheus.Histogram
SchedulableAlertRules prometheus.Gauge
SchedulableAlertRulesHash prometheus.Gauge
UpdateSchedulableAlertRulesDuration prometheus.Histogram
Ticker *legacyMetrics.Ticker
}
type MultiOrgAlertmanager struct {
@ -163,15 +165,6 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
},
[]string{"org"},
),
GetAlertRulesDuration: promauto.With(r).NewHistogram(
prometheus.HistogramOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "get_alert_rules_duration_seconds",
Help: "The time taken to get all alert rules.",
Buckets: []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
},
),
SchedulePeriodicDuration: promauto.With(r).NewHistogram(
prometheus.HistogramOpts{
Namespace: Namespace,
@ -181,6 +174,30 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
Buckets: []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
},
),
SchedulableAlertRules: promauto.With(r).NewGauge(
prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "schedule_alert_rules",
Help: "The number of alert rules being considered for evaluation each tick.",
},
),
SchedulableAlertRulesHash: promauto.With(r).NewGauge(
prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "schedule_alert_rules_hash",
Help: "A hash of the alert rules over time.",
}),
UpdateSchedulableAlertRulesDuration: promauto.With(r).NewHistogram(
prometheus.HistogramOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "schedule_query_alert_rules_duration_seconds",
Help: "The time taken to fetch alert rules from the database.",
Buckets: []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
},
),
Ticker: legacyMetrics.NewTickerMetrics(r),
}
}
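
The removed GetAlertRulesDuration histogram and the three new instruments above all follow the same promauto registration pattern. A minimal, self-contained sketch of that pattern (the registry, namespace, and metric names here are illustrative, not Grafana's):

package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

func main() {
	// A private registry stands in for the Registerer passed to newSchedulerMetrics.
	reg := prometheus.NewRegistry()

	// Gauge for the number of rules currently considered for evaluation (illustrative name).
	schedulableRules := promauto.With(reg).NewGauge(prometheus.GaugeOpts{
		Namespace: "example",
		Subsystem: "scheduler",
		Name:      "schedulable_rules",
		Help:      "The number of rules considered for evaluation each tick.",
	})

	// Histogram for how long fetching rules takes (illustrative name).
	fetchDuration := promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
		Namespace: "example",
		Subsystem: "scheduler",
		Name:      "fetch_rules_duration_seconds",
		Help:      "The time taken to fetch rules from the database.",
		Buckets:   []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
	})

	start := time.Now()
	// ... fetch rules from the database here ...
	fetchDuration.Observe(time.Since(start).Seconds())
	schedulableRules.Set(42)

	families, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	fmt.Println("collected metric families:", len(families))
}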


@ -2,24 +2,55 @@ package schedule
import (
"context"
"fmt"
"hash/fnv"
"sort"
"time"
"github.com/grafana/grafana/pkg/services/ngalert/models"
)
func (sch *schedule) getAlertRules(ctx context.Context, disabledOrgs []int64) []*models.SchedulableAlertRule {
// hashUIDs returns a fnv64 hash of the UIDs for all alert rules.
// The order of the alert rules does not matter as hashUIDs sorts
// the UIDs in increasing order.
func hashUIDs(alertRules []*models.SchedulableAlertRule) uint64 {
h := fnv.New64()
for _, uid := range sortedUIDs(alertRules) {
// We can ignore err as fnv64 does not return an error
// nolint:errcheck,gosec
h.Write([]byte(uid))
}
return h.Sum64()
}
// sortedUIDs returns a slice of sorted UIDs.
func sortedUIDs(alertRules []*models.SchedulableAlertRule) []string {
uids := make([]string, 0, len(alertRules))
for _, alertRule := range alertRules {
uids = append(uids, alertRule.UID)
}
sort.Strings(uids)
return uids
}
// updateSchedulableAlertRules updates the alert rules for the scheduler.
// It returns an error if the database is unavailable or the query returned
// an error.
func (sch *schedule) updateSchedulableAlertRules(ctx context.Context, disabledOrgs []int64) error {
start := time.Now()
defer func() {
sch.metrics.GetAlertRulesDuration.Observe(time.Since(start).Seconds())
sch.metrics.UpdateSchedulableAlertRulesDuration.Observe(
time.Since(start).Seconds())
}()
q := models.GetAlertRulesForSchedulingQuery{
ExcludeOrgIDs: disabledOrgs,
}
err := sch.ruleStore.GetAlertRulesForScheduling(ctx, &q)
if err != nil {
sch.log.Error("failed to fetch alert definitions", "err", err)
return nil
if err := sch.ruleStore.GetAlertRulesForScheduling(ctx, &q); err != nil {
return fmt.Errorf("failed to get alert rules: %w", err)
}
return q.Result
sch.schedulableAlertRules.set(q.Result)
sch.metrics.SchedulableAlertRules.Set(float64(len(q.Result)))
sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(q.Result)))
return nil
}
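
hashUIDs sorts the UIDs before feeding them to the fnv64 hash, so the SchedulableAlertRulesHash gauge only changes when the set of rules changes, not when the database returns them in a different order. A standalone sketch of the same idea using only the standard library (hashSortedUIDs is an illustrative helper, not part of this commit):

package main

import (
	"fmt"
	"hash/fnv"
	"sort"
)

// hashSortedUIDs sorts a copy of the UIDs and feeds them to a fnv64 hash,
// so the result does not depend on the input order.
func hashSortedUIDs(uids []string) uint64 {
	sorted := append([]string(nil), uids...)
	sort.Strings(sorted)
	h := fnv.New64()
	for _, uid := range sorted {
		_, _ = h.Write([]byte(uid)) // Write on a fnv hash never returns an error
	}
	return h.Sum64()
}

func main() {
	fmt.Printf("%#x\n", hashSortedUIDs([]string{"foo", "bar"}))
	fmt.Printf("%#x\n", hashSortedUIDs([]string{"bar", "foo"})) // same value as above
}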


@ -0,0 +1,26 @@
package schedule
import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/grafana/grafana/pkg/services/ngalert/models"
)
func TestHashUIDs(t *testing.T) {
r := []*models.SchedulableAlertRule{{UID: "foo"}, {UID: "bar"}}
assert.Equal(t, uint64(0xade76f55c76a1c48), hashUIDs(r))
// expect the same hash irrespective of order
r = []*models.SchedulableAlertRule{{UID: "bar"}, {UID: "foo"}}
assert.Equal(t, uint64(0xade76f55c76a1c48), hashUIDs(r))
// expect a different hash
r = []*models.SchedulableAlertRule{{UID: "bar"}}
assert.Equal(t, uint64(0xd8d9a5186bad3880), hashUIDs(r))
// slice with no items
r = []*models.SchedulableAlertRule{}
assert.Equal(t, uint64(0xcbf29ce484222325), hashUIDs(r))
// a different slice with no items should have the same hash
r = []*models.SchedulableAlertRule{}
assert.Equal(t, uint64(0xcbf29ce484222325), hashUIDs(r))
}


@ -9,14 +9,14 @@ import (
"github.com/grafana/grafana/pkg/services/ngalert/models"
)
type alertRuleRegistry struct {
type alertRuleInfoRegistry struct {
mu sync.Mutex
alertRuleInfo map[models.AlertRuleKey]*alertRuleInfo
}
// getOrCreateInfo gets rule routine information from the registry by key. If it does not exist, it creates a new one.
// Returns a pointer to the rule routine information and a flag that indicates whether the struct was newly created.
func (r *alertRuleRegistry) getOrCreateInfo(context context.Context, key models.AlertRuleKey) (*alertRuleInfo, bool) {
func (r *alertRuleInfoRegistry) getOrCreateInfo(context context.Context, key models.AlertRuleKey) (*alertRuleInfo, bool) {
r.mu.Lock()
defer r.mu.Unlock()
@ -30,7 +30,7 @@ func (r *alertRuleRegistry) getOrCreateInfo(context context.Context, key models.
// get returns the alertRuleInfo for the specific alert rule.
// If the key does not exist, it returns an error.
func (r *alertRuleRegistry) get(key models.AlertRuleKey) (*alertRuleInfo, error) {
func (r *alertRuleInfoRegistry) get(key models.AlertRuleKey) (*alertRuleInfo, error) {
r.mu.Lock()
defer r.mu.Unlock()
@ -41,7 +41,7 @@ func (r *alertRuleRegistry) get(key models.AlertRuleKey) (*alertRuleInfo, error)
return info, nil
}
func (r *alertRuleRegistry) exists(key models.AlertRuleKey) bool {
func (r *alertRuleInfoRegistry) exists(key models.AlertRuleKey) bool {
r.mu.Lock()
defer r.mu.Unlock()
@ -52,7 +52,7 @@ func (r *alertRuleRegistry) exists(key models.AlertRuleKey) bool {
// del removes the pair with the specified key from alertRuleInfo.
// It returns the value of the removed pair and a boolean that indicates
// whether an element with the specified key existed.
func (r *alertRuleRegistry) del(key models.AlertRuleKey) (*alertRuleInfo, bool) {
func (r *alertRuleInfoRegistry) del(key models.AlertRuleKey) (*alertRuleInfo, bool) {
r.mu.Lock()
defer r.mu.Unlock()
info, ok := r.alertRuleInfo[key]
@ -62,7 +62,7 @@ func (r *alertRuleRegistry) del(key models.AlertRuleKey) (*alertRuleInfo, bool)
return info, ok
}
func (r *alertRuleRegistry) keyMap() map[models.AlertRuleKey]struct{} {
func (r *alertRuleInfoRegistry) keyMap() map[models.AlertRuleKey]struct{} {
r.mu.Lock()
defer r.mu.Unlock()
definitionsIDs := make(map[models.AlertRuleKey]struct{}, len(r.alertRuleInfo))
@ -111,3 +111,52 @@ type evaluation struct {
scheduledAt time.Time
version int64
}
type schedulableAlertRulesRegistry struct {
rules map[models.AlertRuleKey]*models.SchedulableAlertRule
mu sync.Mutex
}
// all returns all rules in the registry.
func (r *schedulableAlertRulesRegistry) all() []*models.SchedulableAlertRule {
r.mu.Lock()
defer r.mu.Unlock()
result := make([]*models.SchedulableAlertRule, 0, len(r.rules))
for _, rule := range r.rules {
result = append(result, rule)
}
return result
}
func (r *schedulableAlertRulesRegistry) get(k models.AlertRuleKey) *models.SchedulableAlertRule {
r.mu.Lock()
defer r.mu.Unlock()
return r.rules[k]
}
// set replaces all rules in the registry.
func (r *schedulableAlertRulesRegistry) set(rules []*models.SchedulableAlertRule) {
r.mu.Lock()
defer r.mu.Unlock()
r.rules = make(map[models.AlertRuleKey]*models.SchedulableAlertRule)
for _, rule := range rules {
r.rules[rule.GetKey()] = rule
}
}
// update inserts or replaces a rule in the registry.
func (r *schedulableAlertRulesRegistry) update(rule *models.SchedulableAlertRule) {
r.mu.Lock()
defer r.mu.Unlock()
r.rules[rule.GetKey()] = rule
}
func (r *schedulableAlertRulesRegistry) del(k models.AlertRuleKey) (*models.SchedulableAlertRule, bool) {
r.mu.Lock()
defer r.mu.Unlock()
rule, ok := r.rules[k]
if ok {
delete(r.rules, k)
}
return rule, ok
}
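
schedulableAlertRulesRegistry is a plain mutex-guarded map, so set, all, update, and del can be called concurrently, e.g. from the scheduler loop and from DeleteAlertRule. A reduced sketch of that pattern with hypothetical stand-in types (ruleKey and rule are not Grafana types), exercised from two goroutines:

package main

import (
	"fmt"
	"sync"
)

// ruleKey and rule are simplified stand-ins for models.AlertRuleKey and
// models.SchedulableAlertRule.
type ruleKey struct {
	OrgID int64
	UID   string
}

type rule struct {
	Key     ruleKey
	Version int64
}

// registry mirrors the mutex-guarded map used by schedulableAlertRulesRegistry.
type registry struct {
	mu    sync.Mutex
	rules map[ruleKey]*rule
}

// set replaces every rule in the registry, as done after a successful fetch.
func (r *registry) set(rules []*rule) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.rules = make(map[ruleKey]*rule, len(rules))
	for _, ru := range rules {
		r.rules[ru.Key] = ru
	}
}

// all returns a snapshot of the rules, as read once per scheduler tick.
func (r *registry) all() []*rule {
	r.mu.Lock()
	defer r.mu.Unlock()
	out := make([]*rule, 0, len(r.rules))
	for _, ru := range r.rules {
		out = append(out, ru)
	}
	return out
}

func main() {
	r := &registry{rules: map[ruleKey]*rule{}}

	var wg sync.WaitGroup
	wg.Add(2)
	// One goroutine replaces the rules while another reads them; the mutex
	// keeps the map access safe.
	go func() {
		defer wg.Done()
		for i := int64(1); i <= 100; i++ {
			r.set([]*rule{{Key: ruleKey{OrgID: 1, UID: "foo"}, Version: i}})
		}
	}()
	go func() {
		defer wg.Done()
		for i := 0; i < 100; i++ {
			_ = r.all()
		}
	}()
	wg.Wait()

	fmt.Println("rules in registry:", len(r.all())) // 1
}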


@ -8,7 +8,10 @@ import (
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/grafana/grafana/pkg/services/ngalert/models"
)
func TestSchedule_alertRuleInfo(t *testing.T) {
@ -116,3 +119,54 @@ func TestSchedule_alertRuleInfo(t *testing.T) {
wg.Wait()
})
}
func TestSchedulableAlertRulesRegistry(t *testing.T) {
r := schedulableAlertRulesRegistry{rules: make(map[models.AlertRuleKey]*models.SchedulableAlertRule)}
assert.Len(t, r.all(), 0)
// replace all rules in the registry with foo
r.set([]*models.SchedulableAlertRule{{OrgID: 1, UID: "foo", Version: 1}})
assert.Len(t, r.all(), 1)
foo := r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"})
require.NotNil(t, foo)
assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 1}, *foo)
// update foo to a newer version
r.update(&models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 2})
assert.Len(t, r.all(), 1)
foo = r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"})
require.NotNil(t, foo)
assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 2}, *foo)
// update bar which does not exist in the registry
r.update(&models.SchedulableAlertRule{OrgID: 1, UID: "bar", Version: 1})
assert.Len(t, r.all(), 2)
foo = r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"})
require.NotNil(t, foo)
assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 2}, *foo)
bar := r.get(models.AlertRuleKey{OrgID: 1, UID: "bar"})
require.NotNil(t, bar)
assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "bar", Version: 1}, *bar)
// replace all rules in the registry with baz
r.set([]*models.SchedulableAlertRule{{OrgID: 1, UID: "baz", Version: 1}})
assert.Len(t, r.all(), 1)
baz := r.get(models.AlertRuleKey{OrgID: 1, UID: "baz"})
require.NotNil(t, baz)
assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "baz", Version: 1}, *baz)
assert.Nil(t, r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"}))
assert.Nil(t, r.get(models.AlertRuleKey{OrgID: 1, UID: "bar"}))
// delete baz
deleted, ok := r.del(models.AlertRuleKey{OrgID: 1, UID: "baz"})
assert.True(t, ok)
require.NotNil(t, deleted)
assert.Equal(t, *deleted, *baz)
assert.Len(t, r.all(), 0)
assert.Nil(t, r.get(models.AlertRuleKey{OrgID: 1, UID: "baz"}))
// baz cannot be deleted twice
deleted, ok = r.del(models.AlertRuleKey{OrgID: 1, UID: "baz"})
assert.False(t, ok)
assert.Nil(t, deleted)
}


@ -54,7 +54,7 @@ type schedule struct {
baseInterval time.Duration
// each alert rule gets its own channel and routine
registry alertRuleRegistry
registry alertRuleInfoRegistry
maxAttempts int64
@ -97,6 +97,12 @@ type schedule struct {
adminConfigPollInterval time.Duration
disabledOrgs map[int64]struct{}
minRuleInterval time.Duration
// schedulableAlertRules contains the alert rules that are considered for
// evaluation in the current tick. The evaluation of an alert rule in the
// current tick depends on its evaluation interval and when it was
// last evaluated.
schedulableAlertRules schedulableAlertRulesRegistry
}
// SchedulerCfg is the scheduler configuration.
@ -124,7 +130,7 @@ func NewScheduler(cfg SchedulerCfg, expressionService *expr.Service, appURL *url
ticker := alerting.NewTicker(cfg.C, cfg.BaseInterval, cfg.Metrics.Ticker)
sch := schedule{
registry: alertRuleRegistry{alertRuleInfo: make(map[models.AlertRuleKey]*alertRuleInfo)},
registry: alertRuleInfoRegistry{alertRuleInfo: make(map[models.AlertRuleKey]*alertRuleInfo)},
maxAttempts: cfg.MaxAttempts,
clock: cfg.C,
baseInterval: cfg.BaseInterval,
@ -148,6 +154,7 @@ func NewScheduler(cfg SchedulerCfg, expressionService *expr.Service, appURL *url
adminConfigPollInterval: cfg.AdminConfigPollInterval,
disabledOrgs: cfg.DisabledOrgs,
minRuleInterval: cfg.MinRuleInterval,
schedulableAlertRules: schedulableAlertRulesRegistry{rules: make(map[models.AlertRuleKey]*models.SchedulableAlertRule)},
}
return &sch
}
@ -316,9 +323,17 @@ func (sch *schedule) UpdateAlertRule(key models.AlertRuleKey) {
// DeleteAlertRule stops evaluation of the rule, deletes it from active rules, and cleans up state cache.
func (sch *schedule) DeleteAlertRule(key models.AlertRuleKey) {
// It can happen that the scheduler has deleted the alert rule before the
// Ruler API has called DeleteAlertRule, because requests to the Ruler API
// do not hold an exclusive lock over all scheduler operations.
if _, ok := sch.schedulableAlertRules.del(key); !ok {
sch.log.Info("alert rule cannot be removed from the scheduler as it is not scheduled", "uid", key.UID, "org_id", key.OrgID)
}
// Delete the rule routine
ruleInfo, ok := sch.registry.del(key)
if !ok {
sch.log.Info("unable to delete alert rule routine information by key", "uid", key.UID, "org_id", key.OrgID)
sch.log.Info("alert rule cannot be stopped as it is not running", "uid", key.UID, "org_id", key.OrgID)
return
}
// stop rule evaluation
@ -364,7 +379,11 @@ func (sch *schedule) schedulePeriodic(ctx context.Context) error {
disabledOrgs = append(disabledOrgs, disabledOrg)
}
alertRules := sch.getAlertRules(ctx, disabledOrgs)
if err := sch.updateSchedulableAlertRules(ctx, disabledOrgs); err != nil {
sch.log.Error("scheduler failed to update alert rules", "err", err)
}
alertRules := sch.schedulableAlertRules.all()
sch.log.Debug("alert rules fetched", "count", len(alertRules), "disabled_orgs", disabledOrgs)
// registeredDefinitions is a map used for finding deleted alert rules
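
The last hunk is the heart of the fix: a failed refresh is now only logged, and the tick proceeds with the rules already held in schedulableAlertRules, whereas the removed getAlertRules swallowed the error and returned nil, so a database outage looked like "no rules" and everything was unscheduled. A minimal sketch of that fallback pattern (types and names are illustrative, not Grafana's):

package main

import (
	"context"
	"errors"
	"fmt"
)

type rule struct{ UID string }

// cache is an illustrative stand-in for the schedulable rules registry.
type cache struct{ rules []rule }

func (c *cache) set(rules []rule) { c.rules = rules }
func (c *cache) all() []rule      { return c.rules }

// refresh replaces the cache only when the fetch succeeds; on error the
// cache is left untouched so the caller keeps using the old rules.
func refresh(ctx context.Context, c *cache, fetch func(context.Context) ([]rule, error)) error {
	rules, err := fetch(ctx)
	if err != nil {
		return fmt.Errorf("failed to get alert rules: %w", err)
	}
	c.set(rules)
	return nil
}

func main() {
	c := &cache{}
	c.set([]rule{{UID: "foo"}, {UID: "bar"}})

	failing := func(context.Context) ([]rule, error) {
		return nil, errors.New("database unavailable")
	}

	if err := refresh(context.Background(), c, failing); err != nil {
		fmt.Println("refresh failed, keeping cached rules:", err)
	}
	fmt.Println("rules still scheduled:", len(c.all())) // 2
}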