Alerting: Fix database unavailable removes rules from scheduler (#49874)

commit c83f84348c (parent ae449cc823)
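In short: before this change, a failed query for alert rules was logged and returned as an empty result, which the scheduler then treated as "there are no rules" and used to stop and remove every scheduled rule; after it, the error is returned to the caller and the scheduler keeps evaluating the last successfully fetched set of rules. A minimal, self-contained sketch of that pattern (illustrative only; ruleCache and refresh are not Grafana identifiers):

package main

import (
    "errors"
    "fmt"
)

type ruleCache struct {
    rules []string // rule UIDs from the last successful fetch
}

// refresh replaces the cached rules only when the fetch succeeds; on error the
// previous rules stay in place and the error is reported to the caller.
func (c *ruleCache) refresh(fetch func() ([]string, error)) error {
    rules, err := fetch()
    if err != nil {
        return fmt.Errorf("failed to get alert rules: %w", err)
    }
    c.rules = rules
    return nil
}

func main() {
    c := &ruleCache{}
    _ = c.refresh(func() ([]string, error) { return []string{"foo", "bar"}, nil })

    // A later fetch fails (e.g. the database is unavailable): the cache keeps
    // the previously fetched rules instead of being emptied.
    if err := c.refresh(func() ([]string, error) { return nil, errors.New("database unavailable") }); err != nil {
        fmt.Println("keeping", c.rules, "after error:", err)
    }
}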
@@ -45,6 +45,7 @@ Scopes must have an order to ensure consistency and ease of search, this helps u

 ## Grafana Alerting - main / unreleased

+- [ENHANCEMENT] Scheduler: Adds new metrics to track rules that might be scheduled.
 - [ENHANCEMENT] Scheduler: Ticker expose new metrics. In legacy, metrics are prefixed with `legacy_` #47828, #48190
   - `grafana_alerting_ticker_last_consumed_tick_timestamp_seconds`
   - `grafana_alerting_ticker_next_tick_timestamp_seconds`
@@ -46,14 +46,16 @@ type NGAlert struct {
 }

 type Scheduler struct {
     Registerer    prometheus.Registerer
     BehindSeconds prometheus.Gauge
     EvalTotal     *prometheus.CounterVec
     EvalFailures  *prometheus.CounterVec
     EvalDuration  *prometheus.SummaryVec
-    GetAlertRulesDuration    prometheus.Histogram
     SchedulePeriodicDuration prometheus.Histogram
+    SchedulableAlertRules     prometheus.Gauge
+    SchedulableAlertRulesHash prometheus.Gauge
+    UpdateSchedulableAlertRulesDuration prometheus.Histogram
     Ticker *legacyMetrics.Ticker
 }

 type MultiOrgAlertmanager struct {
@@ -163,15 +165,6 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
             },
             []string{"org"},
         ),
-        GetAlertRulesDuration: promauto.With(r).NewHistogram(
-            prometheus.HistogramOpts{
-                Namespace: Namespace,
-                Subsystem: Subsystem,
-                Name:      "get_alert_rules_duration_seconds",
-                Help:      "The time taken to get all alert rules.",
-                Buckets:   []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
-            },
-        ),
         SchedulePeriodicDuration: promauto.With(r).NewHistogram(
             prometheus.HistogramOpts{
                 Namespace: Namespace,
@@ -181,6 +174,30 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
                 Buckets:   []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
             },
         ),
+        SchedulableAlertRules: promauto.With(r).NewGauge(
+            prometheus.GaugeOpts{
+                Namespace: Namespace,
+                Subsystem: Subsystem,
+                Name:      "schedule_alert_rules",
+                Help:      "The number of alert rules being considered for evaluation each tick.",
+            },
+        ),
+        SchedulableAlertRulesHash: promauto.With(r).NewGauge(
+            prometheus.GaugeOpts{
+                Namespace: Namespace,
+                Subsystem: Subsystem,
+                Name:      "schedule_alert_rules_hash",
+                Help:      "A hash of the alert rules over time.",
+            }),
+        UpdateSchedulableAlertRulesDuration: promauto.With(r).NewHistogram(
+            prometheus.HistogramOpts{
+                Namespace: Namespace,
+                Subsystem: Subsystem,
+                Name:      "schedule_query_alert_rules_duration_seconds",
+                Help:      "The time taken to fetch alert rules from the database.",
+                Buckets:   []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
+            },
+        ),
         Ticker: legacyMetrics.NewTickerMetrics(r),
     }
 }
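As context for the metrics added above, here is a small standalone sketch (not part of the commit) of how gauges and histograms registered through promauto behave; it assumes the package's Namespace and Subsystem constants expand to "grafana" and "alerting", so the new series would be exported as grafana_alerting_schedule_alert_rules, grafana_alerting_schedule_alert_rules_hash and grafana_alerting_schedule_query_alert_rules_duration_seconds:

package main

import (
    "fmt"
    "time"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
    "github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
    r := prometheus.NewRegistry()

    schedulableRules := promauto.With(r).NewGauge(prometheus.GaugeOpts{
        Namespace: "grafana", // assumed value of the package's Namespace constant
        Subsystem: "alerting", // assumed value of the package's Subsystem constant
        Name:      "schedule_alert_rules",
        Help:      "The number of alert rules being considered for evaluation each tick.",
    })
    fetchDuration := promauto.With(r).NewHistogram(prometheus.HistogramOpts{
        Namespace: "grafana",
        Subsystem: "alerting",
        Name:      "schedule_query_alert_rules_duration_seconds",
        Help:      "The time taken to fetch alert rules from the database.",
        Buckets:   []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
    })

    start := time.Now()
    // ... fetch the schedulable rules from the database here ...
    fetchDuration.Observe(time.Since(start).Seconds())
    schedulableRules.Set(42) // number of rules returned by the query

    fmt.Println(testutil.ToFloat64(schedulableRules)) // 42
}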
@@ -2,24 +2,55 @@ package schedule

 import (
     "context"
+    "fmt"
+    "hash/fnv"
+    "sort"
     "time"

     "github.com/grafana/grafana/pkg/services/ngalert/models"
 )

-func (sch *schedule) getAlertRules(ctx context.Context, disabledOrgs []int64) []*models.SchedulableAlertRule {
+// hashUIDs returns a fnv64 hash of the UIDs for all alert rules.
+// The order of the alert rules does not matter as hashUIDs sorts
+// the UIDs in increasing order.
+func hashUIDs(alertRules []*models.SchedulableAlertRule) uint64 {
+    h := fnv.New64()
+    for _, uid := range sortedUIDs(alertRules) {
+        // We can ignore err as fnv64 does not return an error
+        // nolint:errcheck,gosec
+        h.Write([]byte(uid))
+    }
+    return h.Sum64()
+}
+
+// sortedUIDs returns a slice of sorted UIDs.
+func sortedUIDs(alertRules []*models.SchedulableAlertRule) []string {
+    uids := make([]string, 0, len(alertRules))
+    for _, alertRule := range alertRules {
+        uids = append(uids, alertRule.UID)
+    }
+    sort.Strings(uids)
+    return uids
+}
+
+// updateSchedulableAlertRules updates the alert rules for the scheduler.
+// It returns an error if the database is unavailable or the query returned
+// an error.
+func (sch *schedule) updateSchedulableAlertRules(ctx context.Context, disabledOrgs []int64) error {
     start := time.Now()
     defer func() {
-        sch.metrics.GetAlertRulesDuration.Observe(time.Since(start).Seconds())
+        sch.metrics.UpdateSchedulableAlertRulesDuration.Observe(
+            time.Since(start).Seconds())
     }()

     q := models.GetAlertRulesForSchedulingQuery{
         ExcludeOrgIDs: disabledOrgs,
     }
-    err := sch.ruleStore.GetAlertRulesForScheduling(ctx, &q)
-    if err != nil {
-        sch.log.Error("failed to fetch alert definitions", "err", err)
-        return nil
+    if err := sch.ruleStore.GetAlertRulesForScheduling(ctx, &q); err != nil {
+        return fmt.Errorf("failed to get alert rules: %w", err)
     }
-    return q.Result
+    sch.schedulableAlertRules.set(q.Result)
+    sch.metrics.SchedulableAlertRules.Set(float64(len(q.Result)))
+    sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(q.Result)))
+    return nil
 }
pkg/services/ngalert/schedule/fetcher_test.go (new file, 26 lines)
@@ -0,0 +1,26 @@
+package schedule
+
+import (
+    "testing"
+
+    "github.com/stretchr/testify/assert"
+
+    "github.com/grafana/grafana/pkg/services/ngalert/models"
+)
+
+func TestHashUIDs(t *testing.T) {
+    r := []*models.SchedulableAlertRule{{UID: "foo"}, {UID: "bar"}}
+    assert.Equal(t, uint64(0xade76f55c76a1c48), hashUIDs(r))
+    // expect the same hash irrespective of order
+    r = []*models.SchedulableAlertRule{{UID: "bar"}, {UID: "foo"}}
+    assert.Equal(t, uint64(0xade76f55c76a1c48), hashUIDs(r))
+    // expect a different hash
+    r = []*models.SchedulableAlertRule{{UID: "bar"}}
+    assert.Equal(t, uint64(0xd8d9a5186bad3880), hashUIDs(r))
+    // slice with no items
+    r = []*models.SchedulableAlertRule{}
+    assert.Equal(t, uint64(0xcbf29ce484222325), hashUIDs(r))
+    // a different slice with no items should have the same hash
+    r = []*models.SchedulableAlertRule{}
+    assert.Equal(t, uint64(0xcbf29ce484222325), hashUIDs(r))
+}
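A note on the expected values in TestHashUIDs: hashUIDs sorts the UIDs before hashing, so {foo, bar} and {bar, foo} produce the same digest, and an empty slice hashes to the FNV-64 offset basis, i.e. the sum of a fresh fnv.New64() hasher before any bytes have been written. A quick standalone check (illustrative, not part of the commit):

package main

import (
    "fmt"
    "hash/fnv"
)

func main() {
    h := fnv.New64()
    fmt.Printf("%#x\n", h.Sum64()) // 0xcbf29ce484222325, the FNV-64 offset basis
}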
@@ -9,14 +9,14 @@ import (
     "github.com/grafana/grafana/pkg/services/ngalert/models"
 )

-type alertRuleRegistry struct {
+type alertRuleInfoRegistry struct {
     mu            sync.Mutex
     alertRuleInfo map[models.AlertRuleKey]*alertRuleInfo
 }

 // getOrCreateInfo gets rule routine information from registry by the key. If it does not exist, it creates a new one.
 // Returns a pointer to the rule routine information and a flag that indicates whether it is a new struct or not.
-func (r *alertRuleRegistry) getOrCreateInfo(context context.Context, key models.AlertRuleKey) (*alertRuleInfo, bool) {
+func (r *alertRuleInfoRegistry) getOrCreateInfo(context context.Context, key models.AlertRuleKey) (*alertRuleInfo, bool) {
     r.mu.Lock()
     defer r.mu.Unlock()

@@ -30,7 +30,7 @@ func (r *alertRuleRegistry) getOrCreateInfo(context context.Context, key models.

 // get returns the channel for the specific alert rule
 // if the key does not exist returns an error
-func (r *alertRuleRegistry) get(key models.AlertRuleKey) (*alertRuleInfo, error) {
+func (r *alertRuleInfoRegistry) get(key models.AlertRuleKey) (*alertRuleInfo, error) {
     r.mu.Lock()
     defer r.mu.Unlock()

@@ -41,7 +41,7 @@ func (r *alertRuleRegistry) get(key models.AlertRuleKey) (*alertRuleInfo, error)
     return info, nil
 }

-func (r *alertRuleRegistry) exists(key models.AlertRuleKey) bool {
+func (r *alertRuleInfoRegistry) exists(key models.AlertRuleKey) bool {
     r.mu.Lock()
     defer r.mu.Unlock()

@@ -52,7 +52,7 @@ func (r *alertRuleRegistry) exists(key models.AlertRuleKey) bool {
 // del removes pair that has specific key from alertRuleInfo.
 // Returns 2-tuple where the first element is value of the removed pair
 // and the second element indicates whether element with the specified key existed.
-func (r *alertRuleRegistry) del(key models.AlertRuleKey) (*alertRuleInfo, bool) {
+func (r *alertRuleInfoRegistry) del(key models.AlertRuleKey) (*alertRuleInfo, bool) {
     r.mu.Lock()
     defer r.mu.Unlock()
     info, ok := r.alertRuleInfo[key]
@@ -62,7 +62,7 @@ func (r *alertRuleRegistry) del(key models.AlertRuleKey) (*alertRuleInfo, bool)
     return info, ok
 }

-func (r *alertRuleRegistry) keyMap() map[models.AlertRuleKey]struct{} {
+func (r *alertRuleInfoRegistry) keyMap() map[models.AlertRuleKey]struct{} {
     r.mu.Lock()
     defer r.mu.Unlock()
     definitionsIDs := make(map[models.AlertRuleKey]struct{}, len(r.alertRuleInfo))
@@ -111,3 +111,52 @@ type evaluation struct {
     scheduledAt time.Time
     version     int64
 }
+
+type schedulableAlertRulesRegistry struct {
+    rules map[models.AlertRuleKey]*models.SchedulableAlertRule
+    mu    sync.Mutex
+}
+
+// all returns all rules in the registry.
+func (r *schedulableAlertRulesRegistry) all() []*models.SchedulableAlertRule {
+    r.mu.Lock()
+    defer r.mu.Unlock()
+    result := make([]*models.SchedulableAlertRule, 0, len(r.rules))
+    for _, rule := range r.rules {
+        result = append(result, rule)
+    }
+    return result
+}
+
+func (r *schedulableAlertRulesRegistry) get(k models.AlertRuleKey) *models.SchedulableAlertRule {
+    r.mu.Lock()
+    defer r.mu.Unlock()
+    return r.rules[k]
+}
+
+// set replaces all rules in the registry.
+func (r *schedulableAlertRulesRegistry) set(rules []*models.SchedulableAlertRule) {
+    r.mu.Lock()
+    defer r.mu.Unlock()
+    r.rules = make(map[models.AlertRuleKey]*models.SchedulableAlertRule)
+    for _, rule := range rules {
+        r.rules[rule.GetKey()] = rule
+    }
+}
+
+// update inserts or replaces a rule in the registry.
+func (r *schedulableAlertRulesRegistry) update(rule *models.SchedulableAlertRule) {
+    r.mu.Lock()
+    defer r.mu.Unlock()
+    r.rules[rule.GetKey()] = rule
+}
+
+func (r *schedulableAlertRulesRegistry) del(k models.AlertRuleKey) (*models.SchedulableAlertRule, bool) {
+    r.mu.Lock()
+    defer r.mu.Unlock()
+    rule, ok := r.rules[k]
+    if ok {
+        delete(r.rules, k)
+    }
+    return rule, ok
+}
@@ -8,7 +8,10 @@ import (
     "testing"
     "time"

+    "github.com/stretchr/testify/assert"
     "github.com/stretchr/testify/require"
+
+    "github.com/grafana/grafana/pkg/services/ngalert/models"
 )

 func TestSchedule_alertRuleInfo(t *testing.T) {
@@ -116,3 +119,54 @@ func TestSchedule_alertRuleInfo(t *testing.T) {
         wg.Wait()
     })
 }
+
+func TestSchedulableAlertRulesRegistry(t *testing.T) {
+    r := schedulableAlertRulesRegistry{rules: make(map[models.AlertRuleKey]*models.SchedulableAlertRule)}
+    assert.Len(t, r.all(), 0)
+
+    // replace all rules in the registry with foo
+    r.set([]*models.SchedulableAlertRule{{OrgID: 1, UID: "foo", Version: 1}})
+    assert.Len(t, r.all(), 1)
+    foo := r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"})
+    require.NotNil(t, foo)
+    assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 1}, *foo)
+
+    // update foo to a newer version
+    r.update(&models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 2})
+    assert.Len(t, r.all(), 1)
+    foo = r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"})
+    require.NotNil(t, foo)
+    assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 2}, *foo)
+
+    // update bar which does not exist in the registry
+    r.update(&models.SchedulableAlertRule{OrgID: 1, UID: "bar", Version: 1})
+    assert.Len(t, r.all(), 2)
+    foo = r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"})
+    require.NotNil(t, foo)
+    assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 2}, *foo)
+    bar := r.get(models.AlertRuleKey{OrgID: 1, UID: "bar"})
+    require.NotNil(t, foo)
+    assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "bar", Version: 1}, *bar)
+
+    // replace all rules in the registry with baz
+    r.set([]*models.SchedulableAlertRule{{OrgID: 1, UID: "baz", Version: 1}})
+    assert.Len(t, r.all(), 1)
+    baz := r.get(models.AlertRuleKey{OrgID: 1, UID: "baz"})
+    require.NotNil(t, baz)
+    assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "baz", Version: 1}, *baz)
+    assert.Nil(t, r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"}))
+    assert.Nil(t, r.get(models.AlertRuleKey{OrgID: 1, UID: "bar"}))
+
+    // delete baz
+    deleted, ok := r.del(models.AlertRuleKey{OrgID: 1, UID: "baz"})
+    assert.True(t, ok)
+    require.NotNil(t, deleted)
+    assert.Equal(t, *deleted, *baz)
+    assert.Len(t, r.all(), 0)
+    assert.Nil(t, r.get(models.AlertRuleKey{OrgID: 1, UID: "baz"}))
+
+    // baz cannot be deleted twice
+    deleted, ok = r.del(models.AlertRuleKey{OrgID: 1, UID: "baz"})
+    assert.False(t, ok)
+    assert.Nil(t, deleted)
+}
@@ -54,7 +54,7 @@ type schedule struct {
     baseInterval time.Duration

     // each alert rule gets its own channel and routine
-    registry alertRuleRegistry
+    registry alertRuleInfoRegistry

     maxAttempts int64

@@ -97,6 +97,12 @@ type schedule struct {
     adminConfigPollInterval time.Duration
     disabledOrgs            map[int64]struct{}
     minRuleInterval         time.Duration
+
+    // schedulableAlertRules contains the alert rules that are considered for
+    // evaluation in the current tick. The evaluation of an alert rule in the
+    // current tick depends on its evaluation interval and when it was
+    // last evaluated.
+    schedulableAlertRules schedulableAlertRulesRegistry
 }

 // SchedulerCfg is the scheduler configuration.
@@ -124,7 +130,7 @@ func NewScheduler(cfg SchedulerCfg, expressionService *expr.Service, appURL *url
     ticker := alerting.NewTicker(cfg.C, cfg.BaseInterval, cfg.Metrics.Ticker)

     sch := schedule{
-        registry:     alertRuleRegistry{alertRuleInfo: make(map[models.AlertRuleKey]*alertRuleInfo)},
+        registry:     alertRuleInfoRegistry{alertRuleInfo: make(map[models.AlertRuleKey]*alertRuleInfo)},
         maxAttempts:  cfg.MaxAttempts,
         clock:        cfg.C,
         baseInterval: cfg.BaseInterval,
@@ -148,6 +154,7 @@ func NewScheduler(cfg SchedulerCfg, expressionService *expr.Service, appURL *url
         adminConfigPollInterval: cfg.AdminConfigPollInterval,
         disabledOrgs:            cfg.DisabledOrgs,
         minRuleInterval:         cfg.MinRuleInterval,
+        schedulableAlertRules:   schedulableAlertRulesRegistry{rules: make(map[models.AlertRuleKey]*models.SchedulableAlertRule)},
     }
     return &sch
 }
@@ -316,9 +323,17 @@ func (sch *schedule) UpdateAlertRule(key models.AlertRuleKey) {

 // DeleteAlertRule stops evaluation of the rule, deletes it from active rules, and cleans up state cache.
 func (sch *schedule) DeleteAlertRule(key models.AlertRuleKey) {
+    // It can happen that the scheduler has deleted the alert rule before the
+    // Ruler API has called DeleteAlertRule. This can happen as requests to
+    // the Ruler API do not hold an exclusive lock over all scheduler operations.
+    if _, ok := sch.schedulableAlertRules.del(key); !ok {
+        sch.log.Info("alert rule cannot be removed from the scheduler as it is not scheduled", "uid", key.UID, "org_id", key.OrgID)
+    }
+
+    // Delete the rule routine
     ruleInfo, ok := sch.registry.del(key)
     if !ok {
-        sch.log.Info("unable to delete alert rule routine information by key", "uid", key.UID, "org_id", key.OrgID)
+        sch.log.Info("alert rule cannot be stopped as it is not running", "uid", key.UID, "org_id", key.OrgID)
         return
     }
     // stop rule evaluation
@@ -364,7 +379,11 @@ func (sch *schedule) schedulePeriodic(ctx context.Context) error {
                 disabledOrgs = append(disabledOrgs, disabledOrg)
             }

-            alertRules := sch.getAlertRules(ctx, disabledOrgs)
+            if err := sch.updateSchedulableAlertRules(ctx, disabledOrgs); err != nil {
+                sch.log.Error("scheduler failed to update alert rules", "err", err)
+            }
+            alertRules := sch.schedulableAlertRules.all()
+
             sch.log.Debug("alert rules fetched", "count", len(alertRules), "disabled_orgs", disabledOrgs)

             // registeredDefinitions is a map used for finding deleted alert rules