Alerting: Fix database unavailability removing rules from the scheduler (#49874)

George Robinson 2022-06-07 16:20:06 +01:00 committed by GitHub
parent ae449cc823
commit c83f84348c
7 changed files with 231 additions and 34 deletions


@ -45,6 +45,7 @@ Scopes must have an order to ensure consistency and ease of search, this helps u
## Grafana Alerting - main / unreleased
- [ENHANCEMENT] Scheduler: Adds new metrics to track rules that might be scheduled.
- [ENHANCEMENT] Scheduler: Ticker exposes new metrics. In legacy alerting, the metrics are prefixed with `legacy_`. #47828, #48190
- `grafana_alerting_ticker_last_consumed_tick_timestamp_seconds`
- `grafana_alerting_ticker_next_tick_timestamp_seconds`


@ -46,14 +46,16 @@ type NGAlert struct {
}
type Scheduler struct {
Registerer prometheus.Registerer
BehindSeconds prometheus.Gauge
EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec
EvalDuration *prometheus.SummaryVec
GetAlertRulesDuration prometheus.Histogram
SchedulePeriodicDuration prometheus.Histogram
Ticker *legacyMetrics.Ticker
Registerer prometheus.Registerer
BehindSeconds prometheus.Gauge
EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec
EvalDuration *prometheus.SummaryVec
SchedulePeriodicDuration prometheus.Histogram
SchedulableAlertRules prometheus.Gauge
SchedulableAlertRulesHash prometheus.Gauge
UpdateSchedulableAlertRulesDuration prometheus.Histogram
Ticker *legacyMetrics.Ticker
}
type MultiOrgAlertmanager struct {
@ -163,15 +165,6 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
},
[]string{"org"},
),
GetAlertRulesDuration: promauto.With(r).NewHistogram(
prometheus.HistogramOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "get_alert_rules_duration_seconds",
Help: "The time taken to get all alert rules.",
Buckets: []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
},
),
SchedulePeriodicDuration: promauto.With(r).NewHistogram(
prometheus.HistogramOpts{
Namespace: Namespace,
@ -181,6 +174,30 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
Buckets: []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
},
),
SchedulableAlertRules: promauto.With(r).NewGauge(
prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "schedule_alert_rules",
Help: "The number of alert rules being considered for evaluation each tick.",
},
),
SchedulableAlertRulesHash: promauto.With(r).NewGauge(
prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "schedule_alert_rules_hash",
Help: "A hash of the alert rules over time.",
}),
UpdateSchedulableAlertRulesDuration: promauto.With(r).NewHistogram(
prometheus.HistogramOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "schedule_query_alert_rules_duration_seconds",
Help: "The time taken to fetch alert rules from the database.",
Buckets: []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
},
),
Ticker: legacyMetrics.NewTickerMetrics(r),
}
}
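
The removed GetAlertRulesDuration histogram and the three new instruments above all follow the same promauto registration pattern. A minimal, self-contained sketch of that pattern (the registry, namespace, and metric names here are illustrative, not Grafana's):

package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

func main() {
	// A private registry stands in for the Registerer passed to newSchedulerMetrics.
	reg := prometheus.NewRegistry()

	// Gauge for the number of rules currently considered for evaluation (illustrative name).
	schedulableRules := promauto.With(reg).NewGauge(prometheus.GaugeOpts{
		Namespace: "example",
		Subsystem: "scheduler",
		Name:      "schedulable_rules",
		Help:      "The number of rules considered for evaluation each tick.",
	})

	// Histogram for how long fetching rules takes (illustrative name).
	fetchDuration := promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
		Namespace: "example",
		Subsystem: "scheduler",
		Name:      "fetch_rules_duration_seconds",
		Help:      "The time taken to fetch rules from the database.",
		Buckets:   []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
	})

	start := time.Now()
	// ... fetch rules from the database here ...
	fetchDuration.Observe(time.Since(start).Seconds())
	schedulableRules.Set(42)

	families, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	fmt.Println("collected metric families:", len(families))
}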


@ -2,24 +2,55 @@ package schedule
import (
"context"
"fmt"
"hash/fnv"
"sort"
"time"
"github.com/grafana/grafana/pkg/services/ngalert/models"
)
func (sch *schedule) getAlertRules(ctx context.Context, disabledOrgs []int64) []*models.SchedulableAlertRule {
// hashUIDs returns a fnv64 hash of the UIDs for all alert rules.
// The order of the alert rules does not matter as hashUIDs sorts
// the UIDs in increasing order.
func hashUIDs(alertRules []*models.SchedulableAlertRule) uint64 {
h := fnv.New64()
for _, uid := range sortedUIDs(alertRules) {
// We can ignore err as fnv64 does not return an error
// nolint:errcheck,gosec
h.Write([]byte(uid))
}
return h.Sum64()
}
// sortedUIDs returns a slice of sorted UIDs.
func sortedUIDs(alertRules []*models.SchedulableAlertRule) []string {
uids := make([]string, 0, len(alertRules))
for _, alertRule := range alertRules {
uids = append(uids, alertRule.UID)
}
sort.Strings(uids)
return uids
}
// updateSchedulableAlertRules updates the alert rules for the scheduler.
// It returns an error if the database is unavailable or the query returned
// an error.
func (sch *schedule) updateSchedulableAlertRules(ctx context.Context, disabledOrgs []int64) error {
start := time.Now()
defer func() {
sch.metrics.GetAlertRulesDuration.Observe(time.Since(start).Seconds())
sch.metrics.UpdateSchedulableAlertRulesDuration.Observe(
time.Since(start).Seconds())
}()
q := models.GetAlertRulesForSchedulingQuery{
ExcludeOrgIDs: disabledOrgs,
}
err := sch.ruleStore.GetAlertRulesForScheduling(ctx, &q)
if err != nil {
sch.log.Error("failed to fetch alert definitions", "err", err)
return nil
if err := sch.ruleStore.GetAlertRulesForScheduling(ctx, &q); err != nil {
return fmt.Errorf("failed to get alert rules: %w", err)
}
return q.Result
sch.schedulableAlertRules.set(q.Result)
sch.metrics.SchedulableAlertRules.Set(float64(len(q.Result)))
sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(q.Result)))
return nil
}
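
hashUIDs sorts the UIDs before feeding them to the fnv64 hash, so the SchedulableAlertRulesHash gauge only changes when the set of rules changes, not when the database returns them in a different order. A standalone sketch of the same idea using only the standard library (hashSortedUIDs is an illustrative helper, not part of this commit):

package main

import (
	"fmt"
	"hash/fnv"
	"sort"
)

// hashSortedUIDs sorts a copy of the UIDs and feeds them to a fnv64 hash,
// so the result does not depend on the input order.
func hashSortedUIDs(uids []string) uint64 {
	sorted := append([]string(nil), uids...)
	sort.Strings(sorted)
	h := fnv.New64()
	for _, uid := range sorted {
		_, _ = h.Write([]byte(uid)) // Write on a fnv hash never returns an error
	}
	return h.Sum64()
}

func main() {
	fmt.Printf("%#x\n", hashSortedUIDs([]string{"foo", "bar"}))
	fmt.Printf("%#x\n", hashSortedUIDs([]string{"bar", "foo"})) // same value as above
}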


@ -0,0 +1,26 @@
package schedule
import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/grafana/grafana/pkg/services/ngalert/models"
)
func TestHashUIDs(t *testing.T) {
r := []*models.SchedulableAlertRule{{UID: "foo"}, {UID: "bar"}}
assert.Equal(t, uint64(0xade76f55c76a1c48), hashUIDs(r))
// expect the same hash irrespective of order
r = []*models.SchedulableAlertRule{{UID: "bar"}, {UID: "foo"}}
assert.Equal(t, uint64(0xade76f55c76a1c48), hashUIDs(r))
// expect a different hash
r = []*models.SchedulableAlertRule{{UID: "bar"}}
assert.Equal(t, uint64(0xd8d9a5186bad3880), hashUIDs(r))
// slice with no items
r = []*models.SchedulableAlertRule{}
assert.Equal(t, uint64(0xcbf29ce484222325), hashUIDs(r))
// a different slice with no items should have the same hash
r = []*models.SchedulableAlertRule{}
assert.Equal(t, uint64(0xcbf29ce484222325), hashUIDs(r))
}


@ -9,14 +9,14 @@ import (
"github.com/grafana/grafana/pkg/services/ngalert/models"
)
type alertRuleRegistry struct {
type alertRuleInfoRegistry struct {
mu sync.Mutex
alertRuleInfo map[models.AlertRuleKey]*alertRuleInfo
}
// getOrCreateInfo gets rule routine information from the registry by key. If it does not exist, it creates a new one.
// Returns a pointer to the rule routine information and a flag that indicates whether the struct was newly created.
func (r *alertRuleRegistry) getOrCreateInfo(context context.Context, key models.AlertRuleKey) (*alertRuleInfo, bool) {
func (r *alertRuleInfoRegistry) getOrCreateInfo(context context.Context, key models.AlertRuleKey) (*alertRuleInfo, bool) {
r.mu.Lock()
defer r.mu.Unlock()
@ -30,7 +30,7 @@ func (r *alertRuleRegistry) getOrCreateInfo(context context.Context, key models.
// get returns the alertRuleInfo for the specific alert rule.
// If the key does not exist, it returns an error.
func (r *alertRuleRegistry) get(key models.AlertRuleKey) (*alertRuleInfo, error) {
func (r *alertRuleInfoRegistry) get(key models.AlertRuleKey) (*alertRuleInfo, error) {
r.mu.Lock()
defer r.mu.Unlock()
@ -41,7 +41,7 @@ func (r *alertRuleRegistry) get(key models.AlertRuleKey) (*alertRuleInfo, error)
return info, nil
}
func (r *alertRuleRegistry) exists(key models.AlertRuleKey) bool {
func (r *alertRuleInfoRegistry) exists(key models.AlertRuleKey) bool {
r.mu.Lock()
defer r.mu.Unlock()
@ -52,7 +52,7 @@ func (r *alertRuleRegistry) exists(key models.AlertRuleKey) bool {
// del removes the pair with the specified key from alertRuleInfo.
// It returns the value of the removed pair and a boolean that indicates
// whether an element with the specified key existed.
func (r *alertRuleRegistry) del(key models.AlertRuleKey) (*alertRuleInfo, bool) {
func (r *alertRuleInfoRegistry) del(key models.AlertRuleKey) (*alertRuleInfo, bool) {
r.mu.Lock()
defer r.mu.Unlock()
info, ok := r.alertRuleInfo[key]
@ -62,7 +62,7 @@ func (r *alertRuleRegistry) del(key models.AlertRuleKey) (*alertRuleInfo, bool)
return info, ok
}
func (r *alertRuleRegistry) keyMap() map[models.AlertRuleKey]struct{} {
func (r *alertRuleInfoRegistry) keyMap() map[models.AlertRuleKey]struct{} {
r.mu.Lock()
defer r.mu.Unlock()
definitionsIDs := make(map[models.AlertRuleKey]struct{}, len(r.alertRuleInfo))
@ -111,3 +111,52 @@ type evaluation struct {
scheduledAt time.Time
version int64
}
type schedulableAlertRulesRegistry struct {
rules map[models.AlertRuleKey]*models.SchedulableAlertRule
mu sync.Mutex
}
// all returns all rules in the registry.
func (r *schedulableAlertRulesRegistry) all() []*models.SchedulableAlertRule {
r.mu.Lock()
defer r.mu.Unlock()
result := make([]*models.SchedulableAlertRule, 0, len(r.rules))
for _, rule := range r.rules {
result = append(result, rule)
}
return result
}
func (r *schedulableAlertRulesRegistry) get(k models.AlertRuleKey) *models.SchedulableAlertRule {
r.mu.Lock()
defer r.mu.Unlock()
return r.rules[k]
}
// set replaces all rules in the registry.
func (r *schedulableAlertRulesRegistry) set(rules []*models.SchedulableAlertRule) {
r.mu.Lock()
defer r.mu.Unlock()
r.rules = make(map[models.AlertRuleKey]*models.SchedulableAlertRule)
for _, rule := range rules {
r.rules[rule.GetKey()] = rule
}
}
// update inserts or replaces a rule in the registry.
func (r *schedulableAlertRulesRegistry) update(rule *models.SchedulableAlertRule) {
r.mu.Lock()
defer r.mu.Unlock()
r.rules[rule.GetKey()] = rule
}
func (r *schedulableAlertRulesRegistry) del(k models.AlertRuleKey) (*models.SchedulableAlertRule, bool) {
r.mu.Lock()
defer r.mu.Unlock()
rule, ok := r.rules[k]
if ok {
delete(r.rules, k)
}
return rule, ok
}
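
schedulableAlertRulesRegistry is a plain mutex-guarded map, so set, all, update, and del can be called concurrently, e.g. from the scheduler loop and from DeleteAlertRule. A reduced sketch of that pattern with hypothetical stand-in types (ruleKey and rule are not Grafana types), exercised from two goroutines:

package main

import (
	"fmt"
	"sync"
)

// ruleKey and rule are simplified stand-ins for models.AlertRuleKey and
// models.SchedulableAlertRule.
type ruleKey struct {
	OrgID int64
	UID   string
}

type rule struct {
	Key     ruleKey
	Version int64
}

// registry mirrors the mutex-guarded map used by schedulableAlertRulesRegistry.
type registry struct {
	mu    sync.Mutex
	rules map[ruleKey]*rule
}

// set replaces every rule in the registry, as done after a successful fetch.
func (r *registry) set(rules []*rule) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.rules = make(map[ruleKey]*rule, len(rules))
	for _, ru := range rules {
		r.rules[ru.Key] = ru
	}
}

// all returns a snapshot of the rules, as read once per scheduler tick.
func (r *registry) all() []*rule {
	r.mu.Lock()
	defer r.mu.Unlock()
	out := make([]*rule, 0, len(r.rules))
	for _, ru := range r.rules {
		out = append(out, ru)
	}
	return out
}

func main() {
	r := &registry{rules: map[ruleKey]*rule{}}

	var wg sync.WaitGroup
	wg.Add(2)
	// One goroutine replaces the rules while another reads them; the mutex
	// keeps the map access safe.
	go func() {
		defer wg.Done()
		for i := int64(1); i <= 100; i++ {
			r.set([]*rule{{Key: ruleKey{OrgID: 1, UID: "foo"}, Version: i}})
		}
	}()
	go func() {
		defer wg.Done()
		for i := 0; i < 100; i++ {
			_ = r.all()
		}
	}()
	wg.Wait()

	fmt.Println("rules in registry:", len(r.all())) // 1
}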


@ -8,7 +8,10 @@ import (
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/grafana/grafana/pkg/services/ngalert/models"
)
func TestSchedule_alertRuleInfo(t *testing.T) {
@ -116,3 +119,54 @@ func TestSchedule_alertRuleInfo(t *testing.T) {
wg.Wait()
})
}
func TestSchedulableAlertRulesRegistry(t *testing.T) {
r := schedulableAlertRulesRegistry{rules: make(map[models.AlertRuleKey]*models.SchedulableAlertRule)}
assert.Len(t, r.all(), 0)
// replace all rules in the registry with foo
r.set([]*models.SchedulableAlertRule{{OrgID: 1, UID: "foo", Version: 1}})
assert.Len(t, r.all(), 1)
foo := r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"})
require.NotNil(t, foo)
assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 1}, *foo)
// update foo to a newer version
r.update(&models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 2})
assert.Len(t, r.all(), 1)
foo = r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"})
require.NotNil(t, foo)
assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 2}, *foo)
// update bar which does not exist in the registry
r.update(&models.SchedulableAlertRule{OrgID: 1, UID: "bar", Version: 1})
assert.Len(t, r.all(), 2)
foo = r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"})
require.NotNil(t, foo)
assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 2}, *foo)
bar := r.get(models.AlertRuleKey{OrgID: 1, UID: "bar"})
require.NotNil(t, bar)
assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "bar", Version: 1}, *bar)
// replace all rules in the registry with baz
r.set([]*models.SchedulableAlertRule{{OrgID: 1, UID: "baz", Version: 1}})
assert.Len(t, r.all(), 1)
baz := r.get(models.AlertRuleKey{OrgID: 1, UID: "baz"})
require.NotNil(t, baz)
assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "baz", Version: 1}, *baz)
assert.Nil(t, r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"}))
assert.Nil(t, r.get(models.AlertRuleKey{OrgID: 1, UID: "bar"}))
// delete baz
deleted, ok := r.del(models.AlertRuleKey{OrgID: 1, UID: "baz"})
assert.True(t, ok)
require.NotNil(t, deleted)
assert.Equal(t, *deleted, *baz)
assert.Len(t, r.all(), 0)
assert.Nil(t, r.get(models.AlertRuleKey{OrgID: 1, UID: "baz"}))
// baz cannot be deleted twice
deleted, ok = r.del(models.AlertRuleKey{OrgID: 1, UID: "baz"})
assert.False(t, ok)
assert.Nil(t, deleted)
}


@ -54,7 +54,7 @@ type schedule struct {
baseInterval time.Duration
// each alert rule gets its own channel and routine
registry alertRuleRegistry
registry alertRuleInfoRegistry
maxAttempts int64
@ -97,6 +97,12 @@ type schedule struct {
adminConfigPollInterval time.Duration
disabledOrgs map[int64]struct{}
minRuleInterval time.Duration
// schedulableAlertRules contains the alert rules that are considered for
// evaluation in the current tick. The evaluation of an alert rule in the
// current tick depends on its evaluation interval and when it was
// last evaluated.
schedulableAlertRules schedulableAlertRulesRegistry
}
// SchedulerCfg is the scheduler configuration.
@ -124,7 +130,7 @@ func NewScheduler(cfg SchedulerCfg, expressionService *expr.Service, appURL *url
ticker := alerting.NewTicker(cfg.C, cfg.BaseInterval, cfg.Metrics.Ticker)
sch := schedule{
registry: alertRuleRegistry{alertRuleInfo: make(map[models.AlertRuleKey]*alertRuleInfo)},
registry: alertRuleInfoRegistry{alertRuleInfo: make(map[models.AlertRuleKey]*alertRuleInfo)},
maxAttempts: cfg.MaxAttempts,
clock: cfg.C,
baseInterval: cfg.BaseInterval,
@ -148,6 +154,7 @@ func NewScheduler(cfg SchedulerCfg, expressionService *expr.Service, appURL *url
adminConfigPollInterval: cfg.AdminConfigPollInterval,
disabledOrgs: cfg.DisabledOrgs,
minRuleInterval: cfg.MinRuleInterval,
schedulableAlertRules: schedulableAlertRulesRegistry{rules: make(map[models.AlertRuleKey]*models.SchedulableAlertRule)},
}
return &sch
}
@ -316,9 +323,17 @@ func (sch *schedule) UpdateAlertRule(key models.AlertRuleKey) {
// DeleteAlertRule stops evaluation of the rule, deletes it from active rules, and cleans up state cache.
func (sch *schedule) DeleteAlertRule(key models.AlertRuleKey) {
// It can happen that the scheduler has deleted the alert rule before the
// Ruler API has called DeleteAlertRule, because requests to the Ruler API
// do not hold an exclusive lock over all scheduler operations.
if _, ok := sch.schedulableAlertRules.del(key); !ok {
sch.log.Info("alert rule cannot be removed from the scheduler as it is not scheduled", "uid", key.UID, "org_id", key.OrgID)
}
// Delete the rule routine
ruleInfo, ok := sch.registry.del(key)
if !ok {
sch.log.Info("unable to delete alert rule routine information by key", "uid", key.UID, "org_id", key.OrgID)
sch.log.Info("alert rule cannot be stopped as it is not running", "uid", key.UID, "org_id", key.OrgID)
return
}
// stop rule evaluation
@ -364,7 +379,11 @@ func (sch *schedule) schedulePeriodic(ctx context.Context) error {
disabledOrgs = append(disabledOrgs, disabledOrg)
}
alertRules := sch.getAlertRules(ctx, disabledOrgs)
if err := sch.updateSchedulableAlertRules(ctx, disabledOrgs); err != nil {
sch.log.Error("scheduler failed to update alert rules", "err", err)
}
alertRules := sch.schedulableAlertRules.all()
sch.log.Debug("alert rules fetched", "count", len(alertRules), "disabled_orgs", disabledOrgs)
// registeredDefinitions is a map used for finding deleted alert rules
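
The last hunk is the heart of the fix: a failed refresh is now only logged, and the tick proceeds with the rules already held in schedulableAlertRules, whereas the removed getAlertRules swallowed the error and returned nil, so a database outage looked like "no rules" and everything was unscheduled. A minimal sketch of that fallback pattern (types and names are illustrative, not Grafana's):

package main

import (
	"context"
	"errors"
	"fmt"
)

type rule struct{ UID string }

// cache is an illustrative stand-in for the schedulable rules registry.
type cache struct{ rules []rule }

func (c *cache) set(rules []rule) { c.rules = rules }
func (c *cache) all() []rule      { return c.rules }

// refresh replaces the cache only when the fetch succeeds; on error the
// cache is left untouched so the caller keeps using the old rules.
func refresh(ctx context.Context, c *cache, fetch func(context.Context) ([]rule, error)) error {
	rules, err := fetch(ctx)
	if err != nil {
		return fmt.Errorf("failed to get alert rules: %w", err)
	}
	c.set(rules)
	return nil
}

func main() {
	c := &cache{}
	c.set([]rule{{UID: "foo"}, {UID: "bar"}})

	failing := func(context.Context) ([]rule, error) {
		return nil, errors.New("database unavailable")
	}

	if err := refresh(context.Background(), c, failing); err != nil {
		fmt.Println("refresh failed, keeping cached rules:", err)
	}
	fmt.Println("rules still scheduled:", len(c.all())) // 2
}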