Mirror of https://github.com/grafana/grafana.git, synced 2025-02-25 18:55:37 -06:00
Alerting: Fix database unavailable removes rules from scheduler (#49874)
This commit is contained in:
parent ae449cc823
commit c83f84348c
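Before this change, a database error while fetching alert rules made getAlertRules log the error and return nil, so the scheduler treated every rule as deleted and stopped evaluating them. The diff below propagates the error instead and keeps the last successfully fetched rules cached in the scheduler. The following standalone Go sketch is illustrative only (ruleCache and fetchRules are hypothetical names, not Grafana code) and shows the pattern the commit introduces.

package main

import (
	"errors"
	"fmt"
)

type rule struct{ UID string }

// ruleCache keeps the last successfully fetched rules (hypothetical name).
type ruleCache struct{ rules []rule }

func (c *ruleCache) set(rules []rule) { c.rules = rules }
func (c *ruleCache) all() []rule      { return c.rules }

// update refreshes the cache; on error the cached rules are left untouched.
func (c *ruleCache) update(fetch func() ([]rule, error)) error {
	rules, err := fetch()
	if err != nil {
		return fmt.Errorf("failed to get alert rules: %w", err)
	}
	c.set(rules)
	return nil
}

func main() {
	c := &ruleCache{}
	_ = c.update(func() ([]rule, error) { return []rule{{UID: "foo"}}, nil })

	// The database becomes unavailable: the caller logs the error and keeps
	// evaluating the previously fetched rules instead of stopping them all.
	if err := c.update(func() ([]rule, error) { return nil, errors.New("database unavailable") }); err != nil {
		fmt.Println("scheduler failed to update alert rules:", err)
	}
	fmt.Println("still scheduling", len(c.all()), "rule(s)") // prints: still scheduling 1 rule(s)
}

The real change implements the same idea with a schedulableAlertRulesRegistry cache and an updateSchedulableAlertRules method that returns the error to schedulePeriodic, as shown in the hunks below.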
@@ -45,6 +45,7 @@ Scopes must have an order to ensure consistency and ease of search, this helps u

 ## Grafana Alerting - main / unreleased

+- [ENHANCEMENT] Scheduler: Adds new metrics to track rules that might be scheduled.
 - [ENHANCEMENT] Scheduler: Ticker expose new metrics. In legacy, metrics are prefixed with `legacy_` #47828, #48190
   - `grafana_alerting_ticker_last_consumed_tick_timestamp_seconds`
   - `grafana_alerting_ticker_next_tick_timestamp_seconds`
@@ -46,14 +46,16 @@ type NGAlert struct {
 }

 type Scheduler struct {
-	Registerer               prometheus.Registerer
-	BehindSeconds            prometheus.Gauge
-	EvalTotal                *prometheus.CounterVec
-	EvalFailures             *prometheus.CounterVec
-	EvalDuration             *prometheus.SummaryVec
-	GetAlertRulesDuration    prometheus.Histogram
-	SchedulePeriodicDuration prometheus.Histogram
-	Ticker                   *legacyMetrics.Ticker
+	Registerer                          prometheus.Registerer
+	BehindSeconds                       prometheus.Gauge
+	EvalTotal                           *prometheus.CounterVec
+	EvalFailures                        *prometheus.CounterVec
+	EvalDuration                        *prometheus.SummaryVec
+	SchedulePeriodicDuration            prometheus.Histogram
+	SchedulableAlertRules               prometheus.Gauge
+	SchedulableAlertRulesHash           prometheus.Gauge
+	UpdateSchedulableAlertRulesDuration prometheus.Histogram
+	Ticker                              *legacyMetrics.Ticker
 }

 type MultiOrgAlertmanager struct {
@@ -163,15 +165,6 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
 			},
 			[]string{"org"},
 		),
-		GetAlertRulesDuration: promauto.With(r).NewHistogram(
-			prometheus.HistogramOpts{
-				Namespace: Namespace,
-				Subsystem: Subsystem,
-				Name:      "get_alert_rules_duration_seconds",
-				Help:      "The time taken to get all alert rules.",
-				Buckets:   []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
-			},
-		),
 		SchedulePeriodicDuration: promauto.With(r).NewHistogram(
 			prometheus.HistogramOpts{
 				Namespace: Namespace,
@@ -181,6 +174,30 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
 				Buckets:   []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
 			},
 		),
+		SchedulableAlertRules: promauto.With(r).NewGauge(
+			prometheus.GaugeOpts{
+				Namespace: Namespace,
+				Subsystem: Subsystem,
+				Name:      "schedule_alert_rules",
+				Help:      "The number of alert rules being considered for evaluation each tick.",
+			},
+		),
+		SchedulableAlertRulesHash: promauto.With(r).NewGauge(
+			prometheus.GaugeOpts{
+				Namespace: Namespace,
+				Subsystem: Subsystem,
+				Name:      "schedule_alert_rules_hash",
+				Help:      "A hash of the alert rules over time.",
+			}),
+		UpdateSchedulableAlertRulesDuration: promauto.With(r).NewHistogram(
+			prometheus.HistogramOpts{
+				Namespace: Namespace,
+				Subsystem: Subsystem,
+				Name:      "schedule_query_alert_rules_duration_seconds",
+				Help:      "The time taken to fetch alert rules from the database.",
+				Buckets:   []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
+			},
+		),
 		Ticker: legacyMetrics.NewTickerMetrics(r),
 	}
 }
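Not part of the commit: a minimal sketch of how the new scheduler gauges could be sanity-checked in a test, assuming the metrics above live in a package named metrics (so newSchedulerMetrics is visible in-package) and using prometheus/client_golang's testutil helper.

package metrics

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
	"github.com/stretchr/testify/assert"
)

// Hypothetical in-package test: read the new gauges back after setting them,
// as the scheduler would do on every tick.
func TestSchedulableAlertRuleGauges(t *testing.T) {
	r := prometheus.NewRegistry()
	m := newSchedulerMetrics(r)

	// Simulate a tick that saw three schedulable rules with a known UID hash.
	m.SchedulableAlertRules.Set(3)
	m.SchedulableAlertRulesHash.Set(float64(0xade76f55c76a1c48))

	assert.Equal(t, 3.0, testutil.ToFloat64(m.SchedulableAlertRules))
	assert.Equal(t, float64(0xade76f55c76a1c48), testutil.ToFloat64(m.SchedulableAlertRulesHash))
}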
@@ -2,24 +2,55 @@ package schedule

 import (
 	"context"
+	"fmt"
+	"hash/fnv"
+	"sort"
 	"time"

 	"github.com/grafana/grafana/pkg/services/ngalert/models"
 )

-func (sch *schedule) getAlertRules(ctx context.Context, disabledOrgs []int64) []*models.SchedulableAlertRule {
+// hashUIDs returns a fnv64 hash of the UIDs for all alert rules.
+// The order of the alert rules does not matter as hashUIDs sorts
+// the UIDs in increasing order.
+func hashUIDs(alertRules []*models.SchedulableAlertRule) uint64 {
+	h := fnv.New64()
+	for _, uid := range sortedUIDs(alertRules) {
+		// We can ignore err as fnv64 does not return an error
+		// nolint:errcheck,gosec
+		h.Write([]byte(uid))
+	}
+	return h.Sum64()
+}
+
+// sortedUIDs returns a slice of sorted UIDs.
+func sortedUIDs(alertRules []*models.SchedulableAlertRule) []string {
+	uids := make([]string, 0, len(alertRules))
+	for _, alertRule := range alertRules {
+		uids = append(uids, alertRule.UID)
+	}
+	sort.Strings(uids)
+	return uids
+}
+
+// updateSchedulableAlertRules updates the alert rules for the scheduler.
+// It returns an error if the database is unavailable or the query returned
+// an error.
+func (sch *schedule) updateSchedulableAlertRules(ctx context.Context, disabledOrgs []int64) error {
 	start := time.Now()
 	defer func() {
-		sch.metrics.GetAlertRulesDuration.Observe(time.Since(start).Seconds())
+		sch.metrics.UpdateSchedulableAlertRulesDuration.Observe(
+			time.Since(start).Seconds())
 	}()

 	q := models.GetAlertRulesForSchedulingQuery{
 		ExcludeOrgIDs: disabledOrgs,
 	}
-	err := sch.ruleStore.GetAlertRulesForScheduling(ctx, &q)
-	if err != nil {
-		sch.log.Error("failed to fetch alert definitions", "err", err)
-		return nil
+	if err := sch.ruleStore.GetAlertRulesForScheduling(ctx, &q); err != nil {
+		return fmt.Errorf("failed to get alert rules: %w", err)
 	}
-	return q.Result
+	sch.schedulableAlertRules.set(q.Result)
+	sch.metrics.SchedulableAlertRules.Set(float64(len(q.Result)))
+	sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(q.Result)))
+	return nil
 }
pkg/services/ngalert/schedule/fetcher_test.go (new file, 26 lines)
@@ -0,0 +1,26 @@
+package schedule
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+
+	"github.com/grafana/grafana/pkg/services/ngalert/models"
+)
+
+func TestHashUIDs(t *testing.T) {
+	r := []*models.SchedulableAlertRule{{UID: "foo"}, {UID: "bar"}}
+	assert.Equal(t, uint64(0xade76f55c76a1c48), hashUIDs(r))
+	// expect the same hash irrespective of order
+	r = []*models.SchedulableAlertRule{{UID: "bar"}, {UID: "foo"}}
+	assert.Equal(t, uint64(0xade76f55c76a1c48), hashUIDs(r))
+	// expect a different hash
+	r = []*models.SchedulableAlertRule{{UID: "bar"}}
+	assert.Equal(t, uint64(0xd8d9a5186bad3880), hashUIDs(r))
+	// slice with no items
+	r = []*models.SchedulableAlertRule{}
+	assert.Equal(t, uint64(0xcbf29ce484222325), hashUIDs(r))
+	// a different slice with no items should have the same hash
+	r = []*models.SchedulableAlertRule{}
+	assert.Equal(t, uint64(0xcbf29ce484222325), hashUIDs(r))
+}
@@ -9,14 +9,14 @@ import (
 	"github.com/grafana/grafana/pkg/services/ngalert/models"
 )

-type alertRuleRegistry struct {
+type alertRuleInfoRegistry struct {
 	mu            sync.Mutex
 	alertRuleInfo map[models.AlertRuleKey]*alertRuleInfo
 }

 // getOrCreateInfo gets rule routine information from registry by the key. If it does not exist, it creates a new one.
 // Returns a pointer to the rule routine information and a flag that indicates whether it is a new struct or not.
-func (r *alertRuleRegistry) getOrCreateInfo(context context.Context, key models.AlertRuleKey) (*alertRuleInfo, bool) {
+func (r *alertRuleInfoRegistry) getOrCreateInfo(context context.Context, key models.AlertRuleKey) (*alertRuleInfo, bool) {
 	r.mu.Lock()
 	defer r.mu.Unlock()

@@ -30,7 +30,7 @@ func (r *alertRuleRegistry) getOrCreateInfo(context context.Context, key models.

 // get returns the channel for the specific alert rule
 // if the key does not exist returns an error
-func (r *alertRuleRegistry) get(key models.AlertRuleKey) (*alertRuleInfo, error) {
+func (r *alertRuleInfoRegistry) get(key models.AlertRuleKey) (*alertRuleInfo, error) {
 	r.mu.Lock()
 	defer r.mu.Unlock()

@@ -41,7 +41,7 @@ func (r *alertRuleRegistry) get(key models.AlertRuleKey) (*alertRuleInfo, error)
 	return info, nil
 }

-func (r *alertRuleRegistry) exists(key models.AlertRuleKey) bool {
+func (r *alertRuleInfoRegistry) exists(key models.AlertRuleKey) bool {
 	r.mu.Lock()
 	defer r.mu.Unlock()

@@ -52,7 +52,7 @@ func (r *alertRuleRegistry) exists(key models.AlertRuleKey) bool {
 // del removes pair that has specific key from alertRuleInfo.
 // Returns 2-tuple where the first element is value of the removed pair
 // and the second element indicates whether element with the specified key existed.
-func (r *alertRuleRegistry) del(key models.AlertRuleKey) (*alertRuleInfo, bool) {
+func (r *alertRuleInfoRegistry) del(key models.AlertRuleKey) (*alertRuleInfo, bool) {
 	r.mu.Lock()
 	defer r.mu.Unlock()
 	info, ok := r.alertRuleInfo[key]
@@ -62,7 +62,7 @@ func (r *alertRuleRegistry) del(key models.AlertRuleKey) (*alertRuleInfo, bool)
 	return info, ok
 }

-func (r *alertRuleRegistry) keyMap() map[models.AlertRuleKey]struct{} {
+func (r *alertRuleInfoRegistry) keyMap() map[models.AlertRuleKey]struct{} {
 	r.mu.Lock()
 	defer r.mu.Unlock()
 	definitionsIDs := make(map[models.AlertRuleKey]struct{}, len(r.alertRuleInfo))
@@ -111,3 +111,52 @@ type evaluation struct {
 	scheduledAt time.Time
 	version     int64
 }
+
+type schedulableAlertRulesRegistry struct {
+	rules map[models.AlertRuleKey]*models.SchedulableAlertRule
+	mu    sync.Mutex
+}
+
+// all returns all rules in the registry.
+func (r *schedulableAlertRulesRegistry) all() []*models.SchedulableAlertRule {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	result := make([]*models.SchedulableAlertRule, 0, len(r.rules))
+	for _, rule := range r.rules {
+		result = append(result, rule)
+	}
+	return result
+}
+
+func (r *schedulableAlertRulesRegistry) get(k models.AlertRuleKey) *models.SchedulableAlertRule {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	return r.rules[k]
+}
+
+// set replaces all rules in the registry.
+func (r *schedulableAlertRulesRegistry) set(rules []*models.SchedulableAlertRule) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	r.rules = make(map[models.AlertRuleKey]*models.SchedulableAlertRule)
+	for _, rule := range rules {
+		r.rules[rule.GetKey()] = rule
+	}
+}
+
+// update inserts or replaces a rule in the registry.
+func (r *schedulableAlertRulesRegistry) update(rule *models.SchedulableAlertRule) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	r.rules[rule.GetKey()] = rule
+}
+
+func (r *schedulableAlertRulesRegistry) del(k models.AlertRuleKey) (*models.SchedulableAlertRule, bool) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	rule, ok := r.rules[k]
+	if ok {
+		delete(r.rules, k)
+	}
+	return rule, ok
+}
@@ -8,7 +8,10 @@ import (
 	"testing"
 	"time"

+	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
+
+	"github.com/grafana/grafana/pkg/services/ngalert/models"
 )

 func TestSchedule_alertRuleInfo(t *testing.T) {
@@ -116,3 +119,54 @@ func TestSchedule_alertRuleInfo(t *testing.T) {
 		wg.Wait()
 	})
 }
+
+func TestSchedulableAlertRulesRegistry(t *testing.T) {
+	r := schedulableAlertRulesRegistry{rules: make(map[models.AlertRuleKey]*models.SchedulableAlertRule)}
+	assert.Len(t, r.all(), 0)
+
+	// replace all rules in the registry with foo
+	r.set([]*models.SchedulableAlertRule{{OrgID: 1, UID: "foo", Version: 1}})
+	assert.Len(t, r.all(), 1)
+	foo := r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"})
+	require.NotNil(t, foo)
+	assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 1}, *foo)
+
+	// update foo to a newer version
+	r.update(&models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 2})
+	assert.Len(t, r.all(), 1)
+	foo = r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"})
+	require.NotNil(t, foo)
+	assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 2}, *foo)
+
+	// update bar which does not exist in the registry
+	r.update(&models.SchedulableAlertRule{OrgID: 1, UID: "bar", Version: 1})
+	assert.Len(t, r.all(), 2)
+	foo = r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"})
+	require.NotNil(t, foo)
+	assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 2}, *foo)
+	bar := r.get(models.AlertRuleKey{OrgID: 1, UID: "bar"})
+	require.NotNil(t, bar)
+	assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "bar", Version: 1}, *bar)
+
+	// replace all rules in the registry with baz
+	r.set([]*models.SchedulableAlertRule{{OrgID: 1, UID: "baz", Version: 1}})
+	assert.Len(t, r.all(), 1)
+	baz := r.get(models.AlertRuleKey{OrgID: 1, UID: "baz"})
+	require.NotNil(t, baz)
+	assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "baz", Version: 1}, *baz)
+	assert.Nil(t, r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"}))
+	assert.Nil(t, r.get(models.AlertRuleKey{OrgID: 1, UID: "bar"}))
+
+	// delete baz
+	deleted, ok := r.del(models.AlertRuleKey{OrgID: 1, UID: "baz"})
+	assert.True(t, ok)
+	require.NotNil(t, deleted)
+	assert.Equal(t, *deleted, *baz)
+	assert.Len(t, r.all(), 0)
+	assert.Nil(t, r.get(models.AlertRuleKey{OrgID: 1, UID: "baz"}))
+
+	// baz cannot be deleted twice
+	deleted, ok = r.del(models.AlertRuleKey{OrgID: 1, UID: "baz"})
+	assert.False(t, ok)
+	assert.Nil(t, deleted)
+}
@@ -54,7 +54,7 @@ type schedule struct {
 	baseInterval time.Duration

 	// each alert rule gets its own channel and routine
-	registry alertRuleRegistry
+	registry alertRuleInfoRegistry

 	maxAttempts int64

@@ -97,6 +97,12 @@ type schedule struct {
 	adminConfigPollInterval time.Duration
 	disabledOrgs            map[int64]struct{}
 	minRuleInterval         time.Duration
+
+	// schedulableAlertRules contains the alert rules that are considered for
+	// evaluation in the current tick. The evaluation of an alert rule in the
+	// current tick depends on its evaluation interval and when it was
+	// last evaluated.
+	schedulableAlertRules schedulableAlertRulesRegistry
 }

 // SchedulerCfg is the scheduler configuration.
@@ -124,7 +130,7 @@ func NewScheduler(cfg SchedulerCfg, expressionService *expr.Service, appURL *url
 	ticker := alerting.NewTicker(cfg.C, cfg.BaseInterval, cfg.Metrics.Ticker)

 	sch := schedule{
-		registry:     alertRuleRegistry{alertRuleInfo: make(map[models.AlertRuleKey]*alertRuleInfo)},
+		registry:     alertRuleInfoRegistry{alertRuleInfo: make(map[models.AlertRuleKey]*alertRuleInfo)},
 		maxAttempts:  cfg.MaxAttempts,
 		clock:        cfg.C,
 		baseInterval: cfg.BaseInterval,
@@ -148,6 +154,7 @@ func NewScheduler(cfg SchedulerCfg, expressionService *expr.Service, appURL *url
 		adminConfigPollInterval: cfg.AdminConfigPollInterval,
 		disabledOrgs:            cfg.DisabledOrgs,
 		minRuleInterval:         cfg.MinRuleInterval,
+		schedulableAlertRules:   schedulableAlertRulesRegistry{rules: make(map[models.AlertRuleKey]*models.SchedulableAlertRule)},
 	}
 	return &sch
 }
@@ -316,9 +323,17 @@ func (sch *schedule) UpdateAlertRule(key models.AlertRuleKey) {

 // DeleteAlertRule stops evaluation of the rule, deletes it from active rules, and cleans up state cache.
 func (sch *schedule) DeleteAlertRule(key models.AlertRuleKey) {
+	// It can happen that the scheduler has deleted the alert rule before the
+	// Ruler API has called DeleteAlertRule. This can happen as requests to
+	// the Ruler API do not hold an exclusive lock over all scheduler operations.
+	if _, ok := sch.schedulableAlertRules.del(key); !ok {
+		sch.log.Info("alert rule cannot be removed from the scheduler as it is not scheduled", "uid", key.UID, "org_id", key.OrgID)
+	}
+
 	// Delete the rule routine
 	ruleInfo, ok := sch.registry.del(key)
 	if !ok {
-		sch.log.Info("unable to delete alert rule routine information by key", "uid", key.UID, "org_id", key.OrgID)
+		sch.log.Info("alert rule cannot be stopped as it is not running", "uid", key.UID, "org_id", key.OrgID)
 		return
 	}
 	// stop rule evaluation
@@ -364,7 +379,11 @@ func (sch *schedule) schedulePeriodic(ctx context.Context) error {
 				disabledOrgs = append(disabledOrgs, disabledOrg)
 			}

-			alertRules := sch.getAlertRules(ctx, disabledOrgs)
+			if err := sch.updateSchedulableAlertRules(ctx, disabledOrgs); err != nil {
+				sch.log.Error("scheduler failed to update alert rules", "err", err)
+			}
+			alertRules := sch.schedulableAlertRules.all()

 			sch.log.Debug("alert rules fetched", "count", len(alertRules), "disabled_orgs", disabledOrgs)

 			// registeredDefinitions is a map used for finding deleted alert rules
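A hypothetical test-style sketch, not part of this commit, of the behaviour the last hunk is after: when a refresh fails, set is never called on the schedulableAlertRulesRegistry, so schedulePeriodic keeps working from the rules of the last successful tick rather than seeing an empty set.

package schedule

import (
	"testing"

	"github.com/stretchr/testify/assert"

	"github.com/grafana/grafana/pkg/services/ngalert/models"
)

// Hypothetical illustration: a failed refresh leaves the previously fetched
// rules in place instead of emptying the scheduler.
func TestFailedRefreshKeepsLastKnownRules(t *testing.T) {
	rules := schedulableAlertRulesRegistry{rules: make(map[models.AlertRuleKey]*models.SchedulableAlertRule)}

	// A successful tick stores the fetched rules.
	rules.set([]*models.SchedulableAlertRule{{OrgID: 1, UID: "foo", Version: 1}})
	assert.Len(t, rules.all(), 1)

	// On the next tick the database is unavailable: updateSchedulableAlertRules
	// returns an error, set is never called, and the last known rule survives,
	// whereas the old getAlertRules path returned nil and every rule was stopped.
	assert.Len(t, rules.all(), 1)
}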