Alerting: Fix database unavailable removes rules from scheduler (#49874)

commit c83f84348c (parent ae449cc823)
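In short: before this change, a failed query for alert rules was logged and returned as an empty result, which the scheduler then treated as "there are no rules" and used to stop and remove every scheduled rule; after it, the error is returned to the caller and the scheduler keeps evaluating the last successfully fetched set of rules. A minimal, self-contained sketch of that pattern (illustrative only; ruleCache and refresh are not Grafana identifiers):

package main

import (
    "errors"
    "fmt"
)

type ruleCache struct {
    rules []string // rule UIDs from the last successful fetch
}

// refresh replaces the cached rules only when the fetch succeeds; on error the
// previous rules stay in place and the error is reported to the caller.
func (c *ruleCache) refresh(fetch func() ([]string, error)) error {
    rules, err := fetch()
    if err != nil {
        return fmt.Errorf("failed to get alert rules: %w", err)
    }
    c.rules = rules
    return nil
}

func main() {
    c := &ruleCache{}
    _ = c.refresh(func() ([]string, error) { return []string{"foo", "bar"}, nil })

    // A later fetch fails (e.g. the database is unavailable): the cache keeps
    // the previously fetched rules instead of being emptied.
    if err := c.refresh(func() ([]string, error) { return nil, errors.New("database unavailable") }); err != nil {
        fmt.Println("keeping", c.rules, "after error:", err)
    }
}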
@@ -45,6 +45,7 @@ Scopes must have an order to ensure consistency and ease of search, this helps u

 ## Grafana Alerting - main / unreleased

+- [ENHANCEMENT] Scheduler: Adds new metrics to track rules that might be scheduled.
 - [ENHANCEMENT] Scheduler: Ticker expose new metrics. In legacy, metrics are prefixed with `legacy_` #47828, #48190
   - `grafana_alerting_ticker_last_consumed_tick_timestamp_seconds`
   - `grafana_alerting_ticker_next_tick_timestamp_seconds`
@@ -46,14 +46,16 @@ type NGAlert struct {
 }

 type Scheduler struct {
     Registerer    prometheus.Registerer
     BehindSeconds prometheus.Gauge
     EvalTotal     *prometheus.CounterVec
     EvalFailures  *prometheus.CounterVec
     EvalDuration  *prometheus.SummaryVec
-    GetAlertRulesDuration    prometheus.Histogram
     SchedulePeriodicDuration prometheus.Histogram
+    SchedulableAlertRules     prometheus.Gauge
+    SchedulableAlertRulesHash prometheus.Gauge
+    UpdateSchedulableAlertRulesDuration prometheus.Histogram
     Ticker *legacyMetrics.Ticker
 }

 type MultiOrgAlertmanager struct {
@@ -163,15 +165,6 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
             },
             []string{"org"},
         ),
-        GetAlertRulesDuration: promauto.With(r).NewHistogram(
-            prometheus.HistogramOpts{
-                Namespace: Namespace,
-                Subsystem: Subsystem,
-                Name:      "get_alert_rules_duration_seconds",
-                Help:      "The time taken to get all alert rules.",
-                Buckets:   []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
-            },
-        ),
         SchedulePeriodicDuration: promauto.With(r).NewHistogram(
             prometheus.HistogramOpts{
                 Namespace: Namespace,
@@ -181,6 +174,30 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
                 Buckets:   []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
             },
         ),
+        SchedulableAlertRules: promauto.With(r).NewGauge(
+            prometheus.GaugeOpts{
+                Namespace: Namespace,
+                Subsystem: Subsystem,
+                Name:      "schedule_alert_rules",
+                Help:      "The number of alert rules being considered for evaluation each tick.",
+            },
+        ),
+        SchedulableAlertRulesHash: promauto.With(r).NewGauge(
+            prometheus.GaugeOpts{
+                Namespace: Namespace,
+                Subsystem: Subsystem,
+                Name:      "schedule_alert_rules_hash",
+                Help:      "A hash of the alert rules over time.",
+            }),
+        UpdateSchedulableAlertRulesDuration: promauto.With(r).NewHistogram(
+            prometheus.HistogramOpts{
+                Namespace: Namespace,
+                Subsystem: Subsystem,
+                Name:      "schedule_query_alert_rules_duration_seconds",
+                Help:      "The time taken to fetch alert rules from the database.",
+                Buckets:   []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
+            },
+        ),
         Ticker: legacyMetrics.NewTickerMetrics(r),
     }
 }
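As context for the metrics added above, here is a small standalone sketch (not part of the commit) of how gauges and histograms registered through promauto behave; it assumes the package's Namespace and Subsystem constants expand to "grafana" and "alerting", so the new series would be exported as grafana_alerting_schedule_alert_rules, grafana_alerting_schedule_alert_rules_hash and grafana_alerting_schedule_query_alert_rules_duration_seconds:

package main

import (
    "fmt"
    "time"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
    "github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
    r := prometheus.NewRegistry()

    schedulableRules := promauto.With(r).NewGauge(prometheus.GaugeOpts{
        Namespace: "grafana", // assumed value of the package's Namespace constant
        Subsystem: "alerting", // assumed value of the package's Subsystem constant
        Name:      "schedule_alert_rules",
        Help:      "The number of alert rules being considered for evaluation each tick.",
    })
    fetchDuration := promauto.With(r).NewHistogram(prometheus.HistogramOpts{
        Namespace: "grafana",
        Subsystem: "alerting",
        Name:      "schedule_query_alert_rules_duration_seconds",
        Help:      "The time taken to fetch alert rules from the database.",
        Buckets:   []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
    })

    start := time.Now()
    // ... fetch the schedulable rules from the database here ...
    fetchDuration.Observe(time.Since(start).Seconds())
    schedulableRules.Set(42) // number of rules returned by the query

    fmt.Println(testutil.ToFloat64(schedulableRules)) // 42
}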
@@ -2,24 +2,55 @@ package schedule

 import (
     "context"
+    "fmt"
+    "hash/fnv"
+    "sort"
     "time"

     "github.com/grafana/grafana/pkg/services/ngalert/models"
 )

-func (sch *schedule) getAlertRules(ctx context.Context, disabledOrgs []int64) []*models.SchedulableAlertRule {
+// hashUIDs returns a fnv64 hash of the UIDs for all alert rules.
+// The order of the alert rules does not matter as hashUIDs sorts
+// the UIDs in increasing order.
+func hashUIDs(alertRules []*models.SchedulableAlertRule) uint64 {
+    h := fnv.New64()
+    for _, uid := range sortedUIDs(alertRules) {
+        // We can ignore err as fnv64 does not return an error
+        // nolint:errcheck,gosec
+        h.Write([]byte(uid))
+    }
+    return h.Sum64()
+}
+
+// sortedUIDs returns a slice of sorted UIDs.
+func sortedUIDs(alertRules []*models.SchedulableAlertRule) []string {
+    uids := make([]string, 0, len(alertRules))
+    for _, alertRule := range alertRules {
+        uids = append(uids, alertRule.UID)
+    }
+    sort.Strings(uids)
+    return uids
+}
+
+// updateSchedulableAlertRules updates the alert rules for the scheduler.
+// It returns an error if the database is unavailable or the query returned
+// an error.
+func (sch *schedule) updateSchedulableAlertRules(ctx context.Context, disabledOrgs []int64) error {
     start := time.Now()
     defer func() {
-        sch.metrics.GetAlertRulesDuration.Observe(time.Since(start).Seconds())
+        sch.metrics.UpdateSchedulableAlertRulesDuration.Observe(
+            time.Since(start).Seconds())
     }()

     q := models.GetAlertRulesForSchedulingQuery{
         ExcludeOrgIDs: disabledOrgs,
     }
-    err := sch.ruleStore.GetAlertRulesForScheduling(ctx, &q)
-    if err != nil {
-        sch.log.Error("failed to fetch alert definitions", "err", err)
-        return nil
+    if err := sch.ruleStore.GetAlertRulesForScheduling(ctx, &q); err != nil {
+        return fmt.Errorf("failed to get alert rules: %w", err)
     }
-    return q.Result
+    sch.schedulableAlertRules.set(q.Result)
+    sch.metrics.SchedulableAlertRules.Set(float64(len(q.Result)))
+    sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(q.Result)))
+    return nil
 }
pkg/services/ngalert/schedule/fetcher_test.go (new file, 26 lines)
@@ -0,0 +1,26 @@
+package schedule
+
+import (
+    "testing"
+
+    "github.com/stretchr/testify/assert"
+
+    "github.com/grafana/grafana/pkg/services/ngalert/models"
+)
+
+func TestHashUIDs(t *testing.T) {
+    r := []*models.SchedulableAlertRule{{UID: "foo"}, {UID: "bar"}}
+    assert.Equal(t, uint64(0xade76f55c76a1c48), hashUIDs(r))
+    // expect the same hash irrespective of order
+    r = []*models.SchedulableAlertRule{{UID: "bar"}, {UID: "foo"}}
+    assert.Equal(t, uint64(0xade76f55c76a1c48), hashUIDs(r))
+    // expect a different hash
+    r = []*models.SchedulableAlertRule{{UID: "bar"}}
+    assert.Equal(t, uint64(0xd8d9a5186bad3880), hashUIDs(r))
+    // slice with no items
+    r = []*models.SchedulableAlertRule{}
+    assert.Equal(t, uint64(0xcbf29ce484222325), hashUIDs(r))
+    // a different slice with no items should have the same hash
+    r = []*models.SchedulableAlertRule{}
+    assert.Equal(t, uint64(0xcbf29ce484222325), hashUIDs(r))
+}
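A note on the expected values in TestHashUIDs: hashUIDs sorts the UIDs before hashing, so {foo, bar} and {bar, foo} produce the same digest, and an empty slice hashes to the FNV-64 offset basis, i.e. the sum of a fresh fnv.New64() hasher before any bytes have been written. A quick standalone check (illustrative, not part of the commit):

package main

import (
    "fmt"
    "hash/fnv"
)

func main() {
    h := fnv.New64()
    fmt.Printf("%#x\n", h.Sum64()) // 0xcbf29ce484222325, the FNV-64 offset basis
}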
@@ -9,14 +9,14 @@ import (
     "github.com/grafana/grafana/pkg/services/ngalert/models"
 )

-type alertRuleRegistry struct {
+type alertRuleInfoRegistry struct {
     mu            sync.Mutex
     alertRuleInfo map[models.AlertRuleKey]*alertRuleInfo
 }

 // getOrCreateInfo gets rule routine information from registry by the key. If it does not exist, it creates a new one.
 // Returns a pointer to the rule routine information and a flag that indicates whether it is a new struct or not.
-func (r *alertRuleRegistry) getOrCreateInfo(context context.Context, key models.AlertRuleKey) (*alertRuleInfo, bool) {
+func (r *alertRuleInfoRegistry) getOrCreateInfo(context context.Context, key models.AlertRuleKey) (*alertRuleInfo, bool) {
     r.mu.Lock()
     defer r.mu.Unlock()

@@ -30,7 +30,7 @@ func (r *alertRuleRegistry) getOrCreateInfo(context context.Context, key models.

 // get returns the channel for the specific alert rule
 // if the key does not exist returns an error
-func (r *alertRuleRegistry) get(key models.AlertRuleKey) (*alertRuleInfo, error) {
+func (r *alertRuleInfoRegistry) get(key models.AlertRuleKey) (*alertRuleInfo, error) {
     r.mu.Lock()
     defer r.mu.Unlock()

@@ -41,7 +41,7 @@ func (r *alertRuleRegistry) get(key models.AlertRuleKey) (*alertRuleInfo, error)
     return info, nil
 }

-func (r *alertRuleRegistry) exists(key models.AlertRuleKey) bool {
+func (r *alertRuleInfoRegistry) exists(key models.AlertRuleKey) bool {
     r.mu.Lock()
     defer r.mu.Unlock()

@@ -52,7 +52,7 @@ func (r *alertRuleRegistry) exists(key models.AlertRuleKey) bool {
 // del removes pair that has specific key from alertRuleInfo.
 // Returns 2-tuple where the first element is value of the removed pair
 // and the second element indicates whether element with the specified key existed.
-func (r *alertRuleRegistry) del(key models.AlertRuleKey) (*alertRuleInfo, bool) {
+func (r *alertRuleInfoRegistry) del(key models.AlertRuleKey) (*alertRuleInfo, bool) {
     r.mu.Lock()
     defer r.mu.Unlock()
     info, ok := r.alertRuleInfo[key]
@@ -62,7 +62,7 @@ func (r *alertRuleRegistry) del(key models.AlertRuleKey) (*alertRuleInfo, bool)
     return info, ok
 }

-func (r *alertRuleRegistry) keyMap() map[models.AlertRuleKey]struct{} {
+func (r *alertRuleInfoRegistry) keyMap() map[models.AlertRuleKey]struct{} {
     r.mu.Lock()
     defer r.mu.Unlock()
     definitionsIDs := make(map[models.AlertRuleKey]struct{}, len(r.alertRuleInfo))
@@ -111,3 +111,52 @@ type evaluation struct {
     scheduledAt time.Time
     version     int64
 }
+
+type schedulableAlertRulesRegistry struct {
+    rules map[models.AlertRuleKey]*models.SchedulableAlertRule
+    mu    sync.Mutex
+}
+
+// all returns all rules in the registry.
+func (r *schedulableAlertRulesRegistry) all() []*models.SchedulableAlertRule {
+    r.mu.Lock()
+    defer r.mu.Unlock()
+    result := make([]*models.SchedulableAlertRule, 0, len(r.rules))
+    for _, rule := range r.rules {
+        result = append(result, rule)
+    }
+    return result
+}
+
+func (r *schedulableAlertRulesRegistry) get(k models.AlertRuleKey) *models.SchedulableAlertRule {
+    r.mu.Lock()
+    defer r.mu.Unlock()
+    return r.rules[k]
+}
+
+// set replaces all rules in the registry.
+func (r *schedulableAlertRulesRegistry) set(rules []*models.SchedulableAlertRule) {
+    r.mu.Lock()
+    defer r.mu.Unlock()
+    r.rules = make(map[models.AlertRuleKey]*models.SchedulableAlertRule)
+    for _, rule := range rules {
+        r.rules[rule.GetKey()] = rule
+    }
+}
+
+// update inserts or replaces a rule in the registry.
+func (r *schedulableAlertRulesRegistry) update(rule *models.SchedulableAlertRule) {
+    r.mu.Lock()
+    defer r.mu.Unlock()
+    r.rules[rule.GetKey()] = rule
+}
+
+func (r *schedulableAlertRulesRegistry) del(k models.AlertRuleKey) (*models.SchedulableAlertRule, bool) {
+    r.mu.Lock()
+    defer r.mu.Unlock()
+    rule, ok := r.rules[k]
+    if ok {
+        delete(r.rules, k)
+    }
+    return rule, ok
+}
@@ -8,7 +8,10 @@ import (
     "testing"
     "time"

+    "github.com/stretchr/testify/assert"
     "github.com/stretchr/testify/require"
+
+    "github.com/grafana/grafana/pkg/services/ngalert/models"
 )

 func TestSchedule_alertRuleInfo(t *testing.T) {
@@ -116,3 +119,54 @@ func TestSchedule_alertRuleInfo(t *testing.T) {
         wg.Wait()
     })
 }
+
+func TestSchedulableAlertRulesRegistry(t *testing.T) {
+    r := schedulableAlertRulesRegistry{rules: make(map[models.AlertRuleKey]*models.SchedulableAlertRule)}
+    assert.Len(t, r.all(), 0)
+
+    // replace all rules in the registry with foo
+    r.set([]*models.SchedulableAlertRule{{OrgID: 1, UID: "foo", Version: 1}})
+    assert.Len(t, r.all(), 1)
+    foo := r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"})
+    require.NotNil(t, foo)
+    assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 1}, *foo)
+
+    // update foo to a newer version
+    r.update(&models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 2})
+    assert.Len(t, r.all(), 1)
+    foo = r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"})
+    require.NotNil(t, foo)
+    assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 2}, *foo)
+
+    // update bar which does not exist in the registry
+    r.update(&models.SchedulableAlertRule{OrgID: 1, UID: "bar", Version: 1})
+    assert.Len(t, r.all(), 2)
+    foo = r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"})
+    require.NotNil(t, foo)
+    assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "foo", Version: 2}, *foo)
+    bar := r.get(models.AlertRuleKey{OrgID: 1, UID: "bar"})
+    require.NotNil(t, foo)
+    assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "bar", Version: 1}, *bar)
+
+    // replace all rules in the registry with baz
+    r.set([]*models.SchedulableAlertRule{{OrgID: 1, UID: "baz", Version: 1}})
+    assert.Len(t, r.all(), 1)
+    baz := r.get(models.AlertRuleKey{OrgID: 1, UID: "baz"})
+    require.NotNil(t, baz)
+    assert.Equal(t, models.SchedulableAlertRule{OrgID: 1, UID: "baz", Version: 1}, *baz)
+    assert.Nil(t, r.get(models.AlertRuleKey{OrgID: 1, UID: "foo"}))
+    assert.Nil(t, r.get(models.AlertRuleKey{OrgID: 1, UID: "bar"}))
+
+    // delete baz
+    deleted, ok := r.del(models.AlertRuleKey{OrgID: 1, UID: "baz"})
+    assert.True(t, ok)
+    require.NotNil(t, deleted)
+    assert.Equal(t, *deleted, *baz)
+    assert.Len(t, r.all(), 0)
+    assert.Nil(t, r.get(models.AlertRuleKey{OrgID: 1, UID: "baz"}))
+
+    // baz cannot be deleted twice
+    deleted, ok = r.del(models.AlertRuleKey{OrgID: 1, UID: "baz"})
+    assert.False(t, ok)
+    assert.Nil(t, deleted)
+}
@@ -54,7 +54,7 @@ type schedule struct {
     baseInterval time.Duration

     // each alert rule gets its own channel and routine
-    registry alertRuleRegistry
+    registry alertRuleInfoRegistry

     maxAttempts int64

@@ -97,6 +97,12 @@ type schedule struct {
     adminConfigPollInterval time.Duration
     disabledOrgs            map[int64]struct{}
     minRuleInterval         time.Duration
+
+    // schedulableAlertRules contains the alert rules that are considered for
+    // evaluation in the current tick. The evaluation of an alert rule in the
+    // current tick depends on its evaluation interval and when it was
+    // last evaluated.
+    schedulableAlertRules schedulableAlertRulesRegistry
 }

 // SchedulerCfg is the scheduler configuration.
@@ -124,7 +130,7 @@ func NewScheduler(cfg SchedulerCfg, expressionService *expr.Service, appURL *url
     ticker := alerting.NewTicker(cfg.C, cfg.BaseInterval, cfg.Metrics.Ticker)

     sch := schedule{
-        registry:     alertRuleRegistry{alertRuleInfo: make(map[models.AlertRuleKey]*alertRuleInfo)},
+        registry:     alertRuleInfoRegistry{alertRuleInfo: make(map[models.AlertRuleKey]*alertRuleInfo)},
         maxAttempts:  cfg.MaxAttempts,
         clock:        cfg.C,
         baseInterval: cfg.BaseInterval,
@@ -148,6 +154,7 @@ func NewScheduler(cfg SchedulerCfg, expressionService *expr.Service, appURL *url
         adminConfigPollInterval: cfg.AdminConfigPollInterval,
         disabledOrgs:            cfg.DisabledOrgs,
         minRuleInterval:         cfg.MinRuleInterval,
+        schedulableAlertRules:   schedulableAlertRulesRegistry{rules: make(map[models.AlertRuleKey]*models.SchedulableAlertRule)},
     }
     return &sch
 }
@@ -316,9 +323,17 @@ func (sch *schedule) UpdateAlertRule(key models.AlertRuleKey) {

 // DeleteAlertRule stops evaluation of the rule, deletes it from active rules, and cleans up state cache.
 func (sch *schedule) DeleteAlertRule(key models.AlertRuleKey) {
+    // It can happen that the scheduler has deleted the alert rule before the
+    // Ruler API has called DeleteAlertRule. This can happen as requests to
+    // the Ruler API do not hold an exclusive lock over all scheduler operations.
+    if _, ok := sch.schedulableAlertRules.del(key); !ok {
+        sch.log.Info("alert rule cannot be removed from the scheduler as it is not scheduled", "uid", key.UID, "org_id", key.OrgID)
+    }
+
+    // Delete the rule routine
     ruleInfo, ok := sch.registry.del(key)
     if !ok {
-        sch.log.Info("unable to delete alert rule routine information by key", "uid", key.UID, "org_id", key.OrgID)
+        sch.log.Info("alert rule cannot be stopped as it is not running", "uid", key.UID, "org_id", key.OrgID)
         return
     }
     // stop rule evaluation
@@ -364,7 +379,11 @@ func (sch *schedule) schedulePeriodic(ctx context.Context) error {
                 disabledOrgs = append(disabledOrgs, disabledOrg)
             }

-            alertRules := sch.getAlertRules(ctx, disabledOrgs)
+            if err := sch.updateSchedulableAlertRules(ctx, disabledOrgs); err != nil {
+                sch.log.Error("scheduler failed to update alert rules", "err", err)
+            }
+            alertRules := sch.schedulableAlertRules.all()
+
             sch.log.Debug("alert rules fetched", "count", len(alertRules), "disabled_orgs", disabledOrgs)

             // registeredDefinitions is a map used for finding deleted alert rules