mirror of
synced 2025-02-25 18:55:37 -06:00
* Add health fields to rules and an aggregator method to the scheduler * Move health, last error, and last eval time in together to minimize state processing * Wire up a readonly scheduler to prom api * Extract to exported function * Use health in api_prometheus and fix up tests * Rename health struct to status * Fix tests one more time * Several new tests * Handle inactive rules * Push state mapping into state manager * rename to StatusReader * Rectify cyclo complexity rebase * Convert existing package local status implementation to models one * fix tests * undo RuleDefs rename
1127 lines
46 KiB
1127 lines
46 KiB
package schedule
import (
datasources "github.com/grafana/grafana/pkg/services/datasources/fakes"
// evalAppliedInfo captures one call of the scheduler's evalAppliedFunc test hook:
// the key of the rule the hook was called for and the time value passed to it.
// Tests push these values through a channel so assertions can observe evaluations.
type evalAppliedInfo struct {
alertDefKey models.AlertRuleKey
now time.Time
// TestProcessTicks drives sched.processTick by hand, advancing a local tick value
// by one base interval per step, against a fake rule store and a mocked clock.
// After each tick it checks which rules were reported as scheduled, stopped or
// updated. At selected points it also compares the
// grafana_alerting_rule_group_rules gauge with the expected exposition text and
// checks whether sched.Status reports a status for each rule key.
func TestProcessTicks(t *testing.T) {
testTracer := tracing.InitializeTracerForTest()
reg := prometheus.NewPedanticRegistry()
testMetrics := metrics.NewNGAlert(reg)
ctx := context.Background()
dispatcherGroup, ctx := errgroup.WithContext(ctx)
ruleStore := newFakeRulesStore()
cfg := setting.UnifiedAlertingSettings{
BaseInterval: 1 * time.Second,
AdminConfigPollInterval: 10 * time.Minute, // do not poll in unit tests.
const mainOrgID int64 = 1
mockedClock := clock.NewMock()
notifier := NewSyncAlertsSenderMock()
notifier.EXPECT().Send(mock.Anything, mock.Anything, mock.Anything).Return()
appUrl := &url.URL{
Scheme: "http",
Host: "localhost",
cacheServ := &datasources.FakeCacheService{}
evaluator := eval.NewEvaluatorFactory(setting.UnifiedAlertingSettings{}, cacheServ, expr.ProvideService(&setting.Cfg{ExpressionsEnabled: true}, nil, nil, featuremgmt.WithFeatures(), nil, tracing.InitializeTracerForTest()))
rrSet := setting.RecordingRuleSettings{
Enabled: true,
schedCfg := SchedulerCfg{
BaseInterval: cfg.BaseInterval,
C: mockedClock,
AppURL: appUrl,
EvaluatorFactory: evaluator,
RuleStore: ruleStore,
Metrics: testMetrics.GetSchedulerMetrics(),
AlertSender: notifier,
RecordingRulesCfg: rrSet,
Tracer: testTracer,
Log: log.New("ngalert.scheduler"),
managerCfg := state.ManagerCfg{
Metrics: testMetrics.GetStateMetrics(),
ExternalURL: nil,
InstanceStore: nil,
Images: &state.NoopImageService{},
Clock: mockedClock,
Historian: &state.FakeHistorian{},
Tracer: testTracer,
Log: log.New("ngalert.state.manager"),
st := state.NewManager(managerCfg, state.NewNoopPersister())
sched := NewScheduler(schedCfg, st)
// The two hooks below forward every applied evaluation / stop into a channel;
// the channels are handed to assertEvalRun and assertStopRun further down.
evalAppliedCh := make(chan evalAppliedInfo, 1)
stopAppliedCh := make(chan models.AlertRuleKey, 1)
sched.evalAppliedFunc = func(alertDefKey models.AlertRuleKey, now time.Time) {
evalAppliedCh <- evalAppliedInfo{alertDefKey: alertDefKey, now: now}
sched.stopAppliedFunc = func(alertDefKey models.AlertRuleKey) {
stopAppliedCh <- alertDefKey
// tick starts at the zero time and is advanced manually by cfg.BaseInterval
// before each processTick call.
tick := time.Time{}
gen := models.RuleGen
// create alert rule under main org with one second interval
alertRule1 := gen.With(gen.WithOrgID(mainOrgID), gen.WithInterval(cfg.BaseInterval), gen.WithTitle("rule-1")).GenerateRef()
ruleStore.PutRule(ctx, alertRule1)
folderWithRuleGroup1 := fmt.Sprintf("%s;%s", ruleStore.getNamespaceTitle(alertRule1.NamespaceUID), alertRule1.RuleGroup)
t.Run("before 1st tick status should not be available", func(t *testing.T) {
_, ok := sched.Status(alertRule1.GetKey())
require.False(t, ok, "status for a rule should not be present before the scheduler has created it")
t.Run("on 1st tick alert rule should be evaluated", func(t *testing.T) {
tick = tick.Add(cfg.BaseInterval)
scheduled, stopped, updated := sched.processTick(ctx, dispatcherGroup, tick)
require.Len(t, scheduled, 1)
require.Equal(t, alertRule1, scheduled[0].rule)
require.Equal(t, tick, scheduled[0].scheduledAt)
require.Emptyf(t, stopped, "None rules are expected to be stopped")
require.Emptyf(t, updated, "None rules are expected to be updated")
assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
t.Run("after 1st tick rule metrics should report one active alert rule", func(t *testing.T) {
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, by type and state.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active",type="alerting"} 1
`, alertRule1.OrgID, folderWithRuleGroup1)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
require.NoError(t, err)
t.Run("after 1st tick status for rule should be available", func(t *testing.T) {
_, ok := sched.Status(alertRule1.GetKey())
require.True(t, ok, "status for a rule that just evaluated was not available")
// Interestingly, the rules in this test are randomised, and are sometimes invalid.
// Therefore, we can't reliably assert anything about the actual health. It might be error, it might not, depending on randomness.
// We are only testing that things were scheduled, not that the rule routine worked internally.
// add alert rule under main org with three base intervals
alertRule2 := gen.With(gen.WithOrgID(mainOrgID), gen.WithInterval(3*cfg.BaseInterval), gen.WithTitle("rule-2")).GenerateRef()
ruleStore.PutRule(ctx, alertRule2)
folderWithRuleGroup2 := fmt.Sprintf("%s;%s", ruleStore.getNamespaceTitle(alertRule2.NamespaceUID), alertRule2.RuleGroup)
t.Run("before 2nd tick status for rule should not be available", func(t *testing.T) {
_, ok := sched.Status(alertRule2.GetKey())
require.False(t, ok, "status for a rule should not be present before the scheduler has created it")
t.Run("on 2nd tick first alert rule should be evaluated", func(t *testing.T) {
tick = tick.Add(cfg.BaseInterval)
scheduled, stopped, updated := sched.processTick(ctx, dispatcherGroup, tick)
require.Len(t, scheduled, 1)
require.Equal(t, alertRule1, scheduled[0].rule)
require.Equal(t, tick, scheduled[0].scheduledAt)
require.Emptyf(t, stopped, "None rules are expected to be stopped")
require.Emptyf(t, updated, "None rules are expected to be updated")
assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
t.Run("after 2nd tick rule metrics should report two active alert rules in two groups", func(t *testing.T) {
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, by type and state.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active",type="alerting"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="active",type="alerting"} 1
`, alertRule1.OrgID, folderWithRuleGroup1, folderWithRuleGroup2)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
require.NoError(t, err)
t.Run("on 3rd tick two alert rules should be evaluated", func(t *testing.T) {
tick = tick.Add(cfg.BaseInterval)
scheduled, stopped, updated := sched.processTick(ctx, dispatcherGroup, tick)
require.Len(t, scheduled, 2)
var keys []models.AlertRuleKey
for _, item := range scheduled {
keys = append(keys, item.rule.GetKey())
require.Equal(t, tick, item.scheduledAt)
require.Contains(t, keys, alertRule1.GetKey())
require.Contains(t, keys, alertRule2.GetKey())
require.Emptyf(t, stopped, "None rules are expected to be stopped")
require.Emptyf(t, updated, "None rules are expected to be updated")
assertEvalRun(t, evalAppliedCh, tick, keys...)
t.Run("after 3rd tick status for both rules should be available", func(t *testing.T) {
_, ok := sched.Status(alertRule1.GetKey())
require.True(t, ok, "status for a rule that just evaluated was not available")
_, ok = sched.Status(alertRule2.GetKey())
require.True(t, ok, "status for a rule that just evaluated was not available")
// Interestingly, the rules in this test are randomised, and are sometimes invalid.
// Therefore, we can't reliably assert anything about the actual health. It might be error, it might not, depending on randomness.
// We are only testing that things were scheduled, not that the rule routine worked internally.
t.Run("on 4th tick only one alert rule should be evaluated", func(t *testing.T) {
tick = tick.Add(cfg.BaseInterval)
scheduled, stopped, updated := sched.processTick(ctx, dispatcherGroup, tick)
require.Len(t, scheduled, 1)
require.Equal(t, alertRule1, scheduled[0].rule)
require.Equal(t, tick, scheduled[0].scheduledAt)
require.Emptyf(t, stopped, "None rules are expected to be stopped")
require.Emptyf(t, updated, "None rules are expected to be updated")
assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
t.Run("on 5th tick an alert rule is paused (it still enters evaluation but it is early skipped)", func(t *testing.T) {
tick = tick.Add(cfg.BaseInterval)
alertRule1.IsPaused = true
scheduled, stopped, updated := sched.processTick(ctx, dispatcherGroup, tick)
require.Len(t, scheduled, 1)
require.Equal(t, alertRule1, scheduled[0].rule)
require.Equal(t, tick, scheduled[0].scheduledAt)
require.Emptyf(t, stopped, "None rules are expected to be stopped")
require.Emptyf(t, updated, "None rules are expected to be updated")
assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
t.Run("after 5th tick rule metrics should report one active and one paused alert rules in two groups", func(t *testing.T) {
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, by type and state.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="paused",type="alerting"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="active",type="alerting"} 1
`, alertRule1.OrgID, folderWithRuleGroup1, folderWithRuleGroup2)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
require.NoError(t, err)
t.Run("after 5th tick status for both rules should be available regardless of pause state", func(t *testing.T) {
_, ok := sched.Status(alertRule1.GetKey())
require.True(t, ok, "status for a rule that just evaluated was not available")
_, ok = sched.Status(alertRule2.GetKey())
require.True(t, ok, "status for a rule that just evaluated was not available")
// Interestingly, the rules in this test are randomised, and are sometimes invalid.
// Therefore, we can't reliably assert anything about the actual health. It might be error, it might not, depending on randomness.
// We are only testing that things were scheduled, not that the rule routine worked internally.
t.Run("on 6th tick all alert rule are paused (it still enters evaluation but it is early skipped)", func(t *testing.T) {
tick = tick.Add(cfg.BaseInterval)
alertRule2.IsPaused = true
scheduled, stopped, updated := sched.processTick(ctx, dispatcherGroup, tick)
require.Len(t, scheduled, 2)
var keys []models.AlertRuleKey
for _, item := range scheduled {
keys = append(keys, item.rule.GetKey())
require.Equal(t, tick, item.scheduledAt)
require.Contains(t, keys, alertRule1.GetKey())
require.Contains(t, keys, alertRule2.GetKey())
require.Emptyf(t, stopped, "None rules are expected to be stopped")
require.Emptyf(t, updated, "None rules are expected to be updated")
assertEvalRun(t, evalAppliedCh, tick, keys...)
t.Run("after 6th tick rule metrics should report two paused alert rules in two groups", func(t *testing.T) {
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, by type and state.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="paused",type="alerting"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="paused",type="alerting"} 1
`, alertRule1.OrgID, folderWithRuleGroup1, folderWithRuleGroup2)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
require.NoError(t, err)
t.Run("on 7th tick unpause all alert rules", func(t *testing.T) {
tick = tick.Add(cfg.BaseInterval)
alertRule1.IsPaused = false
alertRule2.IsPaused = false
scheduled, stopped, updated := sched.processTick(ctx, dispatcherGroup, tick)
require.Len(t, scheduled, 1)
require.Equal(t, alertRule1, scheduled[0].rule)
require.Equal(t, tick, scheduled[0].scheduledAt)
require.Emptyf(t, stopped, "None rules are expected to be stopped")
require.Emptyf(t, updated, "None rules are expected to be updated")
assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
t.Run("after 7th tick rule metrics should report two active alert rules in two groups", func(t *testing.T) {
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, by type and state.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active",type="alerting"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="active",type="alerting"} 1
`, alertRule1.OrgID, folderWithRuleGroup1, folderWithRuleGroup2)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
require.NoError(t, err)
t.Run("on 8th tick deleted rule should not be evaluated but stopped", func(t *testing.T) {
tick = tick.Add(cfg.BaseInterval)
scheduled, stopped, updated := sched.processTick(ctx, dispatcherGroup, tick)
require.Empty(t, scheduled)
require.Len(t, stopped, 1)
require.Emptyf(t, updated, "None rules are expected to be updated")
require.Contains(t, stopped, alertRule1.GetKey())
assertStopRun(t, stopAppliedCh, alertRule1.GetKey())
t.Run("after 8th tick rule metrics should report one active alert rule", func(t *testing.T) {
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, by type and state.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active",type="alerting"} 1
`, alertRule1.OrgID, folderWithRuleGroup2)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
require.NoError(t, err)
t.Run("after 8th tick status for deleted rule should not be available", func(t *testing.T) {
_, ok := sched.Status(alertRule1.GetKey())
require.False(t, ok, "status for a rule that was deleted should not be available")
_, ok = sched.Status(alertRule2.GetKey())
require.True(t, ok, "status for a rule that just evaluated was not available")
t.Run("on 9th tick one alert rule should be evaluated", func(t *testing.T) {
tick = tick.Add(cfg.BaseInterval)
scheduled, stopped, updated := sched.processTick(ctx, dispatcherGroup, tick)
require.Len(t, scheduled, 1)
require.Equal(t, alertRule2, scheduled[0].rule)
require.Equal(t, tick, scheduled[0].scheduledAt)
require.Emptyf(t, stopped, "None rules are expected to be stopped")
require.Emptyf(t, updated, "None rules are expected to be updated")
assertEvalRun(t, evalAppliedCh, tick, alertRule2.GetKey())
// create alert rule with one base interval
alertRule3 := gen.With(gen.WithOrgID(mainOrgID), gen.WithInterval(cfg.BaseInterval), gen.WithTitle("rule-3")).GenerateRef()
ruleStore.PutRule(ctx, alertRule3)
t.Run("on 10th tick a new alert rule should be evaluated", func(t *testing.T) {
tick = tick.Add(cfg.BaseInterval)
scheduled, stopped, updated := sched.processTick(ctx, dispatcherGroup, tick)
require.Len(t, scheduled, 1)
require.Equal(t, alertRule3, scheduled[0].rule)
require.Equal(t, tick, scheduled[0].scheduledAt)
require.Emptyf(t, stopped, "None rules are expected to be stopped")
require.Emptyf(t, updated, "None rules are expected to be updated")
assertEvalRun(t, evalAppliedCh, tick, alertRule3.GetKey())
t.Run("after 10th tick status for remaining rules should be available", func(t *testing.T) {
_, ok := sched.Status(alertRule1.GetKey())
require.False(t, ok, "status for a rule that was deleted should not be available")
_, ok = sched.Status(alertRule2.GetKey())
require.True(t, ok, "status for a rule that just evaluated was not available")
_, ok = sched.Status(alertRule3.GetKey())
require.True(t, ok, "status for a rule that just evaluated was not available")
t.Run("on 11th tick rule2 should be updated", func(t *testing.T) {
newRule2 := models.CopyRule(alertRule2)
expectedUpdated := models.AlertRuleKeyWithVersion{
Version: newRule2.Version,
AlertRuleKey: newRule2.GetKey(),
ruleStore.PutRule(context.Background(), newRule2)
tick = tick.Add(cfg.BaseInterval)
scheduled, stopped, updated := sched.processTick(ctx, dispatcherGroup, tick)
require.Len(t, scheduled, 1)
require.Equal(t, alertRule3, scheduled[0].rule)
require.Equal(t, tick, scheduled[0].scheduledAt)
require.Emptyf(t, stopped, "None rules are expected to be stopped")
require.Len(t, updated, 1)
require.Equal(t, expectedUpdated, updated[0])
// Add a recording rule with 2 * base interval.
recordingRule1 := gen.With(gen.WithOrgID(mainOrgID), gen.WithInterval(2*cfg.BaseInterval), gen.WithTitle("recording-1"), gen.WithAllRecordingRules()).GenerateRef()
ruleStore.PutRule(ctx, recordingRule1)
t.Run("on 12th tick recording rule and alert rules should be evaluated", func(t *testing.T) {
tick = tick.Add(cfg.BaseInterval)
scheduled, stopped, updated := sched.processTick(ctx, dispatcherGroup, tick)
require.Len(t, scheduled, 3)
require.Emptyf(t, stopped, "No rules are expected to be stopped")
require.Emptyf(t, updated, "No rules are expected to be updated")
contains := false
for _, sch := range scheduled {
if sch.rule.Title == recordingRule1.Title {
contains = true
require.True(t, contains, "Expected a scheduled rule with title %s but didn't get one, scheduled rules were %v", recordingRule1.Title, scheduled)
// Update the recording rule.
recordingRule1 = models.CopyRule(recordingRule1)
expectedUpdated := models.AlertRuleKeyWithVersion{
Version: recordingRule1.Version,
AlertRuleKey: recordingRule1.GetKey(),
ruleStore.PutRule(context.Background(), recordingRule1)
t.Run("on 13th tick recording rule should be updated", func(t *testing.T) {
// It has 2 * base interval - so normally it would not have been scheduled for evaluation this tick.
tick = tick.Add(cfg.BaseInterval)
scheduled, stopped, updated := sched.processTick(ctx, dispatcherGroup, tick)
require.Len(t, scheduled, 1)
require.Emptyf(t, stopped, "No rules are expected to be stopped")
require.Len(t, updated, 1)
require.Equal(t, expectedUpdated, updated[0])
assertScheduledContains(t, scheduled, alertRule3)
t.Run("on 14th tick both 1-tick alert rule and 2-tick recording rule should be evaluated", func(t *testing.T) {
tick = tick.Add(cfg.BaseInterval)
scheduled, stopped, updated := sched.processTick(ctx, dispatcherGroup, tick)
require.Len(t, scheduled, 2)
require.Emptyf(t, stopped, "No rules are expected to be stopped")
require.Emptyf(t, updated, "No rules are expected to be updated")
assertScheduledContains(t, scheduled, alertRule3)
assertScheduledContains(t, scheduled, recordingRule1)
// Convert an alerting rule to a recording rule.
ruleStore.PutRule(ctx, alertRule3)
t.Run("prior to 15th tick alertRule3 should still be scheduled as alerting rule", func(t *testing.T) {
require.Equal(t, models.RuleTypeAlerting, sched.registry.rules[alertRule3.GetKey()].Type())
t.Run("on 15th tick converted rule and 3-tick alert rule should be evaluated", func(t *testing.T) {
tick = tick.Add(cfg.BaseInterval)
scheduled, stopped, updated := sched.processTick(ctx, dispatcherGroup, tick)
require.Len(t, scheduled, 2)
require.Emptyf(t, stopped, "No rules are expected to be stopped")
// We never sent the Updated command to the restarted rule, so this should be empty.
require.Emptyf(t, updated, "No rules are expected to be updated")
assertScheduledContains(t, scheduled, alertRule2)
assertScheduledContains(t, scheduled, alertRule3) // converted
// Rule in registry should be updated to the correct type.
require.Equal(t, models.RuleTypeRecording, sched.registry.rules[alertRule3.GetKey()].Type())
t.Run("on 16th tick converted rule and 2-tick recording rule should be evaluated", func(t *testing.T) {
tick = tick.Add(cfg.BaseInterval)
scheduled, stopped, updated := sched.processTick(ctx, dispatcherGroup, tick)
require.Len(t, scheduled, 2)
require.Emptyf(t, stopped, "No rules are expected to be stopped")
require.Emptyf(t, updated, "No rules are expected to be updated")
assertScheduledContains(t, scheduled, recordingRule1)
assertScheduledContains(t, scheduled, alertRule3)
t.Run("on 17th tick all rules should be stopped", func(t *testing.T) {
expectedToBeStopped, err := ruleStore.GetAlertRulesKeysForScheduling(ctx)
require.NoError(t, err)
// Remove all rules from store.
ruleStore.rules = map[string]*models.AlertRule{}
tick = tick.Add(cfg.BaseInterval)
scheduled, stopped, updated := sched.processTick(ctx, dispatcherGroup, tick)
require.Emptyf(t, scheduled, "None rules should be scheduled")
require.Len(t, stopped, len(expectedToBeStopped))
require.Emptyf(t, updated, "No rules should be updated")
// This check follows the 17th tick, in which the store was emptied, so the
// subtest name and the failure messages refer to removed rules.
t.Run("after 17th tick no status should be available", func(t *testing.T) {
_, ok := sched.Status(alertRule1.GetKey())
require.False(t, ok, "status for a rule that was deleted should not be available")
_, ok = sched.Status(alertRule2.GetKey())
require.False(t, ok, "status for a rule that was deleted should not be available")
_, ok = sched.Status(alertRule3.GetKey())
require.False(t, ok, "status for a rule that was deleted should not be available")
t.Run("scheduled rules should be sorted", func(t *testing.T) {
rules := gen.With(gen.WithOrgID(mainOrgID), gen.WithInterval(cfg.BaseInterval)).GenerateManyRef(10, 20)
ruleStore.rules = map[string]*models.AlertRule{}
ruleStore.PutRule(context.Background(), rules...)
expectedUids := make([]string, 0, len(rules))
for _, rule := range rules {
expectedUids = append(expectedUids, rule.UID)
tick = tick.Add(cfg.BaseInterval)
scheduled, stopped, updated := sched.processTick(ctx, dispatcherGroup, tick)
require.Emptyf(t, stopped, "None rules are expected to be stopped")
require.Emptyf(t, updated, "None rules are expected to be updated")
actualUids := make([]string, 0, len(scheduled))
for _, rule := range scheduled {
actualUids = append(actualUids, rule.rule.UID)
require.Len(t, scheduled, len(rules))
assert.Truef(t, slices.IsSorted(actualUids), "The scheduler rules should be sorted by UID but they aren't")
require.Equal(t, expectedUids, actualUids)
func TestSchedule_updateRulesMetrics(t *testing.T) {
ruleStore := newFakeRulesStore()
reg := prometheus.NewPedanticRegistry()
sch := setupScheduler(t, ruleStore, nil, reg, nil, nil)
ctx := context.Background()
const firstOrgID int64 = 1
t.Run("grafana_alerting_rule_group_rules metric should reflect the current state", func(t *testing.T) {
// Without any rules there are no metrics
t.Run("it should not show metrics", func(t *testing.T) {
expectedMetric := ""
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
require.ErrorContains(t, err, fmt.Sprintf("expected metric name(s) not found: [%v]", "grafana_alerting_rule_group_rules"))
alertRule1 := models.RuleGen.With(models.RuleGen.WithOrgID(firstOrgID)).GenerateRef()
folderWithRuleGroup1 := fmt.Sprintf("%s;%s", ruleStore.getNamespaceTitle(alertRule1.NamespaceUID), alertRule1.RuleGroup)
ruleStore.PutRule(ctx, alertRule1)
_, err := sch.updateSchedulableAlertRules(ctx) // to update folderTitles
require.NoError(t, err)
t.Run("it should show one active rule in a single group", func(t *testing.T) {
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, by type and state.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active", type="alerting"} 1
`, alertRule1.OrgID, folderWithRuleGroup1)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
require.NoError(t, err)
// Add a new rule alertRule2 and check that it is reflected in the metrics
alertRule2 := models.RuleGen.With(models.RuleGen.WithOrgID(firstOrgID)).GenerateRef()
folderWithRuleGroup2 := fmt.Sprintf("%s;%s", ruleStore.getNamespaceTitle(alertRule2.NamespaceUID), alertRule2.RuleGroup)
ruleStore.PutRule(ctx, alertRule2)
_, err = sch.updateSchedulableAlertRules(ctx) // to update folderTitles
require.NoError(t, err)
t.Run("it should show two active rules in two groups", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRule1, alertRule2})
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, by type and state.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active",type="alerting"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="active",type="alerting"} 1
`, alertRule1.OrgID, folderWithRuleGroup1, folderWithRuleGroup2)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
require.NoError(t, err)
// Now remove the alertRule2
t.Run("it should show one active rules in one groups", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRule1, alertRule2})
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, by type and state.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active",type="alerting"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="active",type="alerting"} 1
`, alertRule1.OrgID, folderWithRuleGroup1, folderWithRuleGroup2)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
require.NoError(t, err)
// and remove the alertRule1 so there should be no metrics now
t.Run("it should show one active rules in one groups", func(t *testing.T) {
expectedMetric := ""
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
require.ErrorContains(t, err, "expected metric name(s) not found: [grafana_alerting_rule_group_rules]")
t.Run("rule_groups metric should reflect the current state", func(t *testing.T) {
const firstOrgID int64 = 1
const secondOrgID int64 = 2
// Without any rules there are no metrics
t.Run("it should not show metrics", func(t *testing.T) {
expectedMetric := ""
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
require.ErrorContains(t, err, "expected metric name(s) not found: [grafana_alerting_rule_groups]")
alertRule1 := models.RuleGen.With(models.RuleGen.WithOrgID(firstOrgID)).GenerateRef()
t.Run("it should show one rule group in a single org", func(t *testing.T) {
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_groups The number of alert rule groups
# TYPE grafana_alerting_rule_groups gauge
grafana_alerting_rule_groups{org="%[1]d"} 1
`, alertRule1.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
require.NoError(t, err)
alertRule2 := models.RuleGen.With(models.RuleGen.WithOrgID(secondOrgID)).GenerateRef()
t.Run("it should show two rule groups in two orgs", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRule1, alertRule2})
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_groups The number of alert rule groups
# TYPE grafana_alerting_rule_groups gauge
grafana_alerting_rule_groups{org="%[1]d"} 1
grafana_alerting_rule_groups{org="%[2]d"} 1
`, alertRule1.OrgID, alertRule2.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
require.NoError(t, err)
t.Run("when the first rule is removed it should show one rule group", func(t *testing.T) {
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_groups The number of alert rule groups
# TYPE grafana_alerting_rule_groups gauge
grafana_alerting_rule_groups{org="%[1]d"} 1
`, alertRule2.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
require.NoError(t, err)
t.Run("simple_routing_rules metric should reflect the current state", func(t *testing.T) {
const firstOrgID int64 = 1
const secondOrgID int64 = 2
// Has no NotificationSettings, should not be in the metrics
alertRuleWithoutNotificationSettings := models.RuleGen.With(
// Without any rules there are no metrics
t.Run("it should not show metrics", func(t *testing.T) {
expectedMetric := ""
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_simple_routing_rules")
require.ErrorContains(t, err, "expected metric name(s) not found: [grafana_alerting_simple_routing_rules]")
alertRule1 := models.RuleGen.With(
t.Run("it should show one rule in a single org", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRuleWithoutNotificationSettings, alertRule1})
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_simple_routing_rules The number of alert rules using simplified routing.
# TYPE grafana_alerting_simple_routing_rules gauge
grafana_alerting_simple_routing_rules{org="%[1]d"} 1
`, alertRule1.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_simple_routing_rules")
require.NoError(t, err)
alertRule2 := models.RuleGen.With(
t.Run("it should show two rules in two orgs", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRuleWithoutNotificationSettings, alertRule1, alertRule2})
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_simple_routing_rules The number of alert rules using simplified routing.
# TYPE grafana_alerting_simple_routing_rules gauge
grafana_alerting_simple_routing_rules{org="%[1]d"} 1
grafana_alerting_simple_routing_rules{org="%[2]d"} 1
`, alertRule1.OrgID, alertRule2.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_simple_routing_rules")
require.NoError(t, err)
t.Run("after removing one of the rules it should show one present rule and two org", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRuleWithoutNotificationSettings, alertRule2})
// Because alertRuleWithoutNotificationSettings.orgID is present,
// the metric is also present but set to 0 because the org has no rules with NotificationSettings.
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_simple_routing_rules The number of alert rules using simplified routing.
# TYPE grafana_alerting_simple_routing_rules gauge
grafana_alerting_simple_routing_rules{org="%[2]d"} 1
`, alertRuleWithoutNotificationSettings.OrgID, alertRule2.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_simple_routing_rules")
require.NoError(t, err)
t.Run("after removing all rules it should not show any metrics", func(t *testing.T) {
expectedMetric := ""
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_simple_routing_rules")
require.ErrorContains(t, err, "expected metric name(s) not found: [grafana_alerting_simple_routing_rules]")
t.Run("simplified_editor_rules metric should reflect the current state", func(t *testing.T) {
const firstOrgID int64 = 1
const secondOrgID int64 = 2
alertRuleWithAdvancedSettings := models.RuleGen.With(
// The rule does not have simplified editor enabled, should not be in the metrics
t.Run("it should not show metrics", func(t *testing.T) {
expectedMetric := ""
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_simplified_editor_rules")
require.ErrorContains(t, err, "expected metric name(s) not found: [grafana_alerting_simplified_editor_rules]")
alertRule1 := models.RuleGen.With(
t.Run("it should show one rule in a single org", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRuleWithAdvancedSettings, alertRule1})
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_simplified_editor_rules The number of alert rules using simplified editor settings.
# TYPE grafana_alerting_simplified_editor_rules gauge
grafana_alerting_simplified_editor_rules{org="%[1]d",setting="simplified_query_and_expressions_section"} 1
`, alertRule1.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_simplified_editor_rules")
require.NoError(t, err)
alertRule2 := models.RuleGen.With(
t.Run("it should show two rules in two orgs", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRuleWithAdvancedSettings, alertRule1, alertRule2})
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_simplified_editor_rules The number of alert rules using simplified editor settings.
# TYPE grafana_alerting_simplified_editor_rules gauge
grafana_alerting_simplified_editor_rules{org="%[1]d",setting="simplified_query_and_expressions_section"} 1
grafana_alerting_simplified_editor_rules{org="%[2]d",setting="simplified_query_and_expressions_section"} 1
`, alertRule1.OrgID, alertRule2.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_simplified_editor_rules")
require.NoError(t, err)
t.Run("after removing one of the rules it should show one present rule and one org", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRuleWithAdvancedSettings, alertRule2})
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_simplified_editor_rules The number of alert rules using simplified editor settings.
# TYPE grafana_alerting_simplified_editor_rules gauge
grafana_alerting_simplified_editor_rules{org="%d",setting="simplified_query_and_expressions_section"} 1
`, alertRule2.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_simplified_editor_rules")
require.NoError(t, err)
t.Run("after removing all rules it should not show any metrics", func(t *testing.T) {
expectedMetric := ""
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_simplified_editor_rules")
require.ErrorContains(t, err, "expected metric name(s) not found: [grafana_alerting_simplified_editor_rules]")
t.Run("rule_groups metric should reflect the current state", func(t *testing.T) {
const firstOrgID int64 = 1
const secondOrgID int64 = 2
// Without any rules there are no metrics
t.Run("it should not show metrics", func(t *testing.T) {
expectedMetric := ""
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
require.ErrorContains(t, err, "expected metric name(s) not found: [grafana_alerting_rule_groups]")
alertRule1 := models.RuleGen.With(models.RuleGen.WithOrgID(firstOrgID)).GenerateRef()
t.Run("it should show one rule group in a single org", func(t *testing.T) {
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_groups The number of alert rule groups
# TYPE grafana_alerting_rule_groups gauge
grafana_alerting_rule_groups{org="%[1]d"} 1
`, alertRule1.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
require.NoError(t, err)
alertRule2 := models.RuleGen.With(models.RuleGen.WithOrgID(secondOrgID)).GenerateRef()
t.Run("it should show two rule groups in two orgs", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRule1, alertRule2})
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_groups The number of alert rule groups
# TYPE grafana_alerting_rule_groups gauge
grafana_alerting_rule_groups{org="%[1]d"} 1
grafana_alerting_rule_groups{org="%[2]d"} 1
`, alertRule1.OrgID, alertRule2.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
require.NoError(t, err)
t.Run("when the first rule is removed it should show one rule group", func(t *testing.T) {
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_groups The number of alert rule groups
# TYPE grafana_alerting_rule_groups gauge
grafana_alerting_rule_groups{org="%[1]d"} 1
`, alertRule2.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
require.NoError(t, err)
// TestSchedule_deleteAlertRule exercises rule deletion on the scheduler in two
// situations: a rule that has an entry in the scheduler registry, and a rule
// key that was never registered.
func TestSchedule_deleteAlertRule(t *testing.T) {
t.Run("when rule exists", func(t *testing.T) {
t.Run("it should stop evaluation loop and remove the controller from registry", func(t *testing.T) {
// Every optional dependency is nil, so setupScheduler falls back to its own defaults.
sch := setupScheduler(t, nil, nil, nil, nil, nil)
ruleFactory := ruleFactoryFromScheduler(sch)
rule := models.RuleGen.GenerateRef()
key := rule.GetKey()
// Obtain the registry entry for the rule (presumably created on demand, going by the name getOrCreate).
info, _ := sch.registry.getOrCreate(context.Background(), rule, ruleFactory)
// NOTE(review): no deletion call is visible between the registration above and
// the assertions below — confirm the step that deletes the rule is present.
// Expected outcome: the rule's context reports errRuleDeleted and the registry
// no longer knows the key.
require.ErrorIs(t, info.(*alertRule).ctx.Err(), errRuleDeleted)
require.False(t, sch.registry.exists(key))
t.Run("when rule does not exist", func(t *testing.T) {
t.Run("should exit", func(t *testing.T) {
sch := setupScheduler(t, nil, nil, nil, nil, nil)
// A random key that was never handed to this scheduler's registry.
key := models.GenerateRuleKey(rand.Int63())
// setupScheduler assembles a scheduler for tests out of the supplied dependencies.
//
// rs, is, registry, senderMock and evalMock may each be nil; a nil value is
// replaced by a default built here: a fake rules store, a fake instance store,
// a pedantic Prometheus registry, a fresh sender mock, and an evaluator factory
// backed by an expression service with expressions enabled.
//
// The scheduler and the state manager are both given the same mocked clock and
// the same test tracer. The returned value is the result of NewScheduler.
func setupScheduler(t *testing.T, rs *fakeRulesStore, is *state.FakeInstanceStore, registry *prometheus.Registry, senderMock *SyncAlertsSenderMock, evalMock eval.EvaluatorFactory) *schedule {
testTracer := tracing.InitializeTracerForTest()
mockedClock := clock.NewMock()
// Substitute defaults for the dependencies the caller left nil.
if rs == nil {
rs = newFakeRulesStore()
if is == nil {
is = &state.FakeInstanceStore{}
var evaluator = evalMock
if evalMock == nil {
evaluator = eval.NewEvaluatorFactory(setting.UnifiedAlertingSettings{}, &datasources.FakeCacheService{}, expr.ProvideService(&setting.Cfg{ExpressionsEnabled: true}, nil, nil, featuremgmt.WithFeatures(), nil, tracing.InitializeTracerForTest()))
if registry == nil {
registry = prometheus.NewPedanticRegistry()
// Both the scheduler metrics and the state metrics below come from this one NGAlert metrics object.
m := metrics.NewNGAlert(registry)
appUrl := &url.URL{
Scheme: "http",
Host: "localhost",
if senderMock == nil {
senderMock = NewSyncAlertsSenderMock()
// Expect Send with any three arguments.
senderMock.EXPECT().Send(mock.Anything, mock.Anything, mock.Anything).Return()
// Unified alerting settings used as the source of the scheduler configuration values.
cfg := setting.UnifiedAlertingSettings{
BaseInterval: time.Second,
MaxAttempts: 1,
RecordingRules: setting.RecordingRuleSettings{
Enabled: true,
fakeRecordingWriter := writer.FakeWriter{}
// Scheduler configuration wired from the values prepared above.
schedCfg := SchedulerCfg{
BaseInterval: cfg.BaseInterval,
MaxAttempts: cfg.MaxAttempts,
C: mockedClock,
AppURL: appUrl,
EvaluatorFactory: evaluator,
RuleStore: rs,
RecordingRulesCfg: cfg.RecordingRules,
Metrics: m.GetSchedulerMetrics(),
AlertSender: senderMock,
Tracer: testTracer,
Log: log.New("ngalert.scheduler"),
RecordingWriter: fakeRecordingWriter,
// State manager configuration; it reuses the mocked clock and the test tracer given to the scheduler.
managerCfg := state.ManagerCfg{
Metrics: m.GetStateMetrics(),
ExternalURL: nil,
InstanceStore: is,
Images: &state.NoopImageService{},
Clock: mockedClock,
Historian: &state.FakeHistorian{},
Tracer: testTracer,
Log: log.New("ngalert.state.manager"),
MaxStateSaveConcurrency: 1,
// The persister and the state manager are built from the same manager configuration.
syncStatePersister := state.NewSyncStatePersisiter(log.New("ngalert.state.manager.perist"), managerCfg)
st := state.NewManager(managerCfg, syncStatePersister)
return NewScheduler(schedCfg, st)
// withQueryForState returns a rule mutator that configures a rule to produce
// the requested evaluation result.
//
// The mutator sets the rule condition to "A", installs a query with RefID "A"
// against the expression datasource whose model is the JSON chosen for
// evalResult, and sets rule.For to rule.IntervalSeconds multiplied by a
// per-state multiplier, expressed in seconds. The multiplier is a random value
// from 1 to 9 for eval.Pending and 0 otherwise.
//
// Evaluation results this helper does not support are reported through
// require.Fail with a "not supported yet" message.
func withQueryForState(t *testing.T, evalResult eval.State) models.AlertRuleMutator {
var expression string
// Number of rule intervals that make up rule.For; it stays 0 unless Pending is requested.
var forMultimplier int64 = 0
switch evalResult {
case eval.Normal:
expression = `{
"datasourceUid": "__expr__",
"expression":"2 + 1 < 1"
case eval.Pending, eval.Alerting:
expression = `{
"datasourceUid": "__expr__",
"expression":"2 + 2 > 1"
// Pending and Alerting share the same expression; only Pending gets a multiplier
// (random, 1 through 9), so rule.For becomes that many rule intervals.
if evalResult == eval.Pending {
forMultimplier = rand.Int63n(9) + 1
case eval.Error:
expression = `{
"datasourceUid": "__expr__",
require.Fail(t, fmt.Sprintf("Alert rule with desired evaluation result '%s' is not supported yet", evalResult))
return func(rule *models.AlertRule) {
rule.Condition = "A"
rule.Data = []models.AlertQuery{
DatasourceUID: expr.DatasourceUID,
Model: json.RawMessage(expression),
RelativeTimeRange: models.RelativeTimeRange{
From: models.Duration(5 * time.Hour),
To: models.Duration(3 * time.Hour),
RefID: "A",
rule.For = time.Duration(rule.IntervalSeconds*forMultimplier) * time.Second
// assertEvalRun reads evaluation notifications from ch and checks them against
// the expected rule keys.
//
// A notification whose key is not in the outstanding set fails the test via
// t.Fatalf. Keys are removed from the set once seen, so a repeated notification
// for the same key is treated the same way. Each accepted notification is
// logged and its time is compared with tick. The wait is bounded by a
// one-second timer; t.Fatal("cycle has expired") sits on the timer branch.
func assertEvalRun(t *testing.T, ch <-chan evalAppliedInfo, tick time.Time, keys ...models.AlertRuleKey) {
timeout := time.After(time.Second)
// Set of keys still waiting for an evaluation notification.
expected := make(map[models.AlertRuleKey]struct{}, len(keys))
for _, k := range keys {
expected[k] = struct{}{}
for {
select {
case info := <-ch:
// Reject notifications for rules that are not (or no longer) outstanding.
_, ok := expected[info.alertDefKey]
if !ok {
t.Fatalf("alert rule: %v should not have been evaluated at: %v", info.alertDefKey, info.now)
t.Logf("alert rule: %v evaluated at: %v", info.alertDefKey, info.now)
// The evaluation must be stamped with the tick under test.
assert.Equal(t, tick, info.now)
delete(expected, info.alertDefKey)
// NOTE(review): no exit statement is visible under the emptiness checks below —
// confirm the loop ends once every expected key has been seen.
if len(expected) == 0 {
case <-timeout:
if len(expected) == 0 {
t.Fatal("cycle has expired")
// assertStopRun reads rule keys from ch, each one signalling a stopped rule,
// and checks them against the expected keys.
//
// Every received key is logged, asserted to be in the outstanding set with
// assert.True, and then removed from that set. The wait is bounded by a
// one-second timer; t.Fatal("cycle has expired") sits on the timer branch.
func assertStopRun(t *testing.T, ch <-chan models.AlertRuleKey, keys ...models.AlertRuleKey) {
timeout := time.After(time.Second)
// Set of keys still waiting for a stop notification.
expected := make(map[models.AlertRuleKey]struct{}, len(keys))
for _, k := range keys {
expected[k] = struct{}{}
for {
select {
case alertDefKey := <-ch:
_, ok := expected[alertDefKey]
t.Logf("alert rule: %v stopped", alertDefKey)
// A key outside the outstanding set is recorded as an assertion failure.
assert.True(t, ok)
delete(expected, alertDefKey)
// NOTE(review): no exit statement is visible under the emptiness checks below —
// confirm the loop ends once every expected key has been seen.
if len(expected) == 0 {
case <-timeout:
if len(expected) == 0 {
t.Fatal("cycle has expired")
// assertScheduledContains requires that at least one item in scheduled refers
// to a rule with the same key as rule. On failure the message includes the
// wanted key and title together with the full list of scheduled items.
func assertScheduledContains(t *testing.T, scheduled []readyToRunItem, rule *models.AlertRule) {
contains := false
// Scan the scheduled items, matching on rule key.
for _, sch := range scheduled {
if sch.rule.GetKey() == rule.GetKey() {
contains = true
require.True(t, contains, "Expected a scheduled rule with key %s title %s but didn't get one, scheduled rules were %v", rule.GetKey(), rule.Title, scheduled)