Alerting: Add rule_group label to grafana_alerting_rule_group_rules metric (#88289)

* Alerting: Add rule_group label to grafana_alerting_rule_group_rules metric (#62361)

* Alerting: Delete rule group metrics when the rule group is deleted

This commit addresses the issue where the GroupRules metric (a GaugeVec)
keeps its value and is not deleted when an alert rule is removed from the rule registry.
Previously, when an alert rule with orgID=1 was active, the metric was:

  grafana_alerting_rule_group_rules{org="1",state="active"} 1

However, after deleting this rule, subsequent calls to updateRulesMetrics
did not update the gauge value, causing the metric to incorrectly remain at 1.

The fix ensures that when updateRulesMetrics is called, it also deletes
the group rule metrics with the corresponding label values once they are no longer needed.
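
For illustration only (not part of this change): this is the standard prometheus client GaugeVec behaviour, where a labelled series keeps its last value until it is explicitly removed with DeleteLabelValues. A minimal Go sketch with made-up label values:

  package main

  import (
      "fmt"

      "github.com/prometheus/client_golang/prometheus"
  )

  func main() {
      // Same shape as the scheduler's GroupRules metric after this change.
      groupRules := prometheus.NewGaugeVec(prometheus.GaugeOpts{
          Namespace: "grafana",
          Subsystem: "alerting",
          Name:      "rule_group_rules",
          Help:      "The number of alert rules that are scheduled, both active and paused.",
      }, []string{"org", "state", "rule_group"})

      // A rule is scheduled: the series is created and set.
      groupRules.WithLabelValues("1", "active", "folder;group").Set(1)

      // Deleting the rule does not reset the series; it keeps reporting 1
      // until the series itself is removed, which is what updateRulesMetrics now does.
      deleted := groupRules.DeleteLabelValues("1", "active", "folder;group")
      fmt.Println(deleted) // true: the stale series is gone
  }

With the new rule_group label, per-group counts can also be aggregated in PromQL, e.g. sum by (org, rule_group) (grafana_alerting_rule_group_rules).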
Alexander Akhmetov 2024-08-13 13:27:23 +02:00 committed by GitHub
parent 71b56dbb95
commit 149f02aebe
6 changed files with 427 additions and 58 deletions

View File

@@ -121,7 +121,6 @@ func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler {
},
[]string{"org"},
),
// TODO: partition on rule group as well as tenant, similar to loki|cortex.
GroupRules: promauto.With(r).NewGaugeVec(
prometheus.GaugeOpts{
Namespace: Namespace,
@@ -129,7 +128,7 @@ func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler {
Name: "rule_group_rules",
Help: "The number of alert rules that are scheduled, both active and paused.",
},
[]string{"org", "state"},
[]string{"org", "state", "rule_group"},
),
Groups: promauto.With(r).NewGaugeVec(
prometheus.GaugeOpts{

View File

@@ -467,6 +467,11 @@ type AlertRuleGroupKey struct {
RuleGroup string
}
type AlertRuleGroupKeyWithFolderFullpath struct {
AlertRuleGroupKey
FolderFullpath string
}
func (k AlertRuleGroupKey) String() string {
return fmt.Sprintf("{orgID: %d, namespaceUID: %s, groupName: %s}", k.OrgID, k.NamespaceUID, k.RuleGroup)
}

View File

@@ -32,16 +32,32 @@ func sortedUIDs(alertRules []*models.AlertRule) []string {
return uids
}
// updateRulesMetrics updates metrics for alert rules.
// It keeps state in the scheduler between calls so that metrics for rules that are no longer present can be deleted.
func (sch *schedule) updateRulesMetrics(alertRules []*models.AlertRule) {
rulesPerOrg := make(map[int64]int64) // orgID -> count
orgsPaused := make(map[int64]int64) // orgID -> count
orgsNfSettings := make(map[int64]int64) // orgID -> count
groupsPerOrg := make(map[int64]map[string]struct{}) // orgID -> set of groups
rulesPerOrgFolderGroup := make(map[models.AlertRuleGroupKeyWithFolderFullpath]int64) // AlertRuleGroupKeyWithFolderFullpath -> count
rulesPerOrgFolderGroupPaused := make(map[models.AlertRuleGroupKeyWithFolderFullpath]int64) // AlertRuleGroupKeyWithFolderFullpath -> count
orgsNfSettings := make(map[int64]int64) // orgID -> count
groupsPerOrg := make(map[int64]map[string]struct{}) // orgID -> set of groups
// Remember what orgs and alert groups we process in the current update metrics call,
// so we can delete metrics for orgs and groups that are no longer present in the new state.
updateMetricsForOrgsAndGroups := map[int64]map[models.AlertRuleGroupKeyWithFolderFullpath]struct{}{} // orgID -> set of AlertRuleGroupKeyWithFolderFullpath
for _, rule := range alertRules {
rulesPerOrg[rule.OrgID]++
key := models.AlertRuleGroupKeyWithFolderFullpath{
AlertRuleGroupKey: rule.GetGroupKey(),
FolderFullpath: sch.schedulableAlertRules.folderTitles[rule.GetFolderKey()],
}
rulesPerOrgFolderGroup[key]++
if _, ok := updateMetricsForOrgsAndGroups[rule.OrgID]; !ok {
updateMetricsForOrgsAndGroups[rule.OrgID] = make(map[models.AlertRuleGroupKeyWithFolderFullpath]struct{})
}
updateMetricsForOrgsAndGroups[rule.OrgID][key] = struct{}{}
if rule.IsPaused {
orgsPaused[rule.OrgID]++
rulesPerOrgFolderGroupPaused[key]++
}
if len(rule.NotificationSettings) > 0 {
@@ -56,20 +72,59 @@ func (sch *schedule) updateRulesMetrics(alertRules []*models.AlertRule) {
orgGroups[rule.RuleGroup] = struct{}{}
}
for orgID, numRules := range rulesPerOrg {
numRulesPaused := orgsPaused[orgID]
numRulesNfSettings := orgsNfSettings[orgID]
sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(orgID), metrics.AlertRuleActiveLabelValue).Set(float64(numRules - numRulesPaused))
sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(orgID), metrics.AlertRulePausedLabelValue).Set(float64(numRulesPaused))
sch.metrics.SimpleNotificationRules.WithLabelValues(fmt.Sprint(orgID)).Set(float64(numRulesNfSettings))
for key, numRules := range rulesPerOrgFolderGroup {
numRulesPaused := rulesPerOrgFolderGroupPaused[key]
ruleGroupLabelValue := makeRuleGroupLabelValue(key)
sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(key.OrgID), metrics.AlertRuleActiveLabelValue, ruleGroupLabelValue).Set(float64(numRules - numRulesPaused))
sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(key.OrgID), metrics.AlertRulePausedLabelValue, ruleGroupLabelValue).Set(float64(numRulesPaused))
}
for orgID, groups := range groupsPerOrg {
sch.metrics.Groups.WithLabelValues(fmt.Sprint(orgID)).Set(float64(len(groups)))
for orgID := range updateMetricsForOrgsAndGroups {
sch.metrics.SimpleNotificationRules.WithLabelValues(fmt.Sprint(orgID)).Set(float64(orgsNfSettings[orgID]))
sch.metrics.Groups.WithLabelValues(fmt.Sprint(orgID)).Set(float64(len(groupsPerOrg[orgID])))
}
// While these are the rules that we iterate over, at the moment there's no 100% guarantee that they'll be
// scheduled as rules could be removed before we get a chance to evaluate them.
sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))
sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(alertRules)))
// Delete metrics for rule groups and orgs that are no longer present in the new state
for orgID, alertRuleGroupsMap := range sch.lastUpdatedMetricsForOrgsAndGroups {
if orgOrGroupDeleted(updateMetricsForOrgsAndGroups, orgID, nil) {
sch.metrics.SimpleNotificationRules.DeleteLabelValues(fmt.Sprint(orgID))
sch.metrics.Groups.DeleteLabelValues(fmt.Sprint(orgID))
}
for key := range alertRuleGroupsMap {
if orgOrGroupDeleted(updateMetricsForOrgsAndGroups, orgID, &key) {
ruleGroupLabelValue := makeRuleGroupLabelValue(key)
sch.metrics.GroupRules.DeleteLabelValues(fmt.Sprint(key.AlertRuleGroupKey.OrgID), metrics.AlertRuleActiveLabelValue, ruleGroupLabelValue)
sch.metrics.GroupRules.DeleteLabelValues(fmt.Sprint(key.AlertRuleGroupKey.OrgID), metrics.AlertRulePausedLabelValue, ruleGroupLabelValue)
}
}
}
// remember the current state for the next call
sch.lastUpdatedMetricsForOrgsAndGroups = updateMetricsForOrgsAndGroups
}
// makeRuleGroupLabelValue returns a string that can be used as a label (rule_group) value for alert rule group metrics.
func makeRuleGroupLabelValue(key models.AlertRuleGroupKeyWithFolderFullpath) string {
return fmt.Sprintf("%s;%s", key.FolderFullpath, key.AlertRuleGroupKey.RuleGroup)
}
// orgOrGroupDeleted returns true if the org or group is no longer present in the new update metrics state.
func orgOrGroupDeleted(updateMetrics map[int64]map[models.AlertRuleGroupKeyWithFolderFullpath]struct{}, orgID int64, alertRuleGroupKey *models.AlertRuleGroupKeyWithFolderFullpath) bool {
if _, ok := updateMetrics[orgID]; !ok {
return true
}
if alertRuleGroupKey != nil {
if _, ok := updateMetrics[orgID][*alertRuleGroupKey]; !ok {
return true
}
}
return false
}
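
A worked example of the new label value and cleanup (illustration only; the folder path and group name are made up, while the types and functions are the ones added in this change):

  key := models.AlertRuleGroupKeyWithFolderFullpath{
      AlertRuleGroupKey: models.AlertRuleGroupKey{OrgID: 1, NamespaceUID: "folder-uid", RuleGroup: "apdex"},
      FolderFullpath:    "Team A/Latency",
  }
  // makeRuleGroupLabelValue(key) returns "Team A/Latency;apdex", so updateRulesMetrics writes
  //   grafana_alerting_rule_group_rules{org="1",rule_group="Team A/Latency;apdex",state="active"}
  // On a later call where this key is absent from updateMetricsForOrgsAndGroups,
  // orgOrGroupDeleted reports true and the same label values are passed to
  // GroupRules.DeleteLabelValues, dropping the stale series.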

View File

@@ -87,6 +87,10 @@ type schedule struct {
featureToggles featuremgmt.FeatureToggles
metrics *metrics.Scheduler
// lastUpdatedMetricsForOrgsAndGroups contains the AlertRuleGroupKeyWithFolderFullpaths that
// were passed to updateRulesMetrics in the current tick. It is used to
// delete metrics for rules/groups that are no longer present.
lastUpdatedMetricsForOrgsAndGroups map[int64]map[ngmodels.AlertRuleGroupKeyWithFolderFullpath]struct{} // orgID -> set of AlertRuleGroupKeyWithFolderFullpath
alertsSender AlertsSender
minRuleInterval time.Duration
@@ -130,24 +134,25 @@ func NewScheduler(cfg SchedulerCfg, stateManager *state.Manager) *schedule {
}
sch := schedule{
registry: newRuleRegistry(),
maxAttempts: cfg.MaxAttempts,
clock: cfg.C,
baseInterval: cfg.BaseInterval,
log: cfg.Log,
evaluatorFactory: cfg.EvaluatorFactory,
ruleStore: cfg.RuleStore,
metrics: cfg.Metrics,
appURL: cfg.AppURL,
disableGrafanaFolder: cfg.DisableGrafanaFolder,
jitterEvaluations: cfg.JitterEvaluations,
featureToggles: cfg.FeatureToggles,
stateManager: stateManager,
minRuleInterval: cfg.MinRuleInterval,
schedulableAlertRules: alertRulesRegistry{rules: make(map[ngmodels.AlertRuleKey]*ngmodels.AlertRule)},
alertsSender: cfg.AlertSender,
tracer: cfg.Tracer,
recordingWriter: cfg.RecordingWriter,
registry: newRuleRegistry(),
maxAttempts: cfg.MaxAttempts,
clock: cfg.C,
baseInterval: cfg.BaseInterval,
log: cfg.Log,
evaluatorFactory: cfg.EvaluatorFactory,
ruleStore: cfg.RuleStore,
metrics: cfg.Metrics,
lastUpdatedMetricsForOrgsAndGroups: make(map[int64]map[ngmodels.AlertRuleGroupKeyWithFolderFullpath]struct{}),
appURL: cfg.AppURL,
disableGrafanaFolder: cfg.DisableGrafanaFolder,
jitterEvaluations: cfg.JitterEvaluations,
featureToggles: cfg.FeatureToggles,
stateManager: stateManager,
minRuleInterval: cfg.MinRuleInterval,
schedulableAlertRules: alertRulesRegistry{rules: make(map[ngmodels.AlertRuleKey]*ngmodels.AlertRule)},
alertsSender: cfg.AlertSender,
tracer: cfg.Tracer,
recordingWriter: cfg.RecordingWriter,
}
return &sch

View File

@@ -107,6 +107,8 @@ func TestProcessTicks(t *testing.T) {
alertRule1 := gen.With(gen.WithOrgID(mainOrgID), gen.WithInterval(cfg.BaseInterval), gen.WithTitle("rule-1")).GenerateRef()
ruleStore.PutRule(ctx, alertRule1)
folderWithRuleGroup1 := fmt.Sprintf("%s;%s", ruleStore.getNamespaceTitle(alertRule1.NamespaceUID), alertRule1.RuleGroup)
t.Run("on 1st tick alert rule should be evaluated", func(t *testing.T) {
tick = tick.Add(cfg.BaseInterval)
@@ -124,9 +126,9 @@ func TestProcessTicks(t *testing.T) {
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
`, alertRule1.OrgID)
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="paused"} 0
`, alertRule1.OrgID, folderWithRuleGroup1)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
require.NoError(t, err)
@@ -136,6 +138,8 @@ func TestProcessTicks(t *testing.T) {
alertRule2 := gen.With(gen.WithOrgID(mainOrgID), gen.WithInterval(3*cfg.BaseInterval), gen.WithTitle("rule-2")).GenerateRef()
ruleStore.PutRule(ctx, alertRule2)
folderWithRuleGroup2 := fmt.Sprintf("%s;%s", ruleStore.getNamespaceTitle(alertRule2.NamespaceUID), alertRule2.RuleGroup)
t.Run("on 2nd tick first alert rule should be evaluated", func(t *testing.T) {
tick = tick.Add(cfg.BaseInterval)
scheduled, stopped, updated := sched.processTick(ctx, dispatcherGroup, tick)
@@ -148,13 +152,15 @@ func TestProcessTicks(t *testing.T) {
assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
})
t.Run("after 2nd tick rule metrics should report two active alert rules", func(t *testing.T) {
t.Run("after 2nd tick rule metrics should report two active alert rules in two groups", func(t *testing.T) {
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 2
grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
`, alertRule1.OrgID)
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="paused"} 0
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="paused"} 0
`, alertRule1.OrgID, folderWithRuleGroup1, folderWithRuleGroup2)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
require.NoError(t, err)
@@ -204,13 +210,15 @@ func TestProcessTicks(t *testing.T) {
assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
})
t.Run("after 5th tick rule metrics should report one active and one paused alert rules", func(t *testing.T) {
t.Run("after 5th tick rule metrics should report one active and one paused alert rules in two groups", func(t *testing.T) {
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 1
`, alertRule1.OrgID)
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active"} 0
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="paused"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="paused"} 0
`, alertRule1.OrgID, folderWithRuleGroup1, folderWithRuleGroup2)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
require.NoError(t, err)
@@ -237,13 +245,15 @@ func TestProcessTicks(t *testing.T) {
assertEvalRun(t, evalAppliedCh, tick, keys...)
})
t.Run("after 6th tick rule metrics should report two paused alert rules", func(t *testing.T) {
t.Run("after 6th tick rule metrics should report two paused alert rules in two groups", func(t *testing.T) {
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 0
grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 2
`, alertRule1.OrgID)
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active"} 0
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="paused"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="active"} 0
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="paused"} 1
`, alertRule1.OrgID, folderWithRuleGroup1, folderWithRuleGroup2)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
require.NoError(t, err)
@@ -265,13 +275,15 @@ func TestProcessTicks(t *testing.T) {
assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
})
t.Run("after 7th tick rule metrics should report two active alert rules", func(t *testing.T) {
t.Run("after 7th tick rule metrics should report two active alert rules in two groups", func(t *testing.T) {
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 2
grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
`, alertRule1.OrgID)
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="paused"} 0
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="paused"} 0
`, alertRule1.OrgID, folderWithRuleGroup1, folderWithRuleGroup2)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
require.NoError(t, err)
@@ -295,10 +307,10 @@ func TestProcessTicks(t *testing.T) {
t.Run("after 8th tick rule metrics should report one active alert rule", func(t *testing.T) {
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
`, alertRule1.OrgID)
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="paused"} 0
`, alertRule2.OrgID, folderWithRuleGroup2)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
require.NoError(t, err)
@@ -398,6 +410,299 @@ func TestProcessTicks(t *testing.T) {
})
}
func TestSchedule_updateRulesMetrics(t *testing.T) {
ruleStore := newFakeRulesStore()
reg := prometheus.NewPedanticRegistry()
sch := setupScheduler(t, ruleStore, nil, reg, nil, nil)
ctx := context.Background()
const firstOrgID int64 = 1
t.Run("grafana_alerting_rule_group_rules metric should reflect the current state", func(t *testing.T) {
// Without any rules there are no metrics
t.Run("it should not show metrics", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{})
expectedMetric := ""
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
require.NoError(t, err)
})
alertRule1 := models.RuleGen.With(models.RuleGen.WithOrgID(firstOrgID)).GenerateRef()
folderWithRuleGroup1 := fmt.Sprintf("%s;%s", ruleStore.getNamespaceTitle(alertRule1.NamespaceUID), alertRule1.RuleGroup)
ruleStore.PutRule(ctx, alertRule1)
_, err := sch.updateSchedulableAlertRules(ctx) // to update folderTitles
require.NoError(t, err)
t.Run("it should show one active rule in a single group", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRule1})
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="paused"} 0
`, alertRule1.OrgID, folderWithRuleGroup1)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
require.NoError(t, err)
})
// Add a new rule alertRule2 and check that it is reflected in the metrics
alertRule2 := models.RuleGen.With(models.RuleGen.WithOrgID(firstOrgID)).GenerateRef()
folderWithRuleGroup2 := fmt.Sprintf("%s;%s", ruleStore.getNamespaceTitle(alertRule2.NamespaceUID), alertRule2.RuleGroup)
ruleStore.PutRule(ctx, alertRule2)
_, err = sch.updateSchedulableAlertRules(ctx) // to update folderTitles
require.NoError(t, err)
t.Run("it should show two active rules in two groups", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRule1, alertRule2})
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="paused"} 0
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="paused"} 0
`, alertRule1.OrgID, folderWithRuleGroup1, folderWithRuleGroup2)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
require.NoError(t, err)
})
// Now remove the alertRule2
t.Run("it should show one active rules in one groups", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRule1, alertRule2})
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="paused"} 0
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="paused"} 0
`, alertRule1.OrgID, folderWithRuleGroup1, folderWithRuleGroup2)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
require.NoError(t, err)
})
// and remove the alertRule1 so there should be no metrics now
t.Run("it should show one active rules in one groups", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{})
expectedMetric := ""
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
require.NoError(t, err)
})
})
t.Run("rule_groups metric should reflect the current state", func(t *testing.T) {
const firstOrgID int64 = 1
const secondOrgID int64 = 2
// Without any rules there are no metrics
t.Run("it should not show metrics", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{})
expectedMetric := ""
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
require.NoError(t, err)
})
alertRule1 := models.RuleGen.With(models.RuleGen.WithOrgID(firstOrgID)).GenerateRef()
t.Run("it should show one rule group in a single org", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRule1})
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_groups The number of alert rule groups
# TYPE grafana_alerting_rule_groups gauge
grafana_alerting_rule_groups{org="%[1]d"} 1
`, alertRule1.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
require.NoError(t, err)
})
alertRule2 := models.RuleGen.With(models.RuleGen.WithOrgID(secondOrgID)).GenerateRef()
t.Run("it should show two rule groups in two orgs", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRule1, alertRule2})
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_groups The number of alert rule groups
# TYPE grafana_alerting_rule_groups gauge
grafana_alerting_rule_groups{org="%[1]d"} 1
grafana_alerting_rule_groups{org="%[2]d"} 1
`, alertRule1.OrgID, alertRule2.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
require.NoError(t, err)
})
t.Run("when the first rule is removed it should show one rule group", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRule2})
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_groups The number of alert rule groups
# TYPE grafana_alerting_rule_groups gauge
grafana_alerting_rule_groups{org="%[1]d"} 1
`, alertRule2.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
require.NoError(t, err)
})
})
t.Run("simple_routing_rules metric should reflect the current state", func(t *testing.T) {
const firstOrgID int64 = 1
const secondOrgID int64 = 2
// Has no NotificationSettings, should not be in the metrics
alertRuleWithoutNotificationSettings := models.RuleGen.With(
models.RuleGen.WithOrgID(firstOrgID),
models.RuleGen.WithNoNotificationSettings(),
).GenerateRef()
// Without any rules there are no metrics
t.Run("it should not show metrics", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRuleWithoutNotificationSettings})
// Because alertRuleWithoutNotificationSettings.orgID is present,
// the metric is also present but set to 0 because the org has no rules with NotificationSettings.
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_simple_routing_rules The number of alert rules using simplified routing.
# TYPE grafana_alerting_simple_routing_rules gauge
grafana_alerting_simple_routing_rules{org="%[1]d"} 0
`, alertRuleWithoutNotificationSettings.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_simple_routing_rules")
require.NoError(t, err)
})
alertRule1 := models.RuleGen.With(
models.RuleGen.WithOrgID(firstOrgID),
models.RuleGen.WithNotificationSettingsGen(models.NotificationSettingsGen()),
).GenerateRef()
t.Run("it should show one rule in a single org", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRuleWithoutNotificationSettings, alertRule1})
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_simple_routing_rules The number of alert rules using simplified routing.
# TYPE grafana_alerting_simple_routing_rules gauge
grafana_alerting_simple_routing_rules{org="%[1]d"} 1
`, alertRule1.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_simple_routing_rules")
require.NoError(t, err)
})
alertRule2 := models.RuleGen.With(
models.RuleGen.WithOrgID(secondOrgID),
models.RuleGen.WithNotificationSettingsGen(models.NotificationSettingsGen()),
).GenerateRef()
t.Run("it should show two rules in two orgs", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRuleWithoutNotificationSettings, alertRule1, alertRule2})
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_simple_routing_rules The number of alert rules using simplified routing.
# TYPE grafana_alerting_simple_routing_rules gauge
grafana_alerting_simple_routing_rules{org="%[1]d"} 1
grafana_alerting_simple_routing_rules{org="%[2]d"} 1
`, alertRule1.OrgID, alertRule2.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_simple_routing_rules")
require.NoError(t, err)
})
t.Run("after removing one of the rules it should show one present rule and two org", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRuleWithoutNotificationSettings, alertRule2})
// Because alertRuleWithoutNotificationSettings.orgID is present,
// the metric is also present but set to 0 because the org has no rules with NotificationSettings.
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_simple_routing_rules The number of alert rules using simplified routing.
# TYPE grafana_alerting_simple_routing_rules gauge
grafana_alerting_simple_routing_rules{org="%[1]d"} 0
grafana_alerting_simple_routing_rules{org="%[2]d"} 1
`, alertRuleWithoutNotificationSettings.OrgID, alertRule2.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_simple_routing_rules")
require.NoError(t, err)
})
t.Run("after removing all rules it should not show any metrics", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{})
expectedMetric := ""
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_simple_routing_rules")
require.NoError(t, err)
})
})
t.Run("rule_groups metric should reflect the current state", func(t *testing.T) {
const firstOrgID int64 = 1
const secondOrgID int64 = 2
// Without any rules there are no metrics
t.Run("it should not show metrics", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{})
expectedMetric := ""
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
require.NoError(t, err)
})
alertRule1 := models.RuleGen.With(models.RuleGen.WithOrgID(firstOrgID)).GenerateRef()
t.Run("it should show one rule group in a single org", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRule1})
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_groups The number of alert rule groups
# TYPE grafana_alerting_rule_groups gauge
grafana_alerting_rule_groups{org="%[1]d"} 1
`, alertRule1.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
require.NoError(t, err)
})
alertRule2 := models.RuleGen.With(models.RuleGen.WithOrgID(secondOrgID)).GenerateRef()
t.Run("it should show two rule groups in two orgs", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRule1, alertRule2})
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_groups The number of alert rule groups
# TYPE grafana_alerting_rule_groups gauge
grafana_alerting_rule_groups{org="%[1]d"} 1
grafana_alerting_rule_groups{org="%[2]d"} 1
`, alertRule1.OrgID, alertRule2.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
require.NoError(t, err)
})
t.Run("when the first rule is removed it should show one rule group", func(t *testing.T) {
sch.updateRulesMetrics([]*models.AlertRule{alertRule2})
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_groups The number of alert rule groups
# TYPE grafana_alerting_rule_groups gauge
grafana_alerting_rule_groups{org="%[1]d"} 1
`, alertRule2.OrgID)
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
require.NoError(t, err)
})
})
}
func TestSchedule_deleteAlertRule(t *testing.T) {
t.Run("when rule exists", func(t *testing.T) {
t.Run("it should stop evaluation loop and remove the controller from registry", func(t *testing.T) {

View File

@@ -66,7 +66,7 @@ func (f *fakeRulesStore) GetAlertRulesForScheduling(ctx context.Context, query *
query.ResultFoldersTitles = map[models.FolderKey]string{}
for _, rule := range f.rules {
query.ResultRules = append(query.ResultRules, rule)
key := models.FolderKey{OrgID: rule.OrgID, UID: rule.UID}
key := models.FolderKey{OrgID: rule.OrgID, UID: rule.NamespaceUID}
query.ResultFoldersTitles[key] = f.getNamespaceTitle(rule.NamespaceUID)
}
return nil