Alerting: Add rule_group label to grafana_alerting_rule_group_rules metric (#88289)
* Alerting: Add rule_group label to grafana_alerting_rule_group_rules metric (#62361)

* Alerting: Delete rule group metrics when the rule group is deleted

This commit addresses the issue where the GroupRules metric (a GaugeVec) keeps its value and is not deleted when an alert rule is removed from the rule registry.

Previously, when an alert rule with orgID=1 was active, the metric was:

    grafana_alerting_rule_group_rules{org="1",state="active"} 1

However, after deleting this rule, subsequent calls to updateRulesMetrics did not update the gauge value, causing the metric to incorrectly remain at 1.

The fix ensures that when updateRulesMetrics is called it also deletes the group rule metrics with the corresponding label values if needed.
This commit is contained in:
parent
71b56dbb95
commit
149f02aebe
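The behaviour described in the commit message can be reproduced with a plain Prometheus GaugeVec. The sketch below is standalone, hypothetical code (not the Grafana scheduler): it shows a series being left behind after its rule is gone, and how DeleteLabelValues removes it.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// A gauge vector with the same shape as the scheduler metric after this
	// change; names here are illustrative only.
	groupRules := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: "grafana_alerting_rule_group_rules",
		Help: "The number of alert rules that are scheduled, both active and paused.",
	}, []string{"org", "state", "rule_group"})

	reg := prometheus.NewRegistry()
	reg.MustRegister(groupRules)

	// First update: org 1 has one active rule in group "folder;my-group".
	groupRules.WithLabelValues("1", "active", "folder;my-group").Set(1)

	// If the rule is later deleted and updateRulesMetrics only ever calls Set
	// for rules that still exist, this series keeps reporting 1 forever. The
	// fix is to remember which label combinations were written last time and
	// delete the ones that are gone:
	groupRules.DeleteLabelValues("1", "active", "folder;my-group")

	mfs, _ := reg.Gather()
	fmt.Println("metric families left:", len(mfs)) // 0 once the only series is deleted
}
```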
@@ -121,7 +121,6 @@ func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler {
			},
			[]string{"org"},
		),
		// TODO: partition on rule group as well as tenant, similar to loki|cortex.
		GroupRules: promauto.With(r).NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace: Namespace,
@@ -129,7 +128,7 @@ func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler {
				Name:      "rule_group_rules",
				Help:      "The number of alert rules that are scheduled, both active and paused.",
			},
			[]string{"org", "state"},
			[]string{"org", "state", "rule_group"},
		),
		Groups: promauto.With(r).NewGaugeVec(
			prometheus.GaugeOpts{
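For illustration, a standalone sketch of the changed declaration, assuming the usual "grafana" namespace and "alerting" subsystem used by the metrics package: with the extra rule_group label, every writer and deleter must pass exactly three label values in declaration order, otherwise WithLabelValues panics at runtime.

```go
package main

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

func main() {
	r := prometheus.NewRegistry()

	// Same shape as the changed metric above; not the Grafana code itself.
	groupRules := promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "rule_group_rules",
		Help:      "The number of alert rules that are scheduled, both active and paused.",
	}, []string{"org", "state", "rule_group"})

	// Every writer now supplies all three values, in declaration order.
	groupRules.WithLabelValues("1", "active", "my-folder;my-group").Set(2)
	groupRules.WithLabelValues("1", "paused", "my-folder;my-group").Set(0)

	// The old two-value form would panic at runtime:
	// groupRules.WithLabelValues("1", "active") // inconsistent label cardinality
}
```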
@@ -467,6 +467,11 @@ type AlertRuleGroupKey struct {
	RuleGroup    string
}

type AlertRuleGroupKeyWithFolderFullpath struct {
	AlertRuleGroupKey
	FolderFullpath string
}

func (k AlertRuleGroupKey) String() string {
	return fmt.Sprintf("{orgID: %d, namespaceUID: %s, groupName: %s}", k.OrgID, k.NamespaceUID, k.RuleGroup)
}
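An illustrative, simplified version of the new key type (stand-in definitions, not the real models package): because every field is comparable, the struct can be used directly as a map key, which is what the per-folder/group counting in updateRulesMetrics below relies on.

```go
package main

import "fmt"

// Simplified stand-ins for the Grafana models types; field names follow the
// diff above, but this is an illustration only.
type AlertRuleGroupKey struct {
	OrgID        int64
	NamespaceUID string
	RuleGroup    string
}

type AlertRuleGroupKeyWithFolderFullpath struct {
	AlertRuleGroupKey
	FolderFullpath string
}

func main() {
	// A struct of comparable fields can index a map directly.
	counts := map[AlertRuleGroupKeyWithFolderFullpath]int64{}

	key := AlertRuleGroupKeyWithFolderFullpath{
		AlertRuleGroupKey: AlertRuleGroupKey{OrgID: 1, NamespaceUID: "ns-1", RuleGroup: "group-a"},
		FolderFullpath:    "Folder A/Sub",
	}
	counts[key]++
	counts[key]++

	fmt.Println(counts[key]) // 2
}
```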
@@ -32,16 +32,32 @@ func sortedUIDs(alertRules []*models.AlertRule) []string {
	return uids
}

// updateRulesMetrics updates metrics for alert rules.
// Keeps a state in the schedule between calls to delete metrics for rules that are no longer present.
func (sch *schedule) updateRulesMetrics(alertRules []*models.AlertRule) {
	rulesPerOrg := make(map[int64]int64)                // orgID -> count
	orgsPaused := make(map[int64]int64)                 // orgID -> count
	orgsNfSettings := make(map[int64]int64)             // orgID -> count
	groupsPerOrg := make(map[int64]map[string]struct{}) // orgID -> set of groups
	rulesPerOrgFolderGroup := make(map[models.AlertRuleGroupKeyWithFolderFullpath]int64)       // AlertRuleGroupKeyWithFolderFullpath -> count
	rulesPerOrgFolderGroupPaused := make(map[models.AlertRuleGroupKeyWithFolderFullpath]int64) // AlertRuleGroupKeyWithFolderFullpath -> count
	orgsNfSettings := make(map[int64]int64)             // orgID -> count
	groupsPerOrg := make(map[int64]map[string]struct{}) // orgID -> set of groups

	// Remember what orgs and alert groups we process in the current update metrics call,
	// so we can delete metrics for orgs and groups that are no longer present in the new state.
	updateMetricsForOrgsAndGroups := map[int64]map[models.AlertRuleGroupKeyWithFolderFullpath]struct{}{} // orgID -> set of AlertRuleGroupKeyWithFolderFullpath

	for _, rule := range alertRules {
		rulesPerOrg[rule.OrgID]++
		key := models.AlertRuleGroupKeyWithFolderFullpath{
			AlertRuleGroupKey: rule.GetGroupKey(),
			FolderFullpath:    sch.schedulableAlertRules.folderTitles[rule.GetFolderKey()],
		}
		rulesPerOrgFolderGroup[key]++

		if _, ok := updateMetricsForOrgsAndGroups[rule.OrgID]; !ok {
			updateMetricsForOrgsAndGroups[rule.OrgID] = make(map[models.AlertRuleGroupKeyWithFolderFullpath]struct{})
		}
		updateMetricsForOrgsAndGroups[rule.OrgID][key] = struct{}{}

		if rule.IsPaused {
			orgsPaused[rule.OrgID]++
			rulesPerOrgFolderGroupPaused[key]++
		}

		if len(rule.NotificationSettings) > 0 {
@@ -56,20 +72,59 @@ func (sch *schedule) updateRulesMetrics(alertRules []*models.AlertRule) {
		orgGroups[rule.RuleGroup] = struct{}{}
	}

	for orgID, numRules := range rulesPerOrg {
		numRulesPaused := orgsPaused[orgID]
		numRulesNfSettings := orgsNfSettings[orgID]
		sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(orgID), metrics.AlertRuleActiveLabelValue).Set(float64(numRules - numRulesPaused))
		sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(orgID), metrics.AlertRulePausedLabelValue).Set(float64(numRulesPaused))
		sch.metrics.SimpleNotificationRules.WithLabelValues(fmt.Sprint(orgID)).Set(float64(numRulesNfSettings))
	for key, numRules := range rulesPerOrgFolderGroup {
		numRulesPaused := rulesPerOrgFolderGroupPaused[key]
		ruleGroupLabelValue := makeRuleGroupLabelValue(key)
		sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(key.OrgID), metrics.AlertRuleActiveLabelValue, ruleGroupLabelValue).Set(float64(numRules - numRulesPaused))
		sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(key.OrgID), metrics.AlertRulePausedLabelValue, ruleGroupLabelValue).Set(float64(numRulesPaused))
	}

	for orgID, groups := range groupsPerOrg {
		sch.metrics.Groups.WithLabelValues(fmt.Sprint(orgID)).Set(float64(len(groups)))
	for orgID := range updateMetricsForOrgsAndGroups {
		sch.metrics.SimpleNotificationRules.WithLabelValues(fmt.Sprint(orgID)).Set(float64(orgsNfSettings[orgID]))
		sch.metrics.Groups.WithLabelValues(fmt.Sprint(orgID)).Set(float64(len(groupsPerOrg[orgID])))
	}

	// While these are the rules that we iterate over, at the moment there's no 100% guarantee that they'll be
	// scheduled as rules could be removed before we get a chance to evaluate them.
	sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))
	sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(alertRules)))

	// Delete metrics for rule groups and orgs that are no longer present in the new state
	for orgID, alertRuleGroupsMap := range sch.lastUpdatedMetricsForOrgsAndGroups {
		if orgOrGroupDeleted(updateMetricsForOrgsAndGroups, orgID, nil) {
			sch.metrics.SimpleNotificationRules.DeleteLabelValues(fmt.Sprint(orgID))
			sch.metrics.Groups.DeleteLabelValues(fmt.Sprint(orgID))
		}

		for key := range alertRuleGroupsMap {
			if orgOrGroupDeleted(updateMetricsForOrgsAndGroups, orgID, &key) {
				ruleGroupLabelValue := makeRuleGroupLabelValue(key)
				sch.metrics.GroupRules.DeleteLabelValues(fmt.Sprint(key.AlertRuleGroupKey.OrgID), metrics.AlertRuleActiveLabelValue, ruleGroupLabelValue)
				sch.metrics.GroupRules.DeleteLabelValues(fmt.Sprint(key.AlertRuleGroupKey.OrgID), metrics.AlertRulePausedLabelValue, ruleGroupLabelValue)
			}
		}
	}

	// update the call state
	sch.lastUpdatedMetricsForOrgsAndGroups = updateMetricsForOrgsAndGroups
}

// makeRuleGroupLabelValue returns a string that can be used as a label (rule_group) value for alert rule group metrics.
func makeRuleGroupLabelValue(key models.AlertRuleGroupKeyWithFolderFullpath) string {
	return fmt.Sprintf("%s;%s", key.FolderFullpath, key.AlertRuleGroupKey.RuleGroup)
}

// orgOrGroupDeleted returns true if the org or group is no longer present in the new update metrics state.
func orgOrGroupDeleted(updateMetrics map[int64]map[models.AlertRuleGroupKeyWithFolderFullpath]struct{}, orgID int64, alertRuleGroupKey *models.AlertRuleGroupKeyWithFolderFullpath) bool {
	if _, ok := updateMetrics[orgID]; !ok {
		return true
	}

	if alertRuleGroupKey != nil {
		if _, ok := updateMetrics[orgID][*alertRuleGroupKey]; !ok {
			return true
		}
	}

	return false
}
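The deletion logic above follows a common remember-then-delete pattern for vector metrics: write the series for everything seen in this pass, then delete any series that was written last pass but not this one. A condensed, hypothetical sketch of that pattern with a single label (names are illustrative, not the scheduler's API):

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// updateGauge sets the gauge for every key in current and deletes series for
// keys that were present in previous but are gone now. It returns the set of
// keys it wrote, which becomes "previous" on the next call.
func updateGauge(g *prometheus.GaugeVec, previous map[string]struct{}, current map[string]float64) map[string]struct{} {
	seen := make(map[string]struct{}, len(current))
	for key, value := range current {
		g.WithLabelValues(key).Set(value)
		seen[key] = struct{}{}
	}
	for key := range previous {
		if _, ok := seen[key]; !ok {
			g.DeleteLabelValues(key) // drop the stale series
		}
	}
	return seen
}

func main() {
	g := prometheus.NewGaugeVec(prometheus.GaugeOpts{Name: "demo_rules"}, []string{"rule_group"})

	state := updateGauge(g, nil, map[string]float64{"group-a": 2, "group-b": 1})
	state = updateGauge(g, state, map[string]float64{"group-b": 1}) // group-a series is deleted

	fmt.Println(len(state)) // 1
}
```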
@@ -87,6 +87,10 @@ type schedule struct {
	featureToggles featuremgmt.FeatureToggles

	metrics *metrics.Scheduler
	// lastUpdatedMetricsForOrgsAndGroups contains AlertRuleGroupKeyWithFolderFullpaths that
	// were passed to updateRulesMetrics in the current tick. This is used to
	// delete metrics for the rules/groups that are no longer present.
	lastUpdatedMetricsForOrgsAndGroups map[int64]map[ngmodels.AlertRuleGroupKeyWithFolderFullpath]struct{} // orgID -> set of AlertRuleGroupKeyWithFolderFullpath

	alertsSender    AlertsSender
	minRuleInterval time.Duration
@@ -130,24 +134,25 @@ func NewScheduler(cfg SchedulerCfg, stateManager *state.Manager) *schedule {
	}

	sch := schedule{
		registry:              newRuleRegistry(),
		maxAttempts:           cfg.MaxAttempts,
		clock:                 cfg.C,
		baseInterval:          cfg.BaseInterval,
		log:                   cfg.Log,
		evaluatorFactory:      cfg.EvaluatorFactory,
		ruleStore:             cfg.RuleStore,
		metrics:               cfg.Metrics,
		appURL:                cfg.AppURL,
		disableGrafanaFolder:  cfg.DisableGrafanaFolder,
		jitterEvaluations:     cfg.JitterEvaluations,
		featureToggles:        cfg.FeatureToggles,
		stateManager:          stateManager,
		minRuleInterval:       cfg.MinRuleInterval,
		schedulableAlertRules: alertRulesRegistry{rules: make(map[ngmodels.AlertRuleKey]*ngmodels.AlertRule)},
		alertsSender:          cfg.AlertSender,
		tracer:                cfg.Tracer,
		recordingWriter:       cfg.RecordingWriter,
		registry:                           newRuleRegistry(),
		maxAttempts:                        cfg.MaxAttempts,
		clock:                              cfg.C,
		baseInterval:                       cfg.BaseInterval,
		log:                                cfg.Log,
		evaluatorFactory:                   cfg.EvaluatorFactory,
		ruleStore:                          cfg.RuleStore,
		metrics:                            cfg.Metrics,
		lastUpdatedMetricsForOrgsAndGroups: make(map[int64]map[ngmodels.AlertRuleGroupKeyWithFolderFullpath]struct{}),
		appURL:                             cfg.AppURL,
		disableGrafanaFolder:               cfg.DisableGrafanaFolder,
		jitterEvaluations:                  cfg.JitterEvaluations,
		featureToggles:                     cfg.FeatureToggles,
		stateManager:                       stateManager,
		minRuleInterval:                    cfg.MinRuleInterval,
		schedulableAlertRules:              alertRulesRegistry{rules: make(map[ngmodels.AlertRuleKey]*ngmodels.AlertRule)},
		alertsSender:                       cfg.AlertSender,
		tracer:                             cfg.Tracer,
		recordingWriter:                    cfg.RecordingWriter,
	}

	return &sch
@@ -107,6 +107,8 @@ func TestProcessTicks(t *testing.T) {
	alertRule1 := gen.With(gen.WithOrgID(mainOrgID), gen.WithInterval(cfg.BaseInterval), gen.WithTitle("rule-1")).GenerateRef()
	ruleStore.PutRule(ctx, alertRule1)

	folderWithRuleGroup1 := fmt.Sprintf("%s;%s", ruleStore.getNamespaceTitle(alertRule1.NamespaceUID), alertRule1.RuleGroup)

	t.Run("on 1st tick alert rule should be evaluated", func(t *testing.T) {
		tick = tick.Add(cfg.BaseInterval)

@@ -124,9 +126,9 @@ func TestProcessTicks(t *testing.T) {
		expectedMetric := fmt.Sprintf(
			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
`, alertRule1.OrgID)
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="paused"} 0
`, alertRule1.OrgID, folderWithRuleGroup1)

		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
		require.NoError(t, err)
@@ -136,6 +138,8 @@ func TestProcessTicks(t *testing.T) {
	alertRule2 := gen.With(gen.WithOrgID(mainOrgID), gen.WithInterval(3*cfg.BaseInterval), gen.WithTitle("rule-2")).GenerateRef()
	ruleStore.PutRule(ctx, alertRule2)

	folderWithRuleGroup2 := fmt.Sprintf("%s;%s", ruleStore.getNamespaceTitle(alertRule2.NamespaceUID), alertRule2.RuleGroup)

	t.Run("on 2nd tick first alert rule should be evaluated", func(t *testing.T) {
		tick = tick.Add(cfg.BaseInterval)
		scheduled, stopped, updated := sched.processTick(ctx, dispatcherGroup, tick)
@@ -148,13 +152,15 @@ func TestProcessTicks(t *testing.T) {
		assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
	})

	t.Run("after 2nd tick rule metrics should report two active alert rules", func(t *testing.T) {
	t.Run("after 2nd tick rule metrics should report two active alert rules in two groups", func(t *testing.T) {
		expectedMetric := fmt.Sprintf(
			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 2
grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
`, alertRule1.OrgID)
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="paused"} 0
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="paused"} 0
`, alertRule1.OrgID, folderWithRuleGroup1, folderWithRuleGroup2)

		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
		require.NoError(t, err)
@@ -204,13 +210,15 @@ func TestProcessTicks(t *testing.T) {
		assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
	})

	t.Run("after 5th tick rule metrics should report one active and one paused alert rules", func(t *testing.T) {
	t.Run("after 5th tick rule metrics should report one active and one paused alert rules in two groups", func(t *testing.T) {
		expectedMetric := fmt.Sprintf(
			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 1
`, alertRule1.OrgID)
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active"} 0
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="paused"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="paused"} 0
`, alertRule1.OrgID, folderWithRuleGroup1, folderWithRuleGroup2)

		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
		require.NoError(t, err)
@@ -237,13 +245,15 @@ func TestProcessTicks(t *testing.T) {
		assertEvalRun(t, evalAppliedCh, tick, keys...)
	})

	t.Run("after 6th tick rule metrics should report two paused alert rules", func(t *testing.T) {
	t.Run("after 6th tick rule metrics should report two paused alert rules in two groups", func(t *testing.T) {
		expectedMetric := fmt.Sprintf(
			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 0
grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 2
`, alertRule1.OrgID)
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active"} 0
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="paused"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="active"} 0
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="paused"} 1
`, alertRule1.OrgID, folderWithRuleGroup1, folderWithRuleGroup2)

		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
		require.NoError(t, err)
@@ -265,13 +275,15 @@ func TestProcessTicks(t *testing.T) {
		assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
	})

	t.Run("after 7th tick rule metrics should report two active alert rules", func(t *testing.T) {
	t.Run("after 7th tick rule metrics should report two active alert rules in two groups", func(t *testing.T) {
		expectedMetric := fmt.Sprintf(
			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 2
grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
`, alertRule1.OrgID)
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="paused"} 0
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="paused"} 0
`, alertRule1.OrgID, folderWithRuleGroup1, folderWithRuleGroup2)

		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
		require.NoError(t, err)
@@ -295,10 +307,10 @@ func TestProcessTicks(t *testing.T) {
	t.Run("after 8th tick rule metrics should report one active alert rule", func(t *testing.T) {
		expectedMetric := fmt.Sprintf(
			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
`, alertRule1.OrgID)
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active"} 1
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="paused"} 0
`, alertRule2.OrgID, folderWithRuleGroup2)

		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
		require.NoError(t, err)
@@ -398,6 +410,299 @@ func TestProcessTicks(t *testing.T) {
	})
}

func TestSchedule_updateRulesMetrics(t *testing.T) {
	ruleStore := newFakeRulesStore()
	reg := prometheus.NewPedanticRegistry()
	sch := setupScheduler(t, ruleStore, nil, reg, nil, nil)
	ctx := context.Background()
	const firstOrgID int64 = 1

t.Run("grafana_alerting_rule_group_rules metric should reflect the current state", func(t *testing.T) {
|
||||
// Without any rules there are no metrics
|
||||
t.Run("it should not show metrics", func(t *testing.T) {
|
||||
sch.updateRulesMetrics([]*models.AlertRule{})
|
||||
|
||||
expectedMetric := ""
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
alertRule1 := models.RuleGen.With(models.RuleGen.WithOrgID(firstOrgID)).GenerateRef()
|
||||
folderWithRuleGroup1 := fmt.Sprintf("%s;%s", ruleStore.getNamespaceTitle(alertRule1.NamespaceUID), alertRule1.RuleGroup)
|
||||
ruleStore.PutRule(ctx, alertRule1)
|
||||
|
||||
_, err := sch.updateSchedulableAlertRules(ctx) // to update folderTitles
|
||||
require.NoError(t, err)
|
||||
|
||||
t.Run("it should show one active rule in a single group", func(t *testing.T) {
|
||||
sch.updateRulesMetrics([]*models.AlertRule{alertRule1})
|
||||
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
|
||||
# TYPE grafana_alerting_rule_group_rules gauge
|
||||
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active"} 1
|
||||
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="paused"} 0
|
||||
`, alertRule1.OrgID, folderWithRuleGroup1)
|
||||
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
// Add a new rule alertRule2 and check that it is reflected in the metrics
|
||||
alertRule2 := models.RuleGen.With(models.RuleGen.WithOrgID(firstOrgID)).GenerateRef()
|
||||
folderWithRuleGroup2 := fmt.Sprintf("%s;%s", ruleStore.getNamespaceTitle(alertRule2.NamespaceUID), alertRule2.RuleGroup)
|
||||
ruleStore.PutRule(ctx, alertRule2)
|
||||
|
||||
_, err = sch.updateSchedulableAlertRules(ctx) // to update folderTitles
|
||||
require.NoError(t, err)
|
||||
|
||||
t.Run("it should show two active rules in two groups", func(t *testing.T) {
|
||||
sch.updateRulesMetrics([]*models.AlertRule{alertRule1, alertRule2})
|
||||
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
|
||||
# TYPE grafana_alerting_rule_group_rules gauge
|
||||
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active"} 1
|
||||
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="paused"} 0
|
||||
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="active"} 1
|
||||
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="paused"} 0
|
||||
`, alertRule1.OrgID, folderWithRuleGroup1, folderWithRuleGroup2)
|
||||
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
// Now remove the alertRule2
|
||||
t.Run("it should show one active rules in one groups", func(t *testing.T) {
|
||||
sch.updateRulesMetrics([]*models.AlertRule{alertRule1, alertRule2})
|
||||
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
|
||||
# TYPE grafana_alerting_rule_group_rules gauge
|
||||
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="active"} 1
|
||||
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[2]s",state="paused"} 0
|
||||
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="active"} 1
|
||||
grafana_alerting_rule_group_rules{org="%[1]d",rule_group="%[3]s",state="paused"} 0
|
||||
`, alertRule1.OrgID, folderWithRuleGroup1, folderWithRuleGroup2)
|
||||
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
// and remove the alertRule1 so there should be no metrics now
|
||||
t.Run("it should show one active rules in one groups", func(t *testing.T) {
|
||||
sch.updateRulesMetrics([]*models.AlertRule{})
|
||||
|
||||
expectedMetric := ""
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
})
|
||||
|
||||
t.Run("rule_groups metric should reflect the current state", func(t *testing.T) {
|
||||
const firstOrgID int64 = 1
|
||||
const secondOrgID int64 = 2
|
||||
|
||||
// Without any rules there are no metrics
|
||||
t.Run("it should not show metrics", func(t *testing.T) {
|
||||
sch.updateRulesMetrics([]*models.AlertRule{})
|
||||
|
||||
expectedMetric := ""
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
alertRule1 := models.RuleGen.With(models.RuleGen.WithOrgID(firstOrgID)).GenerateRef()
|
||||
|
||||
t.Run("it should show one rule group in a single org", func(t *testing.T) {
|
||||
sch.updateRulesMetrics([]*models.AlertRule{alertRule1})
|
||||
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_rule_groups The number of alert rule groups
|
||||
# TYPE grafana_alerting_rule_groups gauge
|
||||
grafana_alerting_rule_groups{org="%[1]d"} 1
|
||||
`, alertRule1.OrgID)
|
||||
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
alertRule2 := models.RuleGen.With(models.RuleGen.WithOrgID(secondOrgID)).GenerateRef()
|
||||
|
||||
t.Run("it should show two rule groups in two orgs", func(t *testing.T) {
|
||||
sch.updateRulesMetrics([]*models.AlertRule{alertRule1, alertRule2})
|
||||
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_rule_groups The number of alert rule groups
|
||||
# TYPE grafana_alerting_rule_groups gauge
|
||||
grafana_alerting_rule_groups{org="%[1]d"} 1
|
||||
grafana_alerting_rule_groups{org="%[2]d"} 1
|
||||
`, alertRule1.OrgID, alertRule2.OrgID)
|
||||
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("when the first rule is removed it should show one rule group", func(t *testing.T) {
|
||||
sch.updateRulesMetrics([]*models.AlertRule{alertRule2})
|
||||
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_rule_groups The number of alert rule groups
|
||||
# TYPE grafana_alerting_rule_groups gauge
|
||||
grafana_alerting_rule_groups{org="%[1]d"} 1
|
||||
`, alertRule2.OrgID)
|
||||
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
})
|
||||
|
||||
t.Run("simple_routing_rules metric should reflect the current state", func(t *testing.T) {
|
||||
const firstOrgID int64 = 1
|
||||
const secondOrgID int64 = 2
|
||||
|
||||
// Has no NotificationSettings, should not be in the metrics
|
||||
alertRuleWithoutNotificationSettings := models.RuleGen.With(
|
||||
models.RuleGen.WithOrgID(firstOrgID),
|
||||
models.RuleGen.WithNoNotificationSettings(),
|
||||
).GenerateRef()
|
||||
|
||||
// Without any rules there are no metrics
|
||||
t.Run("it should not show metrics", func(t *testing.T) {
|
||||
sch.updateRulesMetrics([]*models.AlertRule{alertRuleWithoutNotificationSettings})
|
||||
|
||||
// Because alertRuleWithoutNotificationSettings.orgID is present,
|
||||
// the metric is also present but set to 0 because the org has no rules with NotificationSettings.
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_simple_routing_rules The number of alert rules using simplified routing.
|
||||
# TYPE grafana_alerting_simple_routing_rules gauge
|
||||
grafana_alerting_simple_routing_rules{org="%[1]d"} 0
|
||||
`, alertRuleWithoutNotificationSettings.OrgID)
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_simple_routing_rules")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
alertRule1 := models.RuleGen.With(
|
||||
models.RuleGen.WithOrgID(firstOrgID),
|
||||
models.RuleGen.WithNotificationSettingsGen(models.NotificationSettingsGen()),
|
||||
).GenerateRef()
|
||||
|
||||
t.Run("it should show one rule in a single org", func(t *testing.T) {
|
||||
sch.updateRulesMetrics([]*models.AlertRule{alertRuleWithoutNotificationSettings, alertRule1})
|
||||
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_simple_routing_rules The number of alert rules using simplified routing.
|
||||
# TYPE grafana_alerting_simple_routing_rules gauge
|
||||
grafana_alerting_simple_routing_rules{org="%[1]d"} 1
|
||||
`, alertRule1.OrgID)
|
||||
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_simple_routing_rules")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
alertRule2 := models.RuleGen.With(
|
||||
models.RuleGen.WithOrgID(secondOrgID),
|
||||
models.RuleGen.WithNotificationSettingsGen(models.NotificationSettingsGen()),
|
||||
).GenerateRef()
|
||||
|
||||
t.Run("it should show two rules in two orgs", func(t *testing.T) {
|
||||
sch.updateRulesMetrics([]*models.AlertRule{alertRuleWithoutNotificationSettings, alertRule1, alertRule2})
|
||||
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_simple_routing_rules The number of alert rules using simplified routing.
|
||||
# TYPE grafana_alerting_simple_routing_rules gauge
|
||||
grafana_alerting_simple_routing_rules{org="%[1]d"} 1
|
||||
grafana_alerting_simple_routing_rules{org="%[2]d"} 1
|
||||
`, alertRule1.OrgID, alertRule2.OrgID)
|
||||
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_simple_routing_rules")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("after removing one of the rules it should show one present rule and two org", func(t *testing.T) {
|
||||
sch.updateRulesMetrics([]*models.AlertRule{alertRuleWithoutNotificationSettings, alertRule2})
|
||||
|
||||
// Because alertRuleWithoutNotificationSettings.orgID is present,
|
||||
// the metric is also present but set to 0 because the org has no rules with NotificationSettings.
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_simple_routing_rules The number of alert rules using simplified routing.
|
||||
# TYPE grafana_alerting_simple_routing_rules gauge
|
||||
grafana_alerting_simple_routing_rules{org="%[1]d"} 0
|
||||
grafana_alerting_simple_routing_rules{org="%[2]d"} 1
|
||||
`, alertRuleWithoutNotificationSettings.OrgID, alertRule2.OrgID)
|
||||
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_simple_routing_rules")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("after removing all rules it should not show any metrics", func(t *testing.T) {
|
||||
sch.updateRulesMetrics([]*models.AlertRule{})
|
||||
|
||||
expectedMetric := ""
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_simple_routing_rules")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
})
|
||||
|
||||
t.Run("rule_groups metric should reflect the current state", func(t *testing.T) {
|
||||
const firstOrgID int64 = 1
|
||||
const secondOrgID int64 = 2
|
||||
|
||||
// Without any rules there are no metrics
|
||||
t.Run("it should not show metrics", func(t *testing.T) {
|
||||
sch.updateRulesMetrics([]*models.AlertRule{})
|
||||
|
||||
expectedMetric := ""
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
alertRule1 := models.RuleGen.With(models.RuleGen.WithOrgID(firstOrgID)).GenerateRef()
|
||||
|
||||
t.Run("it should show one rule group in a single org", func(t *testing.T) {
|
||||
sch.updateRulesMetrics([]*models.AlertRule{alertRule1})
|
||||
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_rule_groups The number of alert rule groups
|
||||
# TYPE grafana_alerting_rule_groups gauge
|
||||
grafana_alerting_rule_groups{org="%[1]d"} 1
|
||||
`, alertRule1.OrgID)
|
||||
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
alertRule2 := models.RuleGen.With(models.RuleGen.WithOrgID(secondOrgID)).GenerateRef()
|
||||
|
||||
t.Run("it should show two rule groups in two orgs", func(t *testing.T) {
|
||||
sch.updateRulesMetrics([]*models.AlertRule{alertRule1, alertRule2})
|
||||
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_rule_groups The number of alert rule groups
|
||||
# TYPE grafana_alerting_rule_groups gauge
|
||||
grafana_alerting_rule_groups{org="%[1]d"} 1
|
||||
grafana_alerting_rule_groups{org="%[2]d"} 1
|
||||
`, alertRule1.OrgID, alertRule2.OrgID)
|
||||
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("when the first rule is removed it should show one rule group", func(t *testing.T) {
|
||||
sch.updateRulesMetrics([]*models.AlertRule{alertRule2})
|
||||
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_rule_groups The number of alert rule groups
|
||||
# TYPE grafana_alerting_rule_groups gauge
|
||||
grafana_alerting_rule_groups{org="%[1]d"} 1
|
||||
`, alertRule2.OrgID)
|
||||
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_groups")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
func TestSchedule_deleteAlertRule(t *testing.T) {
|
||||
t.Run("when rule exists", func(t *testing.T) {
|
||||
t.Run("it should stop evaluation loop and remove the controller from registry", func(t *testing.T) {
|
||||
|
@@ -66,7 +66,7 @@ func (f *fakeRulesStore) GetAlertRulesForScheduling(ctx context.Context, query *
	query.ResultFoldersTitles = map[models.FolderKey]string{}
	for _, rule := range f.rules {
		query.ResultRules = append(query.ResultRules, rule)
		key := models.FolderKey{OrgID: rule.OrgID, UID: rule.UID}
		key := models.FolderKey{OrgID: rule.OrgID, UID: rule.NamespaceUID}
		query.ResultFoldersTitles[key] = f.getNamespaceTitle(rule.NamespaceUID)
	}
	return nil
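The one-line fix above matters because folder titles are looked up by the folder's UID. A small illustrative sketch (stand-in types, not the real models package) of why keying the fake store by rule.UID made the later lookup by the rule's folder key miss:

```go
package main

import "fmt"

// Illustrative stand-ins; the real definitions live in the Grafana models package.
type FolderKey struct {
	OrgID int64
	UID   string
}

type AlertRule struct {
	OrgID        int64
	UID          string // identifies the rule itself
	NamespaceUID string // identifies the folder (namespace) the rule lives in
}

func main() {
	rule := AlertRule{OrgID: 1, UID: "rule-uid", NamespaceUID: "folder-uid"}

	// The fake store must key folder titles the same way the scheduler looks
	// them up, i.e. by the folder's UID, not the rule's UID.
	titles := map[FolderKey]string{
		{OrgID: rule.OrgID, UID: rule.NamespaceUID}: "My Folder",
	}

	fmt.Println(titles[FolderKey{OrgID: 1, UID: "folder-uid"}]) // "My Folder"
	fmt.Println(titles[FolderKey{OrgID: 1, UID: "rule-uid"}])   // "" -- the miss fixed above
}
```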