Alerting: Add status label to GroupRules metric (#63454)

* Add status label to GroupRules metric

* Add state (active and paused) label to GroupRules

* Add active/paused metrics tests
Alex Moreno 2023-02-23 12:38:27 +01:00 committed by GitHub
parent f3714099e7
commit f60dc4441f
3 changed files with 120 additions and 17 deletions


@@ -7,6 +7,11 @@ import (
 	"github.com/grafana/grafana/pkg/util/ticker"
 )
+const (
+	AlertRuleActiveLabelValue = "active"
+	AlertRulePausedLabelValue = "paused"
+)
 type Scheduler struct {
 	Registerer prometheus.Registerer
 	BehindSeconds prometheus.Gauge
@@ -69,9 +74,9 @@ func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler {
 				Namespace: Namespace,
 				Subsystem: Subsystem,
 				Name: "rule_group_rules",
-				Help: "The number of rules.",
+				Help: "The number of alert rules that are scheduled, both active and paused.",
 			},
-			[]string{"org"},
+			[]string{"org", "state"},
 		),
 		SchedulePeriodicDuration: promauto.With(r).NewHistogram(
 			prometheus.HistogramOpts{
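
Note: the hunk above turns grafana_alerting_rule_group_rules into a gauge vector labelled by both org and state, so each org now exposes one series per state. A minimal, self-contained sketch of the same GaugeVec pattern follows; it uses the prometheus/client_golang and promauto packages, and the registry, label values, and printed output are illustrative rather than the scheduler's own wiring.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

func main() {
	// Illustrative registry; the scheduler is handed its own Registerer.
	reg := prometheus.NewRegistry()

	groupRules := promauto.With(reg).NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "grafana",
			Subsystem: "alerting",
			Name:      "rule_group_rules",
			Help:      "The number of alert rules that are scheduled, both active and paused.",
		},
		[]string{"org", "state"},
	)

	// One time series per (org, state) pair; writing both states keeps an
	// explicit 0 for orgs that currently have no paused rules.
	groupRules.WithLabelValues("1", "active").Set(2)
	groupRules.WithLabelValues("1", "paused").Set(0)

	mfs, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range mfs {
		fmt.Println(mf.GetName(), "series:", len(mf.GetMetric())) // grafana_alerting_rule_group_rules series: 2
	}
}

Writing both series on every update is what the tests further down rely on when they assert an explicit paused value of 0.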


@@ -209,12 +209,19 @@ type readyToRunItem struct {
 }
 func (sch *schedule) updateRulesMetrics(alertRules []*ngmodels.AlertRule) {
-	orgs := make(map[int64]int64)
+	orgs := make(map[int64]int64, len(alertRules))
+	orgsPaused := make(map[int64]int64, len(alertRules))
 	for _, rule := range alertRules {
 		orgs[rule.OrgID]++
+		if rule.IsPaused {
+			orgsPaused[rule.OrgID]++
+		}
 	}
-	for org, numRules := range orgs {
-		sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(org)).Set(float64(numRules))
+	for orgID, numRules := range orgs {
+		numRulesPaused := orgsPaused[orgID]
+		sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(orgID), metrics.AlertRuleActiveLabelValue).Set(float64(numRules - numRulesPaused))
+		sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(orgID), metrics.AlertRulePausedLabelValue).Set(float64(numRulesPaused))
 	}
 	// While these are the rules that we iterate over, at the moment there's no 100% guarantee that they'll be
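
For readers skimming the hunk above, the counting logic is: total rules per org, paused rules per org, and active derived as total minus paused before both gauges are set. A stand-alone sketch of just that derivation is below; the rule struct is a stand-in for ngmodels.AlertRule, not the real type.

package main

import "fmt"

// rule models only the two fields the metric cares about.
type rule struct {
	OrgID    int64
	IsPaused bool
}

// countByState mirrors the bookkeeping in updateRulesMetrics: total and
// paused counts per org, with active derived as total minus paused.
func countByState(rules []rule) (active, paused map[int64]int64) {
	total := make(map[int64]int64, len(rules))
	paused = make(map[int64]int64, len(rules))
	for _, r := range rules {
		total[r.OrgID]++
		if r.IsPaused {
			paused[r.OrgID]++
		}
	}
	active = make(map[int64]int64, len(total))
	for orgID, n := range total {
		active[orgID] = n - paused[orgID] // a missing key reads as 0
	}
	return active, paused
}

func main() {
	rules := []rule{{OrgID: 1}, {OrgID: 1, IsPaused: true}, {OrgID: 2}}
	active, paused := countByState(rules)
	fmt.Println(active, paused) // map[1:1 2:1] map[1:1]
}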


@@ -114,11 +114,12 @@ func TestProcessTicks(t *testing.T) {
 		assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
 	})
-	t.Run("after 1st tick rule metrics should report one rule", func(t *testing.T) {
+	t.Run("after 1st tick rule metrics should report one active alert rule", func(t *testing.T) {
 		expectedMetric := fmt.Sprintf(
-			`# HELP grafana_alerting_rule_group_rules The number of rules.
+			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
 # TYPE grafana_alerting_rule_group_rules gauge
-grafana_alerting_rule_group_rules{org="%[1]d"} 1
+grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 1
+grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
 `, alertRule1.OrgID)
 		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
@@ -140,11 +141,12 @@ func TestProcessTicks(t *testing.T) {
 		assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
 	})
-	t.Run("after 2nd tick rule metrics should report two rules", func(t *testing.T) {
+	t.Run("after 2nd tick rule metrics should report two active alert rules", func(t *testing.T) {
 		expectedMetric := fmt.Sprintf(
-			`# HELP grafana_alerting_rule_group_rules The number of rules.
+			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
 # TYPE grafana_alerting_rule_group_rules gauge
-grafana_alerting_rule_group_rules{org="%[1]d"} 2
+grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 2
+grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
 `, alertRule1.OrgID)
 		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
@@ -180,7 +182,95 @@ func TestProcessTicks(t *testing.T) {
 		assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
 	})
-	t.Run("on 5th tick deleted rule should not be evaluated but stopped", func(t *testing.T) {
+	t.Run("on 5th tick an alert rule is paused (it still enters evaluation but it is early skipped)", func(t *testing.T) {
+		tick = tick.Add(cfg.BaseInterval)
+		alertRule1.IsPaused = true
+		scheduled, stopped := sched.processTick(ctx, dispatcherGroup, tick)
+		require.Len(t, scheduled, 1)
+		require.Equal(t, alertRule1, scheduled[0].rule)
+		require.Equal(t, tick, scheduled[0].scheduledAt)
+		require.Emptyf(t, stopped, "None rules are expected to be stopped")
+		assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
+	})
+	t.Run("after 5th tick rule metrics should report one active and one paused alert rules", func(t *testing.T) {
+		expectedMetric := fmt.Sprintf(
+			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
+# TYPE grafana_alerting_rule_group_rules gauge
+grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 1
+grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 1
+`, alertRule1.OrgID)
+		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
+		require.NoError(t, err)
+	})
+	t.Run("on 6th tick all alert rule are paused (it still enters evaluation but it is early skipped)", func(t *testing.T) {
+		tick = tick.Add(cfg.BaseInterval)
+		alertRule2.IsPaused = true
+		scheduled, stopped := sched.processTick(ctx, dispatcherGroup, tick)
+		require.Len(t, scheduled, 2)
+		var keys []models.AlertRuleKey
+		for _, item := range scheduled {
+			keys = append(keys, item.rule.GetKey())
+			require.Equal(t, tick, item.scheduledAt)
+		}
+		require.Contains(t, keys, alertRule1.GetKey())
+		require.Contains(t, keys, alertRule2.GetKey())
+		require.Emptyf(t, stopped, "None rules are expected to be stopped")
+		assertEvalRun(t, evalAppliedCh, tick, keys...)
+	})
+	t.Run("after 6th tick rule metrics should report two paused alert rules", func(t *testing.T) {
+		expectedMetric := fmt.Sprintf(
+			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
+# TYPE grafana_alerting_rule_group_rules gauge
+grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 0
+grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 2
+`, alertRule1.OrgID)
+		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
+		require.NoError(t, err)
+	})
+	t.Run("on 7th tick unpause all alert rules", func(t *testing.T) {
+		tick = tick.Add(cfg.BaseInterval)
+		alertRule1.IsPaused = false
+		alertRule2.IsPaused = false
+		scheduled, stopped := sched.processTick(ctx, dispatcherGroup, tick)
+		require.Len(t, scheduled, 1)
+		require.Equal(t, alertRule1, scheduled[0].rule)
+		require.Equal(t, tick, scheduled[0].scheduledAt)
+		require.Emptyf(t, stopped, "None rules are expected to be stopped")
+		assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
+	})
+	t.Run("after 7th tick rule metrics should report two active alert rules", func(t *testing.T) {
+		expectedMetric := fmt.Sprintf(
+			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
+# TYPE grafana_alerting_rule_group_rules gauge
+grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 2
+grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
+`, alertRule1.OrgID)
+		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
+		require.NoError(t, err)
+	})
+	t.Run("on 8th tick deleted rule should not be evaluated but stopped", func(t *testing.T) {
 		tick = tick.Add(cfg.BaseInterval)
 		ruleStore.DeleteRule(alertRule1)
@@ -195,18 +285,19 @@ func TestProcessTicks(t *testing.T) {
 		assertStopRun(t, stopAppliedCh, alertRule1.GetKey())
 	})
-	t.Run("after 5th tick rule metrics should report one rules", func(t *testing.T) {
+	t.Run("after 8th tick rule metrics should report one active alert rule", func(t *testing.T) {
 		expectedMetric := fmt.Sprintf(
-			`# HELP grafana_alerting_rule_group_rules The number of rules.
+			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
 # TYPE grafana_alerting_rule_group_rules gauge
-grafana_alerting_rule_group_rules{org="%[1]d"} 1
+grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 1
+grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
 `, alertRule1.OrgID)
 		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
 		require.NoError(t, err)
 	})
-	t.Run("on 6th tick one alert rule should be evaluated", func(t *testing.T) {
+	t.Run("on 9th tick one alert rule should be evaluated", func(t *testing.T) {
 		tick = tick.Add(cfg.BaseInterval)
 		scheduled, stopped := sched.processTick(ctx, dispatcherGroup, tick)
@@ -219,7 +310,7 @@ func TestProcessTicks(t *testing.T) {
 		assertEvalRun(t, evalAppliedCh, tick, alertRule2.GetKey())
 	})
-	t.Run("on 7th tick a new alert rule should be evaluated", func(t *testing.T) {
+	t.Run("on 10th tick a new alert rule should be evaluated", func(t *testing.T) {
 		// create alert rule with one base interval
 		alertRule3 := models.AlertRuleGen(models.WithOrgID(mainOrgID), models.WithInterval(cfg.BaseInterval), models.WithTitle("rule-3"))()
 		ruleStore.PutRule(ctx, alertRule3)
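
All of the metric assertions in this test file follow one pattern: gather the registry and compare its text exposition against an expected block, scoped to a single metric name. A stripped-down sketch of that pattern outside the scheduler's test harness is below (a plain main instead of testing.T; the registry and values are illustrative).

package main

import (
	"bytes"
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	reg := prometheus.NewRegistry()
	g := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: "grafana_alerting_rule_group_rules",
		Help: "The number of alert rules that are scheduled, both active and paused.",
	}, []string{"org", "state"})
	reg.MustRegister(g)

	g.WithLabelValues("1", "active").Set(1)
	g.WithLabelValues("1", "paused").Set(0)

	expected := `# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="1",state="active"} 1
grafana_alerting_rule_group_rules{org="1",state="paused"} 0
`

	// GatherAndCompare only checks the named metric, so other metrics in the
	// registry do not affect the comparison.
	err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expected), "grafana_alerting_rule_group_rules")
	fmt.Println("match:", err == nil) // match: true
}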