mirror of
https://github.com/grafana/grafana.git
synced 2024-11-25 18:30:41 -06:00
Alerting: Add status label to GroupRules metric (#63454)
* Add status label to GroupRules metric * Add state (active and paused) label to GrouRules * Add active/paused metrics tests
This commit is contained in:
parent
f3714099e7
commit
f60dc4441f
@ -7,6 +7,11 @@ import (
|
||||
"github.com/grafana/grafana/pkg/util/ticker"
|
||||
)
|
||||
|
||||
const (
|
||||
AlertRuleActiveLabelValue = "active"
|
||||
AlertRulePausedLabelValue = "paused"
|
||||
)
|
||||
|
||||
type Scheduler struct {
|
||||
Registerer prometheus.Registerer
|
||||
BehindSeconds prometheus.Gauge
|
||||
@ -69,9 +74,9 @@ func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler {
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "rule_group_rules",
|
||||
Help: "The number of rules.",
|
||||
Help: "The number of alert rules that are scheduled, both active and paused.",
|
||||
},
|
||||
[]string{"org"},
|
||||
[]string{"org", "state"},
|
||||
),
|
||||
SchedulePeriodicDuration: promauto.With(r).NewHistogram(
|
||||
prometheus.HistogramOpts{
|
||||
|
@ -209,12 +209,19 @@ type readyToRunItem struct {
|
||||
}
|
||||
|
||||
func (sch *schedule) updateRulesMetrics(alertRules []*ngmodels.AlertRule) {
|
||||
orgs := make(map[int64]int64)
|
||||
orgs := make(map[int64]int64, len(alertRules))
|
||||
orgsPaused := make(map[int64]int64, len(alertRules))
|
||||
for _, rule := range alertRules {
|
||||
orgs[rule.OrgID]++
|
||||
if rule.IsPaused {
|
||||
orgsPaused[rule.OrgID]++
|
||||
}
|
||||
}
|
||||
for org, numRules := range orgs {
|
||||
sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(org)).Set(float64(numRules))
|
||||
|
||||
for orgID, numRules := range orgs {
|
||||
numRulesPaused := orgsPaused[orgID]
|
||||
sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(orgID), metrics.AlertRuleActiveLabelValue).Set(float64(numRules - numRulesPaused))
|
||||
sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(orgID), metrics.AlertRulePausedLabelValue).Set(float64(numRulesPaused))
|
||||
}
|
||||
|
||||
// While these are the rules that we iterate over, at the moment there's no 100% guarantee that they'll be
|
||||
|
@ -114,11 +114,12 @@ func TestProcessTicks(t *testing.T) {
|
||||
assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
|
||||
})
|
||||
|
||||
t.Run("after 1st tick rule metrics should report one rule", func(t *testing.T) {
|
||||
t.Run("after 1st tick rule metrics should report one active alert rule", func(t *testing.T) {
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_rule_group_rules The number of rules.
|
||||
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
|
||||
# TYPE grafana_alerting_rule_group_rules gauge
|
||||
grafana_alerting_rule_group_rules{org="%[1]d"} 1
|
||||
grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 1
|
||||
grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
|
||||
`, alertRule1.OrgID)
|
||||
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
|
||||
@ -140,11 +141,12 @@ func TestProcessTicks(t *testing.T) {
|
||||
assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
|
||||
})
|
||||
|
||||
t.Run("after 2nd tick rule metrics should report two rules", func(t *testing.T) {
|
||||
t.Run("after 2nd tick rule metrics should report two active alert rules", func(t *testing.T) {
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_rule_group_rules The number of rules.
|
||||
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
|
||||
# TYPE grafana_alerting_rule_group_rules gauge
|
||||
grafana_alerting_rule_group_rules{org="%[1]d"} 2
|
||||
grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 2
|
||||
grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
|
||||
`, alertRule1.OrgID)
|
||||
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
|
||||
@ -180,7 +182,95 @@ func TestProcessTicks(t *testing.T) {
|
||||
assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
|
||||
})
|
||||
|
||||
t.Run("on 5th tick deleted rule should not be evaluated but stopped", func(t *testing.T) {
|
||||
t.Run("on 5th tick an alert rule is paused (it still enters evaluation but it is early skipped)", func(t *testing.T) {
|
||||
tick = tick.Add(cfg.BaseInterval)
|
||||
|
||||
alertRule1.IsPaused = true
|
||||
|
||||
scheduled, stopped := sched.processTick(ctx, dispatcherGroup, tick)
|
||||
|
||||
require.Len(t, scheduled, 1)
|
||||
require.Equal(t, alertRule1, scheduled[0].rule)
|
||||
require.Equal(t, tick, scheduled[0].scheduledAt)
|
||||
require.Emptyf(t, stopped, "None rules are expected to be stopped")
|
||||
|
||||
assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
|
||||
})
|
||||
|
||||
t.Run("after 5th tick rule metrics should report one active and one paused alert rules", func(t *testing.T) {
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
|
||||
# TYPE grafana_alerting_rule_group_rules gauge
|
||||
grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 1
|
||||
grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 1
|
||||
`, alertRule1.OrgID)
|
||||
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("on 6th tick all alert rule are paused (it still enters evaluation but it is early skipped)", func(t *testing.T) {
|
||||
tick = tick.Add(cfg.BaseInterval)
|
||||
|
||||
alertRule2.IsPaused = true
|
||||
|
||||
scheduled, stopped := sched.processTick(ctx, dispatcherGroup, tick)
|
||||
|
||||
require.Len(t, scheduled, 2)
|
||||
var keys []models.AlertRuleKey
|
||||
for _, item := range scheduled {
|
||||
keys = append(keys, item.rule.GetKey())
|
||||
require.Equal(t, tick, item.scheduledAt)
|
||||
}
|
||||
require.Contains(t, keys, alertRule1.GetKey())
|
||||
require.Contains(t, keys, alertRule2.GetKey())
|
||||
|
||||
require.Emptyf(t, stopped, "None rules are expected to be stopped")
|
||||
|
||||
assertEvalRun(t, evalAppliedCh, tick, keys...)
|
||||
})
|
||||
|
||||
t.Run("after 6th tick rule metrics should report two paused alert rules", func(t *testing.T) {
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
|
||||
# TYPE grafana_alerting_rule_group_rules gauge
|
||||
grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 0
|
||||
grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 2
|
||||
`, alertRule1.OrgID)
|
||||
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("on 7th tick unpause all alert rules", func(t *testing.T) {
|
||||
tick = tick.Add(cfg.BaseInterval)
|
||||
|
||||
alertRule1.IsPaused = false
|
||||
alertRule2.IsPaused = false
|
||||
|
||||
scheduled, stopped := sched.processTick(ctx, dispatcherGroup, tick)
|
||||
|
||||
require.Len(t, scheduled, 1)
|
||||
require.Equal(t, alertRule1, scheduled[0].rule)
|
||||
require.Equal(t, tick, scheduled[0].scheduledAt)
|
||||
require.Emptyf(t, stopped, "None rules are expected to be stopped")
|
||||
|
||||
assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
|
||||
})
|
||||
|
||||
t.Run("after 7th tick rule metrics should report two active alert rules", func(t *testing.T) {
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
|
||||
# TYPE grafana_alerting_rule_group_rules gauge
|
||||
grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 2
|
||||
grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
|
||||
`, alertRule1.OrgID)
|
||||
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("on 8th tick deleted rule should not be evaluated but stopped", func(t *testing.T) {
|
||||
tick = tick.Add(cfg.BaseInterval)
|
||||
|
||||
ruleStore.DeleteRule(alertRule1)
|
||||
@ -195,18 +285,19 @@ func TestProcessTicks(t *testing.T) {
|
||||
assertStopRun(t, stopAppliedCh, alertRule1.GetKey())
|
||||
})
|
||||
|
||||
t.Run("after 5th tick rule metrics should report one rules", func(t *testing.T) {
|
||||
t.Run("after 8th tick rule metrics should report one active alert rule", func(t *testing.T) {
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_rule_group_rules The number of rules.
|
||||
`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
|
||||
# TYPE grafana_alerting_rule_group_rules gauge
|
||||
grafana_alerting_rule_group_rules{org="%[1]d"} 1
|
||||
grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 1
|
||||
grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
|
||||
`, alertRule1.OrgID)
|
||||
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("on 6th tick one alert rule should be evaluated", func(t *testing.T) {
|
||||
t.Run("on 9th tick one alert rule should be evaluated", func(t *testing.T) {
|
||||
tick = tick.Add(cfg.BaseInterval)
|
||||
|
||||
scheduled, stopped := sched.processTick(ctx, dispatcherGroup, tick)
|
||||
@ -219,7 +310,7 @@ func TestProcessTicks(t *testing.T) {
|
||||
assertEvalRun(t, evalAppliedCh, tick, alertRule2.GetKey())
|
||||
})
|
||||
|
||||
t.Run("on 7th tick a new alert rule should be evaluated", func(t *testing.T) {
|
||||
t.Run("on 10th tick a new alert rule should be evaluated", func(t *testing.T) {
|
||||
// create alert rule with one base interval
|
||||
alertRule3 := models.AlertRuleGen(models.WithOrgID(mainOrgID), models.WithInterval(cfg.BaseInterval), models.WithTitle("rule-3"))()
|
||||
ruleStore.PutRule(ctx, alertRule3)
|
||||
|
Loading…
Reference in New Issue
Block a user