Alerting: Add status label to GroupRules metric (#63454)
* Add status label to GroupRules metric
* Add state (active and paused) label to GroupRules
* Add active/paused metrics tests
This commit is contained in:
parent f3714099e7
commit f60dc4441f
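For orientation, the sketch below is a minimal, self-contained illustration (not part of the commit) of the metric shape this change introduces: a gauge vector keyed by `org` and `state`, with the active count derived as total rules minus paused rules. The `rule` struct and the literal "grafana"/"alerting" namespace and subsystem strings are assumptions inferred from the metric name `grafana_alerting_rule_group_rules` asserted in the tests further down.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// rule is a hypothetical stand-in for the scheduler's alert rule type;
// only the fields needed for the per-org active/paused counts are modeled.
type rule struct {
	OrgID    int64
	IsPaused bool
}

func main() {
	reg := prometheus.NewRegistry()

	// Shape of the changed metric: grafana_alerting_rule_group_rules{org, state}.
	groupRules := promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "rule_group_rules",
		Help:      "The number of alert rules that are scheduled, both active and paused.",
	}, []string{"org", "state"})

	rules := []rule{{OrgID: 1}, {OrgID: 1, IsPaused: true}, {OrgID: 2}}

	// Count total and paused rules per org, then export active as total minus paused.
	total := make(map[int64]int64, len(rules))
	paused := make(map[int64]int64, len(rules))
	for _, r := range rules {
		total[r.OrgID]++
		if r.IsPaused {
			paused[r.OrgID]++
		}
	}
	for orgID, n := range total {
		groupRules.WithLabelValues(fmt.Sprint(orgID), "active").Set(float64(n - paused[orgID]))
		groupRules.WithLabelValues(fmt.Sprint(orgID), "paused").Set(float64(paused[orgID]))
	}

	// Print the gathered metric families to show the exported series.
	mfs, _ := reg.Gather()
	for _, mf := range mfs {
		fmt.Println(mf.GetName(), "->", len(mf.GetMetric()), "series")
	}
}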
@@ -7,6 +7,11 @@ import (
 	"github.com/grafana/grafana/pkg/util/ticker"
 )
 
+const (
+	AlertRuleActiveLabelValue = "active"
+	AlertRulePausedLabelValue = "paused"
+)
+
 type Scheduler struct {
 	Registerer    prometheus.Registerer
 	BehindSeconds prometheus.Gauge
@@ -69,9 +74,9 @@ func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler {
 			Namespace: Namespace,
 			Subsystem: Subsystem,
 			Name:      "rule_group_rules",
-			Help:      "The number of rules.",
+			Help:      "The number of alert rules that are scheduled, both active and paused.",
 		},
-		[]string{"org"},
+		[]string{"org", "state"},
 	),
 	SchedulePeriodicDuration: promauto.With(r).NewHistogram(
 		prometheus.HistogramOpts{
@@ -209,12 +209,19 @@ type readyToRunItem struct {
 }
 
 func (sch *schedule) updateRulesMetrics(alertRules []*ngmodels.AlertRule) {
-	orgs := make(map[int64]int64)
+	orgs := make(map[int64]int64, len(alertRules))
+	orgsPaused := make(map[int64]int64, len(alertRules))
 	for _, rule := range alertRules {
 		orgs[rule.OrgID]++
+		if rule.IsPaused {
+			orgsPaused[rule.OrgID]++
+		}
 	}
-	for org, numRules := range orgs {
-		sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(org)).Set(float64(numRules))
+	for orgID, numRules := range orgs {
+		numRulesPaused := orgsPaused[orgID]
+		sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(orgID), metrics.AlertRuleActiveLabelValue).Set(float64(numRules - numRulesPaused))
+		sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(orgID), metrics.AlertRulePausedLabelValue).Set(float64(numRulesPaused))
 	}
 
 	// While these are the rules that we iterate over, at the moment there's no 100% guarantee that they'll be
@@ -114,11 +114,12 @@ func TestProcessTicks(t *testing.T) {
 		assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
 	})
 
-	t.Run("after 1st tick rule metrics should report one rule", func(t *testing.T) {
+	t.Run("after 1st tick rule metrics should report one active alert rule", func(t *testing.T) {
 		expectedMetric := fmt.Sprintf(
-			`# HELP grafana_alerting_rule_group_rules The number of rules.
+			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
 			# TYPE grafana_alerting_rule_group_rules gauge
-			grafana_alerting_rule_group_rules{org="%[1]d"} 1
+			grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 1
+			grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
 			`, alertRule1.OrgID)
 
 		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
@@ -140,11 +141,12 @@ func TestProcessTicks(t *testing.T) {
 		assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
 	})
 
-	t.Run("after 2nd tick rule metrics should report two rules", func(t *testing.T) {
+	t.Run("after 2nd tick rule metrics should report two active alert rules", func(t *testing.T) {
 		expectedMetric := fmt.Sprintf(
-			`# HELP grafana_alerting_rule_group_rules The number of rules.
+			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
 			# TYPE grafana_alerting_rule_group_rules gauge
-			grafana_alerting_rule_group_rules{org="%[1]d"} 2
+			grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 2
+			grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
 			`, alertRule1.OrgID)
 
 		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
@@ -180,7 +182,95 @@ func TestProcessTicks(t *testing.T) {
 		assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
 	})
 
-	t.Run("on 5th tick deleted rule should not be evaluated but stopped", func(t *testing.T) {
+	t.Run("on 5th tick an alert rule is paused (it still enters evaluation but it is early skipped)", func(t *testing.T) {
+		tick = tick.Add(cfg.BaseInterval)
+
+		alertRule1.IsPaused = true
+
+		scheduled, stopped := sched.processTick(ctx, dispatcherGroup, tick)
+
+		require.Len(t, scheduled, 1)
+		require.Equal(t, alertRule1, scheduled[0].rule)
+		require.Equal(t, tick, scheduled[0].scheduledAt)
+		require.Emptyf(t, stopped, "None rules are expected to be stopped")
+
+		assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
+	})
+
+	t.Run("after 5th tick rule metrics should report one active and one paused alert rules", func(t *testing.T) {
+		expectedMetric := fmt.Sprintf(
+			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
+			# TYPE grafana_alerting_rule_group_rules gauge
+			grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 1
+			grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 1
+			`, alertRule1.OrgID)
+
+		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
+		require.NoError(t, err)
+	})
+
+	t.Run("on 6th tick all alert rule are paused (it still enters evaluation but it is early skipped)", func(t *testing.T) {
+		tick = tick.Add(cfg.BaseInterval)
+
+		alertRule2.IsPaused = true
+
+		scheduled, stopped := sched.processTick(ctx, dispatcherGroup, tick)
+
+		require.Len(t, scheduled, 2)
+		var keys []models.AlertRuleKey
+		for _, item := range scheduled {
+			keys = append(keys, item.rule.GetKey())
+			require.Equal(t, tick, item.scheduledAt)
+		}
+		require.Contains(t, keys, alertRule1.GetKey())
+		require.Contains(t, keys, alertRule2.GetKey())
+
+		require.Emptyf(t, stopped, "None rules are expected to be stopped")
+
+		assertEvalRun(t, evalAppliedCh, tick, keys...)
+	})
+
+	t.Run("after 6th tick rule metrics should report two paused alert rules", func(t *testing.T) {
+		expectedMetric := fmt.Sprintf(
+			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
+			# TYPE grafana_alerting_rule_group_rules gauge
+			grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 0
+			grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 2
+			`, alertRule1.OrgID)
+
+		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
+		require.NoError(t, err)
+	})
+
+	t.Run("on 7th tick unpause all alert rules", func(t *testing.T) {
+		tick = tick.Add(cfg.BaseInterval)
+
+		alertRule1.IsPaused = false
+		alertRule2.IsPaused = false
+
+		scheduled, stopped := sched.processTick(ctx, dispatcherGroup, tick)
+
+		require.Len(t, scheduled, 1)
+		require.Equal(t, alertRule1, scheduled[0].rule)
+		require.Equal(t, tick, scheduled[0].scheduledAt)
+		require.Emptyf(t, stopped, "None rules are expected to be stopped")
+
+		assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
+	})
+
+	t.Run("after 7th tick rule metrics should report two active alert rules", func(t *testing.T) {
+		expectedMetric := fmt.Sprintf(
+			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
+			# TYPE grafana_alerting_rule_group_rules gauge
+			grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 2
+			grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
+			`, alertRule1.OrgID)
+
+		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
+		require.NoError(t, err)
+	})
+
+	t.Run("on 8th tick deleted rule should not be evaluated but stopped", func(t *testing.T) {
 		tick = tick.Add(cfg.BaseInterval)
 
 		ruleStore.DeleteRule(alertRule1)
@@ -195,18 +285,19 @@ func TestProcessTicks(t *testing.T) {
 		assertStopRun(t, stopAppliedCh, alertRule1.GetKey())
 	})
 
-	t.Run("after 5th tick rule metrics should report one rules", func(t *testing.T) {
+	t.Run("after 8th tick rule metrics should report one active alert rule", func(t *testing.T) {
 		expectedMetric := fmt.Sprintf(
-			`# HELP grafana_alerting_rule_group_rules The number of rules.
+			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
 			# TYPE grafana_alerting_rule_group_rules gauge
-			grafana_alerting_rule_group_rules{org="%[1]d"} 1
+			grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 1
+			grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
 			`, alertRule1.OrgID)
 
 		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
 		require.NoError(t, err)
 	})
 
-	t.Run("on 6th tick one alert rule should be evaluated", func(t *testing.T) {
+	t.Run("on 9th tick one alert rule should be evaluated", func(t *testing.T) {
 		tick = tick.Add(cfg.BaseInterval)
 
 		scheduled, stopped := sched.processTick(ctx, dispatcherGroup, tick)
@@ -219,7 +310,7 @@ func TestProcessTicks(t *testing.T) {
 		assertEvalRun(t, evalAppliedCh, tick, alertRule2.GetKey())
 	})
 
-	t.Run("on 7th tick a new alert rule should be evaluated", func(t *testing.T) {
+	t.Run("on 10th tick a new alert rule should be evaluated", func(t *testing.T) {
 		// create alert rule with one base interval
 		alertRule3 := models.AlertRuleGen(models.WithOrgID(mainOrgID), models.WithInterval(cfg.BaseInterval), models.WithTitle("rule-3"))()
 		ruleStore.PutRule(ctx, alertRule3)