Alerting: Move rule_group_rules metric from State to Scheduler. (#63144)

The `rule_group_rules` metric is currently defined and computed by `State`.
It makes more sense to compute this metric from the configured rule set
rather than from rule evaluation state: a rule that has not produced any
state yet would otherwise go uncounted.

Additionally, we would like this metric (and others) to carry a `rule_group`
label, which is much easier to achieve if the metric is produced from the
`Scheduler` package.
Steve Simpson, 2023-02-09 17:05:19 +01:00, committed by GitHub
parent 90f8959d3c
commit 4d1a2c3370
5 changed files with 64 additions and 22 deletions
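
To make the change concrete before the diffs: the new approach counts every configured rule per org, so a rule that has not evaluated yet is still reported. A minimal, self-contained sketch of that counting (the `AlertRule` type here is a simplified stand-in invented for illustration):

```go
package main

import "fmt"

// AlertRule is a simplified stand-in for ngmodels.AlertRule.
type AlertRule struct {
	OrgID int64
	UID   string
}

// countRulesPerOrg mirrors what updateRulesMetrics does below: it tallies
// configured rules by org, so every rule is counted whether or not it has
// produced any evaluation state yet.
func countRulesPerOrg(rules []*AlertRule) map[int64]int64 {
	counts := make(map[int64]int64)
	for _, r := range rules {
		counts[r.OrgID]++
	}
	return counts
}

func main() {
	rules := []*AlertRule{
		{OrgID: 1, UID: "a"}, // already evaluated at least once
		{OrgID: 1, UID: "b"}, // just created, no state yet: still counted
		{OrgID: 2, UID: "c"},
	}
	for org, n := range countRulesPerOrg(rules) {
		fmt.Printf("org=%d rules=%d\n", org, n)
	}
}
```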

@@ -13,6 +13,7 @@ type Scheduler struct {
 	EvalTotal                 *prometheus.CounterVec
 	EvalFailures              *prometheus.CounterVec
 	EvalDuration              *prometheus.HistogramVec
+	GroupRules                *prometheus.GaugeVec
 	SchedulePeriodicDuration  prometheus.Histogram
 	SchedulableAlertRules     prometheus.Gauge
 	SchedulableAlertRulesHash prometheus.Gauge
@@ -62,6 +63,16 @@ func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler {
 			},
 			[]string{"org"},
 		),
+		// TODO: partition on rule group as well as tenant, similar to loki|cortex.
+		GroupRules: promauto.With(r).NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace: Namespace,
+				Subsystem: Subsystem,
+				Name:      "rule_group_rules",
+				Help:      "The number of rules.",
+			},
+			[]string{"org"},
+		),
 		SchedulePeriodicDuration: promauto.With(r).NewHistogram(
 			prometheus.HistogramOpts{
 				Namespace: Namespace,
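
The TODO in this hunk anticipates partitioning per rule group as well as per tenant. A hypothetical sketch of what the gauge definition could look like with that extra label (`newGroupRulesVec` and its parameters are invented here; this is not part of the commit):

```go
package metrics

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// newGroupRulesVec sketches the follow-up the TODO describes: the same
// gauge, additionally partitioned by rule group.
func newGroupRulesVec(r prometheus.Registerer, namespace, subsystem string) *prometheus.GaugeVec {
	return promauto.With(r).NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "rule_group_rules",
			Help:      "The number of rules.",
		},
		[]string{"org", "rule_group"}, // per-tenant and per-group
	)
}
```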

@@ -6,23 +6,11 @@ import (
 )

 type State struct {
-	GroupRules *prometheus.GaugeVec
 	AlertState *prometheus.GaugeVec
 }

 func NewStateMetrics(r prometheus.Registerer) *State {
 	return &State{
-		// TODO: once rule groups support multiple rules, consider partitioning
-		// on rule group as well as tenant, similar to loki|cortex.
-		GroupRules: promauto.With(r).NewGaugeVec(
-			prometheus.GaugeOpts{
-				Namespace: Namespace,
-				Subsystem: Subsystem,
-				Name:      "rule_group_rules",
-				Help:      "The number of rules.",
-			},
-			[]string{"org"},
-		),
 		AlertState: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{
 			Namespace: Namespace,
 			Subsystem: Subsystem,
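
Removing the definition from `State` is not just cleanup: both metric structs appear to be built from the same `Registerer` (see `metrics.NewNGAlert(reg)` in the test below), and client_golang rejects a second collector with the same fully-qualified name. A standalone demonstration of that behavior (not Grafana code):

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	reg := prometheus.NewRegistry()
	opts := prometheus.GaugeOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "rule_group_rules",
		Help:      "The number of rules.",
	}

	// Registering the same fully-qualified name twice fails; promauto's
	// MustRegister-style helpers would panic instead of returning an error.
	fmt.Println(reg.Register(prometheus.NewGaugeVec(opts, []string{"org"}))) // <nil>
	fmt.Println(reg.Register(prometheus.NewGaugeVec(opts, []string{"org"}))) // duplicate registration error
}
```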

@@ -177,8 +177,7 @@ func (sch *schedule) DeleteAlertRule(keys ...ngmodels.AlertRuleKey) {
 	}
 	// Our best bet at this point is that we update the metrics with what we hope to schedule in the next tick.
 	alertRules, _ := sch.schedulableAlertRules.all()
-	sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))
-	sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(alertRules)))
+	sch.updateRulesMetrics(alertRules)
 }

 func (sch *schedule) schedulePeriodic(ctx context.Context, t *ticker.T) error {
@@ -209,6 +208,21 @@ type readyToRunItem struct {
 	evaluation
 }

+func (sch *schedule) updateRulesMetrics(alertRules []*ngmodels.AlertRule) {
+	orgs := make(map[int64]int64)
+	for _, rule := range alertRules {
+		orgs[rule.OrgID]++
+	}
+	for org, numRules := range orgs {
+		sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(org)).Set(float64(numRules))
+	}
+
+	// While these are the rules that we iterate over, at the moment there's no 100% guarantee that they'll be
+	// scheduled as rules could be removed before we get a chance to evaluate them.
+	sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))
+	sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(alertRules)))
+}
+
 func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.Group, tick time.Time) ([]readyToRunItem, map[ngmodels.AlertRuleKey]struct{}) {
 	tickNum := tick.Unix() / int64(sch.baseInterval.Seconds())
@@ -223,10 +237,7 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
 	// so, at the end, the remaining registered alert rules are the deleted ones
 	registeredDefinitions := sch.registry.keyMap()

-	// While these are the rules that we iterate over, at the moment there's no 100% guarantee that they'll be
-	// scheduled as rules could be removed before we get a chance to evaluate them.
-	sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))
-	sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(alertRules)))
+	sch.updateRulesMetrics(alertRules)

 	readyToRun := make([]readyToRunItem, 0)
 	missingFolder := make(map[string][]string)
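
One property of the Set-based update in updateRulesMetrics, as visible in the hunks above: series are only written for orgs that currently have rules, so an org whose last rule was deleted would keep exporting its previous count. A hypothetical mitigation (not in this commit) is to reset the vector before repopulating it:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// updateGroupRules is a hypothetical variant of updateRulesMetrics:
// Reset drops every previously written per-org series first, so orgs
// with zero remaining rules stop reporting a stale value.
func updateGroupRules(g *prometheus.GaugeVec, rulesPerOrg map[int64]int64) {
	g.Reset()
	for org, n := range rulesPerOrg {
		g.WithLabelValues(fmt.Sprint(org)).Set(float64(n))
	}
}

func main() {
	g := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "rule_group_rules",
		Help:      "The number of rules.",
	}, []string{"org"})

	updateGroupRules(g, map[int64]int64{1: 2, 2: 1})
	updateGroupRules(g, map[int64]int64{1: 2}) // org 2's series is removed rather than left stale
	fmt.Println("org 2 no longer exported")
}
```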

@@ -40,7 +40,8 @@ type evalAppliedInfo struct {

 func TestProcessTicks(t *testing.T) {
 	testTracer := tracing.InitializeTracerForTest()
-	testMetrics := metrics.NewNGAlert(prometheus.NewPedanticRegistry())
+	reg := prometheus.NewPedanticRegistry()
+	testMetrics := metrics.NewNGAlert(reg)

 	ctx := context.Background()
 	dispatcherGroup, ctx := errgroup.WithContext(ctx)
@@ -113,6 +114,17 @@ func TestProcessTicks(t *testing.T) {
 		assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
 	})

+	t.Run("after 1st tick rule metrics should report one rule", func(t *testing.T) {
+		expectedMetric := fmt.Sprintf(
+			`# HELP grafana_alerting_rule_group_rules The number of rules.
+# TYPE grafana_alerting_rule_group_rules gauge
+grafana_alerting_rule_group_rules{org="%[1]d"} 1
+`, alertRule1.OrgID)
+
+		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
+		require.NoError(t, err)
+	})
+
 	// add alert rule under main org with three base intervals
 	alertRule2 := models.AlertRuleGen(models.WithOrgID(mainOrgID), models.WithInterval(3*cfg.BaseInterval), models.WithTitle("rule-2"))()
 	ruleStore.PutRule(ctx, alertRule2)
@@ -128,6 +140,17 @@ func TestProcessTicks(t *testing.T) {
 		assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
 	})

+	t.Run("after 2nd tick rule metrics should report two rules", func(t *testing.T) {
+		expectedMetric := fmt.Sprintf(
+			`# HELP grafana_alerting_rule_group_rules The number of rules.
+# TYPE grafana_alerting_rule_group_rules gauge
+grafana_alerting_rule_group_rules{org="%[1]d"} 2
+`, alertRule1.OrgID)
+
+		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
+		require.NoError(t, err)
+	})
+
 	t.Run("on 3rd tick two alert rules should be evaluated", func(t *testing.T) {
 		tick = tick.Add(cfg.BaseInterval)
 		scheduled, stopped := sched.processTick(ctx, dispatcherGroup, tick)
@@ -172,6 +195,17 @@ func TestProcessTicks(t *testing.T) {
 		assertStopRun(t, stopAppliedCh, alertRule1.GetKey())
 	})

+	t.Run("after 5th tick rule metrics should report one rule", func(t *testing.T) {
+		expectedMetric := fmt.Sprintf(
+			`# HELP grafana_alerting_rule_group_rules The number of rules.
+# TYPE grafana_alerting_rule_group_rules gauge
+grafana_alerting_rule_group_rules{org="%[1]d"} 1
+`, alertRule1.OrgID)
+
+		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
+		require.NoError(t, err)
+	})
+
 	t.Run("on 6th tick one alert rule should be evaluated", func(t *testing.T) {
 		tick = tick.Add(cfg.BaseInterval)
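
The assertions added above follow a standard client_golang pattern: render the expected text exposition and diff it against a scrape of the registry, filtered to one metric family. A self-contained version of the same check (names simplified):

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	reg := prometheus.NewPedanticRegistry()
	g := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "rule_group_rules",
		Help:      "The number of rules.",
	}, []string{"org"})
	reg.MustRegister(g)
	g.WithLabelValues("1").Set(1)

	expected := `# HELP grafana_alerting_rule_group_rules The number of rules.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="1"} 1
`
	// GatherAndCompare scrapes reg and compares the named metric family
	// against the expected exposition-format text.
	err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expected), "grafana_alerting_rule_group_rules")
	fmt.Println("metrics match:", err == nil)
}
```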

@@ -2,7 +2,6 @@ package state

 import (
 	"context"
-	"fmt"
 	"math"
 	"net/url"
 	"strings"
@@ -282,8 +281,7 @@ func (c *cache) recordMetrics(metrics *metrics.State) {
 		eval.Error: 0,
 	}

-	for org, orgMap := range c.states {
-		metrics.GroupRules.WithLabelValues(fmt.Sprint(org)).Set(float64(len(orgMap)))
+	for _, orgMap := range c.states {
 		for _, rule := range orgMap {
 			for _, state := range rule.states {
 				n := ct[state.State]