Alerting: Move rule_groups_rules metric from State to Scheduler. (#63144)

The `rule_groups_rules` metric is currently defined and computed by `State`. It makes more sense for this metric to be computed from the configured rule set rather than from the rule evaluation state: there can be an edge case where a rule does not have a state yet and is therefore not counted. Additionally, we would like this metric (and others) to have a `rule_group` label, which is much easier to achieve if the metric is produced from the `Scheduler` package.
2025-02-25 18:55:37 -06:00 · 2023-02-09 17:05:19 +01:00
parent 90f8959d3c
commit 4d1a2c3370
5 changed files with 64 additions and 22 deletions
--- a/pkg/services/ngalert/schedule/schedule.go
+++ b/pkg/services/ngalert/schedule/schedule.go
@@ -177,8 +177,7 @@ func (sch *schedule) DeleteAlertRule(keys ...ngmodels.AlertRuleKey) {
 	}
 	// Our best bet at this point is that we update the metrics with what we hope to schedule in the next tick.
 	alertRules, _ := sch.schedulableAlertRules.all()
-	sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))
-	sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(alertRules)))
+	sch.updateRulesMetrics(alertRules)
 }

 func (sch *schedule) schedulePeriodic(ctx context.Context, t *ticker.T) error {
@@ -209,6 +208,21 @@ type readyToRunItem struct {
 	evaluation
 }

+func (sch *schedule) updateRulesMetrics(alertRules []*ngmodels.AlertRule) {
+	orgs := make(map[int64]int64)
+	for _, rule := range alertRules {
+		orgs[rule.OrgID]++
+	}
+	for org, numRules := range orgs {
+		sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(org)).Set(float64(numRules))
+	}
+
+	// While these are the rules that we iterate over, at the moment there's no 100% guarantee that they'll be
+	// scheduled as rules could be removed before we get a chance to evaluate them.
+	sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))
+	sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(alertRules)))
+}
+
 func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.Group, tick time.Time) ([]readyToRunItem, map[ngmodels.AlertRuleKey]struct{}) {
 	tickNum := tick.Unix() / int64(sch.baseInterval.Seconds())

@@ -223,10 +237,7 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
 	// so, at the end, the remaining registered alert rules are the deleted ones
 	registeredDefinitions := sch.registry.keyMap()

-	// While these are the rules that we iterate over, at the moment there's no 100% guarantee that they'll be
-	// scheduled as rules could be removed before we get a chance to evaluate them.
-	sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))
-	sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(alertRules)))
+	sch.updateRulesMetrics(alertRules)

 	readyToRun := make([]readyToRunItem, 0)
 	missingFolder := make(map[string][]string)