Alerting: Schedule Alert rules metric tracking (#50415)

* Alerting: Schedule Alert rules metric tracking

Record the metrics in two places instead of one, as an attempt to keep a semi-accurate picture of the rules that will be scheduled.
gotjosh 2022-06-08 18:37:33 +01:00 committed by GitHub
parent 2813e49842
commit c59938b235
5 changed files with 21 additions and 5 deletions


@@ -56,7 +56,12 @@ Scopes must have an order to ensure consistency and ease of search, this helps u
 - [ENHANCEMENT] Create folder 'General Alerting' when Grafana starts from scratch #48866
 - [ENHANCEMENT] Rule changes authorization logic to use UID folder scope instead of ID scope #48970
 - [ENHANCEMENT] Scheduler: ticker to support stopping #48142
-- [ENHANCEMENT] Scheduler: Adds new metrics to track rules that might be scheduled.
+- [ENHANCEMENT] Scheduler: Adds new metrics to track rules that might be scheduled #49874
+  - `grafana_alerting_schedule_alert_rules`
+  - `grafana_alerting_schedule_alert_rules_hash`
+- [CHANGE] Scheduler: Renaming of metrics to make them consistent with similar metrics exposed by the component #49874
+  - `grafana_alerting_get_alert_rules_duration_seconds` to `grafana_alerting_schedule_periodic_duration_seconds`
+  - `grafana_alerting_schedule_periodic_duration_seconds` to `grafana_alerting_schedule_query_alert_rules_duration_seconds`
 - [FEATURE] Indicate whether routes are provisioned when GETting Alertmanager configuration #47857
 - [FEATURE] Indicate whether contact point is provisioned when GETting Alertmanager configuration #48323
 - [FEATURE] Indicate whether alert rule is provisioned when GETting the rule #48458
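The two gauges introduced by this entry are exposed on Grafana's /metrics endpoint like any other Prometheus metric. As a usage illustration only (not part of this commit), here is a minimal Go sketch that reads them back through the Prometheus HTTP API; the server address is a placeholder and assumes a Prometheus server that already scrapes Grafana:

package main

import (
	"context"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

func main() {
	// Placeholder address of a Prometheus server that scrapes Grafana.
	client, err := api.NewClient(api.Config{Address: "http://localhost:9090"})
	if err != nil {
		panic(err)
	}
	promAPI := v1.NewAPI(client)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// Number of rules the scheduler expects to consider at the next tick.
	count, _, err := promAPI.Query(ctx, "grafana_alerting_schedule_alert_rules", time.Now())
	if err != nil {
		panic(err)
	}
	fmt.Println("schedulable rules:", count)

	// The absolute value of the hash gauge carries no meaning on its own; it is
	// most useful through functions like changes() to detect churn in the rule
	// set, e.g. changes(grafana_alerting_schedule_alert_rules_hash[10m]) > 0.
	hash, _, err := promAPI.Query(ctx, "grafana_alerting_schedule_alert_rules_hash", time.Now())
	if err != nil {
		panic(err)
	}
	fmt.Println("rule set hash:", hash)
}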


@@ -180,7 +180,7 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
 Namespace: Namespace,
 Subsystem: Subsystem,
 Name: "schedule_alert_rules",
-Help: "The number of alert rules being considered for evaluation each tick.",
+Help: "The number of alert rules that could be considered for evaluation at the next tick.",
 },
 ),
 SchedulableAlertRulesHash: promauto.With(r).NewGauge(
@@ -188,7 +188,7 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
 Namespace: Namespace,
 Subsystem: Subsystem,
 Name: "schedule_alert_rules_hash",
-Help: "A hash of the alert rules over time.",
+Help: "A hash of the alert rules that could be considered for evaluation at the next tick.",
 }),
 UpdateSchedulableAlertRulesDuration: promauto.With(r).NewHistogram(
 prometheus.HistogramOpts{


@@ -50,7 +50,5 @@ func (sch *schedule) updateSchedulableAlertRules(ctx context.Context, disabledOr
 return fmt.Errorf("failed to get alert rules: %w", err)
 }
 sch.schedulableAlertRules.set(q.Result)
-sch.metrics.SchedulableAlertRules.Set(float64(len(q.Result)))
-sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(q.Result)))
 return nil
 }
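The lines removed above, and the new call sites later in this commit, both rely on hashUIDs, which the diff does not show. A plausible standalone sketch, assuming an order-independent FNV-1a fingerprint of the rule UIDs (the real function presumably takes the rule slice rather than plain strings):

package main

import (
	"fmt"
	"hash/fnv"
	"sort"
)

// hashUIDs sketches a stable fingerprint of a set of rule UIDs. Sorting first
// makes the result independent of registry iteration order; the separator byte
// keeps {"ab", "c"} and {"a", "bc"} from colliding.
func hashUIDs(uids []string) uint64 {
	sorted := append([]string(nil), uids...)
	sort.Strings(sorted)
	h := fnv.New64a()
	for _, uid := range sorted {
		h.Write([]byte(uid))
		h.Write([]byte{0})
	}
	return h.Sum64()
}

func main() {
	// Same set, different order: identical hash.
	fmt.Println(hashUIDs([]string{"rule-a", "rule-b"}) == hashUIDs([]string{"rule-b", "rule-a"})) // true
}

Exposing such a hash as a gauge means a change in the schedulable rule set shows up as a change in the metric value, even when the rule count stays the same.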


@@ -163,6 +163,9 @@ func (r *schedulableAlertRulesRegistry) update(rule *models.SchedulableAlertRule
 r.rules[rule.GetKey()] = rule
 }
+// del removes the pair that has the specified key from schedulableAlertRulesRegistry.
+// Returns a 2-tuple where the first element is the value of the removed pair
+// and the second element indicates whether an element with the specified key existed.
 func (r *schedulableAlertRulesRegistry) del(k models.AlertRuleKey) (*models.SchedulableAlertRule, bool) {
 r.mu.Lock()
 defer r.mu.Unlock()
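This hunk documents del, and other hunks in the commit call set and all on the same registry. A rough sketch of the shape that code assumes, with simplified names and a stand-in rule type rather than Grafana's actual definitions:

package main

import (
	"fmt"
	"sync"
)

// Stand-in types; the real registry is keyed by models.AlertRuleKey and stores
// *models.SchedulableAlertRule.
type ruleKey struct {
	OrgID int64
	UID   string
}

type rule struct {
	Key   ruleKey
	Title string
}

type rulesRegistry struct {
	mu    sync.Mutex
	rules map[ruleKey]*rule
}

// set replaces the whole rule set with the result of the latest fetch.
func (r *rulesRegistry) set(rs []*rule) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.rules = make(map[ruleKey]*rule, len(rs))
	for _, rl := range rs {
		r.rules[rl.Key] = rl
	}
}

// all returns a snapshot slice, which is what the metric updates are based on.
func (r *rulesRegistry) all() []*rule {
	r.mu.Lock()
	defer r.mu.Unlock()
	out := make([]*rule, 0, len(r.rules))
	for _, rl := range r.rules {
		out = append(out, rl)
	}
	return out
}

// del removes the pair with the given key and reports whether it existed.
func (r *rulesRegistry) del(k ruleKey) (*rule, bool) {
	r.mu.Lock()
	defer r.mu.Unlock()
	rl, ok := r.rules[k]
	if ok {
		delete(r.rules, k)
	}
	return rl, ok
}

func main() {
	reg := &rulesRegistry{}
	reg.set([]*rule{{Key: ruleKey{OrgID: 1, UID: "abc"}, Title: "High CPU"}})
	_, existed := reg.del(ruleKey{OrgID: 1, UID: "abc"})
	fmt.Println(existed, len(reg.all())) // true 0
}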


@@ -338,6 +338,11 @@ func (sch *schedule) DeleteAlertRule(key models.AlertRuleKey) {
 }
 // stop rule evaluation
 ruleInfo.stop()
+// Our best bet at this point is that we update the metrics with what we hope to schedule in the next tick.
+alertRules := sch.schedulableAlertRules.all()
+sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))
+sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(alertRules)))
 }
 func (sch *schedule) adminConfigSync(ctx context.Context) error {
@@ -392,6 +397,11 @@ func (sch *schedule) schedulePeriodic(ctx context.Context) error {
 // so, at the end, the remaining registered alert rules are the deleted ones
 registeredDefinitions := sch.registry.keyMap()
+// While these are the rules that we iterate over, at the moment there's no 100% guarantee that they'll be
+// scheduled as rules could be removed before we get a chance to evaluate them.
+sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))
+sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(alertRules)))
 type readyToRunItem struct {
 key models.AlertRuleKey
 ruleName string
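Because both call sites simply Set the gauges, their values can be asserted directly in tests. A hypothetical check, not Grafana's test code, using the client_golang testutil helpers; the namespace and subsystem literals here are assumptions that mirror the metric name listed in the changelog:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	reg := prometheus.NewRegistry()
	schedulable := prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "schedule_alert_rules",
		Help:      "The number of alert rules that could be considered for evaluation at the next tick.",
	})
	reg.MustRegister(schedulable)

	// Simulate what schedulePeriodic or DeleteAlertRule would do after
	// snapshotting the schedulable rules.
	schedulable.Set(3)

	// testutil.ToFloat64 reads the current gauge value back for assertions.
	fmt.Println(testutil.ToFloat64(schedulable) == 3) // true
}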