Alerting: Don't use a separate collection system for metrics (#75296)

* Alerting: Don't use a separate collection system for metrics

The state package had a metric collection system that ran every 15s, updating the values of the metrics. There is a common pattern for this in the Prometheus ecosystem called "collectors".

I have removed the behaviour of using a time-based interval to "set" the metrics, in favour of a set of functions, used as the metric "values", that are called at scrape time.
This commit is contained in:
gotjosh
2023-09-25 10:27:30 +01:00
committed by GitHub
parent 1600f75d7e
commit 59694fb2be
6 changed files with 72 additions and 74 deletions

View File

@@ -69,8 +69,14 @@ type ManagerCfg struct {
}
func NewManager(cfg ManagerCfg) *Manager {
return &Manager{
cache: newCache(),
// Metrics for the cache use a collector, so they need access to the register directly.
c := newCache()
if cfg.Metrics != nil {
c.RegisterMetrics(cfg.Metrics.Registerer())
}
m := &Manager{
cache: c,
ResendDelay: ResendDelay, // TODO: make this configurable
log: cfg.Log,
metrics: cfg.Metrics,
@@ -84,24 +90,12 @@ func NewManager(cfg ManagerCfg) *Manager {
applyNoDataAndErrorToAllStates: cfg.ApplyNoDataAndErrorToAllStates,
tracer: cfg.Tracer,
}
}
func (st *Manager) Run(ctx context.Context) error {
if st.applyNoDataAndErrorToAllStates {
st.log.Info("Running in alternative execution of Error/NoData mode")
}
ticker := st.clock.Ticker(MetricsScrapeInterval)
for {
select {
case <-ticker.C:
st.log.Debug("Recording state cache metrics", "now", st.clock.Now())
st.cache.recordMetrics(st.metrics)
case <-ctx.Done():
st.log.Debug("Stopping")
ticker.Stop()
return ctx.Err()
}
if m.applyNoDataAndErrorToAllStates {
m.log.Info("Running in alternative execution of Error/NoData mode")
}
return m
}
func (st *Manager) Warm(ctx context.Context, rulesReader RuleReader) {