Alerting: Don't use a separate collection system for metrics (#75296)

* Alerting: Don't use a separate collection system for metrics

The state package had a metric collection system that ran every 15s, updating the values of the metrics. There is a common pattern for this in the Prometheus ecosystem called "collectors".

I have removed the behaviour of using a time-based interval to "set" the metrics, in favour of a set of functions, used as the metric "values", that are called at scrape time.
This commit is contained in:
gotjosh
2023-09-25 10:27:30 +01:00
committed by GitHub
parent 1600f75d7e
commit 59694fb2be
6 changed files with 72 additions and 74 deletions

View File

@@ -69,8 +69,14 @@ type ManagerCfg struct {
}
func NewManager(cfg ManagerCfg) *Manager {
return &Manager{
cache: newCache(),
// Metrics for the cache use a collector, so they need access to the register directly.
c := newCache()
if cfg.Metrics != nil {
c.RegisterMetrics(cfg.Metrics.Registerer())
}
m := &Manager{
cache: c,
ResendDelay: ResendDelay, // TODO: make this configurable
log: cfg.Log,
metrics: cfg.Metrics,
@@ -84,24 +90,12 @@ func NewManager(cfg ManagerCfg) *Manager {
applyNoDataAndErrorToAllStates: cfg.ApplyNoDataAndErrorToAllStates,
tracer: cfg.Tracer,
}
}
func (st *Manager) Run(ctx context.Context) error {
if st.applyNoDataAndErrorToAllStates {
st.log.Info("Running in alternative execution of Error/NoData mode")
}
ticker := st.clock.Ticker(MetricsScrapeInterval)
for {
select {
case <-ticker.C:
st.log.Debug("Recording state cache metrics", "now", st.clock.Now())
st.cache.recordMetrics(st.metrics)
case <-ctx.Done():
st.log.Debug("Stopping")
ticker.Stop()
return ctx.Err()
}
if m.applyNoDataAndErrorToAllStates {
m.log.Info("Running in alternative execution of Error/NoData mode")
}
return m
}
func (st *Manager) Warm(ctx context.Context, rulesReader RuleReader) {