Alerting/metrics (#33547)

* moves alerting metrics to their own pkg

* adds grafana_alerting_alerts (by state) metric

* alerts_received_{total,invalid}

* embed the alertmanager alerting struct in ng metrics & remove duplicated notification metrics (ng metrics already embeds the alertmanager notifier metrics)

* use silence metrics from alertmanager lib

* fix - manager has metrics

* updates ngalert tests

* comment lint
Signed-off-by: Owen Diehl <ow.diehl@gmail.com>

* cleaner prom registry code

* removes ngalert global metrics

* new registry use in all tests

* ngalert metrics implemented as a service; adjust testinfra code to prevent duplicate metric registrations

* nilmetrics unexported
This commit is contained in:
Owen Diehl
2021-04-30 12:28:06 -04:00
committed by GitHub
parent b45120b999
commit 5e48b54549
15 changed files with 165 additions and 137 deletions

View File

@@ -2,12 +2,14 @@ package state
import (
"fmt"
"strings"
"sync"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
ngModels "github.com/grafana/grafana/pkg/services/ngalert/models"
prometheusModel "github.com/prometheus/common/model"
)
@@ -16,12 +18,14 @@ type cache struct {
states map[string]*State
mtxStates sync.RWMutex
log log.Logger
metrics *metrics.Metrics
}
func newCache(logger log.Logger) *cache {
func newCache(logger log.Logger, metrics *metrics.Metrics) *cache {
return &cache{
states: make(map[string]*State),
log: logger,
states: make(map[string]*State),
log: logger,
metrics: metrics,
}
}
@@ -118,13 +122,23 @@ func (c *cache) reset() {
// trim caps each cached state's evaluation history at the most recent 100
// results and refreshes the per-state alert-count gauge.
//
// NOTE(review): this text is recovered from a diff rendering with the +/-
// markers stripped, so both the pre-change and post-change copy lines are
// present below; the second copy fully overwrites the first, which matches
// the post-change behavior (keep the LAST 100 results).
func (c *cache) trim() {
	c.mtxStates.Lock()
	defer c.mtxStates.Unlock()
	// Count of states per eval.State, used to update the gauge afterwards.
	ct := make(map[eval.State]int)
	for _, v := range c.states {
		if len(v.Results) > 100 {
			newResults := make([]Evaluation, 100)
			// NOTE(review): removed (pre-change) line from the diff — kept
			// here verbatim; it is immediately overwritten by the next copy.
			copy(newResults, v.Results[100:])
			// Keep last 100 results
			copy(newResults, v.Results[len(v.Results)-100:])
			v.Results = newResults
			c.set(v)
		}
		n := ct[v.State]
		ct[v.State] = n + 1
	}
	// Export the current number of alerts in each state
	// (label is the lowercased state name).
	for k, n := range ct {
		c.metrics.AlertState.WithLabelValues(strings.ToLower(k.String())).Set(float64(n))
	}
}

View File

@@ -1,25 +1,29 @@
package state
import (
"fmt"
"time"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
ngModels "github.com/grafana/grafana/pkg/services/ngalert/models"
)
type Manager struct {
cache *cache
quit chan struct{}
Log log.Logger
cache *cache
quit chan struct{}
Log log.Logger
metrics *metrics.Metrics
}
func NewManager(logger log.Logger) *Manager {
func NewManager(logger log.Logger, metrics *metrics.Metrics) *Manager {
manager := &Manager{
cache: newCache(logger),
quit: make(chan struct{}),
Log: logger,
cache: newCache(logger, metrics),
quit: make(chan struct{}),
Log: logger,
metrics: metrics,
}
go manager.cleanUp()
return manager
@@ -95,8 +99,11 @@ func (st *Manager) GetStatesByRuleUID() map[string][]*State {
}
func (st *Manager) cleanUp() {
ticker := time.NewTicker(time.Duration(60) * time.Minute)
st.Log.Debug("starting cleanup process", "intervalMinutes", 60)
// TODO: parameterize?
// Setting to a reasonable default scrape interval for Prometheus.
dur := time.Duration(15) * time.Second
ticker := time.NewTicker(dur)
st.Log.Debug("starting cleanup process", "dur", fmt.Sprint(dur))
for {
select {
case <-ticker.C: