Alerting/ruler metrics (#34144)
* adds active configurations metric
* rule evaluation metrics
* ruler metrics
* pr feedback
commit 1367f7171e
parent eb74994b8b
@@ -29,8 +29,13 @@ type Metrics struct {
 	*metrics.Alerts
 	AlertState *prometheus.GaugeVec
 	// Registerer is for use by subcomponents which register their own metrics.
-	Registerer      prometheus.Registerer
-	RequestDuration *prometheus.HistogramVec
+	Registerer           prometheus.Registerer
+	RequestDuration      *prometheus.HistogramVec
+	ActiveConfigurations prometheus.Gauge
+	EvalTotal            *prometheus.CounterVec
+	EvalFailures         *prometheus.CounterVec
+	EvalDuration         *prometheus.SummaryVec
+	GroupRules           *prometheus.GaugeVec
 }

 func init() {
@@ -68,6 +73,54 @@ func NewMetrics(r prometheus.Registerer) *Metrics {
 			},
 			[]string{"method", "route", "status_code", "backend"},
 		),
+		ActiveConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
+			Namespace: "grafana",
+			Subsystem: "alerting",
+			Name:      "active_configurations",
+			Help:      "The number of active, non default alertmanager configurations for grafana managed alerts",
+		}),
+		// TODO: once rule groups support multiple rules, consider partitioning
+		// on rule group as well as tenant, similar to loki|cortex.
+		EvalTotal: prometheus.NewCounterVec(
+			prometheus.CounterOpts{
+				Namespace: "grafana",
+				Subsystem: "alerting",
+				Name:      "rule_evaluations_total",
+				Help:      "The total number of rule evaluations.",
+			},
+			[]string{"user"},
+		),
+		// TODO: once rule groups support multiple rules, consider partitioning
+		// on rule group as well as tenant, similar to loki|cortex.
+		EvalFailures: prometheus.NewCounterVec(
+			prometheus.CounterOpts{
+				Namespace: "grafana",
+				Subsystem: "alerting",
+				Name:      "rule_evaluation_failures_total",
+				Help:      "The total number of rule evaluation failures.",
+			},
+			[]string{"user"},
+		),
+		EvalDuration: prometheus.NewSummaryVec(
+			prometheus.SummaryOpts{
+				Namespace:  "grafana",
+				Subsystem:  "alerting",
+				Help:       "The duration for a rule to execute.",
+				Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
+			},
+			[]string{"user"},
+		),
+		// TODO: once rule groups support multiple rules, consider partitioning
+		// on rule group as well as tenant, similar to loki|cortex.
+		GroupRules: prometheus.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace: "grafana",
+				Subsystem: "alerting",
+				Name:      "rule_group_rules",
+				Help:      "The number of rules.",
+			},
+			[]string{"user"},
+		),
 	}
 }
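For context, the sketch below is a minimal standalone program (not Grafana code) that registers a comparable gauge and counter vector on a fresh registry and serves them; the promhttp wiring is just one common way to expose a custom registry. Note that, as rendered in this diff, the EvalDuration SummaryOpts carries no Name field.

package main

import (
	"fmt"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	reg := prometheus.NewRegistry()

	// Counterpart of ActiveConfigurations above: a plain 0/1 gauge.
	active := promauto.With(reg).NewGauge(prometheus.GaugeOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "active_configurations",
		Help:      "The number of active, non default alertmanager configurations for grafana managed alerts",
	})

	// Counterpart of EvalTotal: partitioned by tenant via the "user" label.
	evalTotal := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "rule_evaluations_total",
		Help:      "The total number of rule evaluations.",
	}, []string{"user"})

	active.Set(1)
	evalTotal.WithLabelValues("1").Inc() // the label value is the org (tenant) ID

	// Exposes grafana_alerting_active_configurations and
	// grafana_alerting_rule_evaluations_total{user="1"} on :2112/metrics.
	http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
	fmt.Println(http.ListenAndServe(":2112", nil))
}

The exported series name is always namespace_subsystem_name, so the fields above produce the grafana_alerting_* family.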
@@ -65,7 +65,11 @@ func (ng *AlertNG) Init() error {
 	ng.stateManager = state.NewManager(ng.Log, ng.Metrics)
 	baseInterval := baseIntervalSeconds * time.Second

-	store := &store.DBstore{BaseInterval: baseInterval, DefaultIntervalSeconds: defaultIntervalSeconds, SQLStore: ng.SQLStore}
+	store := &store.DBstore{
+		BaseInterval:           baseInterval,
+		DefaultIntervalSeconds: defaultIntervalSeconds,
+		SQLStore:               ng.SQLStore,
+	}

 	var err error
 	ng.Alertmanager, err = notifier.New(ng.Cfg, store, ng.Metrics)
@@ -82,6 +86,7 @@ func (ng *AlertNG) Init() error {
 		InstanceStore: store,
 		RuleStore:     store,
 		Notifier:      ng.Alertmanager,
+		Metrics:       ng.Metrics,
 	}
 	ng.schedule = schedule.NewScheduler(schedCfg, ng.DataService)
@@ -212,6 +212,7 @@ func (am *Alertmanager) SaveAndApplyConfig(cfg *apimodels.PostableUserConfig) er
 	if err != nil {
 		return err
 	}
+	am.Metrics.ActiveConfigurations.Set(1)

 	return nil
 }
@@ -253,6 +254,12 @@ func (am *Alertmanager) SyncAndApplyConfigFromDatabase() error {
 		return fmt.Errorf("unable to reload configuration: %w", err)
 	}

+	if q.Result.Default {
+		am.Metrics.ActiveConfigurations.Set(0)
+	} else {
+		am.Metrics.ActiveConfigurations.Set(1)
+	}
+
 	return nil
 }
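The gauge is used as a boolean here: 1 when a custom Alertmanager configuration is saved or applied, 0 when the configuration loaded from the database is the default one. The sketch below is a hypothetical test, not part of this commit, showing how such a 0/1 gauge can be asserted with client_golang's testutil helpers:

package notifier

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func TestActiveConfigurationsGauge(t *testing.T) {
	reg := prometheus.NewRegistry()
	active := promauto.With(reg).NewGauge(prometheus.GaugeOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "active_configurations",
	})

	// Default configuration loaded from the database: gauge reads 0.
	active.Set(0)
	if got := testutil.ToFloat64(active); got != 0 {
		t.Fatalf("expected 0, got %v", got)
	}

	// Custom configuration saved and applied: gauge flips to 1.
	active.Set(1)
	if got := testutil.ToFloat64(active); got != 1 {
		t.Fatalf("expected 1, got %v", got)
	}
}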
@@ -35,6 +35,7 @@ func setupAMTest(t *testing.T) *Alertmanager {
 		DataPath: dir,
 	}

+	m := metrics.NewMetrics(prometheus.NewRegistry())
 	sqlStore := sqlstore.InitTestDB(t)
 	store := &store.DBstore{
 		BaseInterval: 10 * time.Second,
@@ -42,7 +43,7 @@ func setupAMTest(t *testing.T) *Alertmanager {
 		SQLStore: sqlStore,
 	}

-	am, err := New(cfg, store, metrics.NewMetrics(prometheus.NewRegistry()))
+	am, err := New(cfg, store, m)
 	require.NoError(t, err)
 	return am
 }
@@ -8,6 +8,7 @@ import (
 	"github.com/benbjohnson/clock"
 	apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
+	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
 	"golang.org/x/sync/errgroup"

 	"github.com/grafana/grafana/pkg/infra/log"
@@ -39,7 +40,6 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRul
 	sch.log.Debug("alert rule routine started", "key", key)

 	evalRunning := false
-	var start, end time.Time
 	var attempt int64
 	var alertRule *models.AlertRule
 	for {
@@ -50,7 +50,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRul
 		}

 		evaluate := func(attempt int64) error {
-			start = timeNow()
+			start := timeNow()

 			// fetch latest alert rule version
 			if alertRule == nil || alertRule.Version < ctx.version {
@@ -70,8 +70,16 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRul
 				Data:      alertRule.Data,
 			}
 			results, err := sch.evaluator.ConditionEval(&condition, ctx.now, sch.dataService)
-			end = timeNow()
+			var (
+				end    = timeNow()
+				tenant = fmt.Sprint(alertRule.OrgID)
+				dur    = end.Sub(start).Seconds()
+			)
+
+			sch.metrics.EvalTotal.WithLabelValues(tenant).Inc()
+			sch.metrics.EvalDuration.WithLabelValues(tenant).Observe(dur)
 			if err != nil {
+				sch.metrics.EvalFailures.WithLabelValues(tenant).Inc()
 				// consider saving alert instance on error
 				sch.log.Error("failed to evaluate alert rule", "title", alertRule.Title,
 					"key", key, "attempt", attempt, "now", ctx.now, "duration", end.Sub(start), "error", err)
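The instrumentation pattern in this hunk increments the total counter and observes the duration on every attempt, and increments a separate failure counter only on error, which is what lets a dashboard derive a per-tenant failure ratio (failures / total). A self-contained sketch of the same pattern follows; rulerMetrics and instrumentEval are illustrative stand-ins, not Grafana API:

package main

import (
	"errors"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// rulerMetrics is a minimal stand-in for the Metrics fields the scheduler uses.
type rulerMetrics struct {
	EvalTotal    *prometheus.CounterVec
	EvalFailures *prometheus.CounterVec
	EvalDuration *prometheus.SummaryVec
}

// instrumentEval wraps one rule evaluation the way ruleRoutine does above:
// always count and time the attempt, and additionally count failures.
func instrumentEval(m *rulerMetrics, tenant string, eval func() error) error {
	start := time.Now()
	err := eval()

	m.EvalTotal.WithLabelValues(tenant).Inc()
	m.EvalDuration.WithLabelValues(tenant).Observe(time.Since(start).Seconds())
	if err != nil {
		m.EvalFailures.WithLabelValues(tenant).Inc()
	}
	return err
}

func main() {
	m := &rulerMetrics{
		EvalTotal:    prometheus.NewCounterVec(prometheus.CounterOpts{Name: "rule_evaluations_total"}, []string{"user"}),
		EvalFailures: prometheus.NewCounterVec(prometheus.CounterOpts{Name: "rule_evaluation_failures_total"}, []string{"user"}),
		EvalDuration: prometheus.NewSummaryVec(prometheus.SummaryOpts{Name: "rule_evaluation_duration_seconds"}, []string{"user"}),
	}
	_ = instrumentEval(m, "1", func() error { return errors.New("boom") })
	fmt.Println("evaluation instrumented")
}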
@@ -153,6 +161,7 @@ type schedule struct {
 	dataService *tsdb.Service

 	notifier Notifier
+	metrics  *metrics.Metrics
 }

 // SchedulerCfg is the scheduler configuration.
@@ -167,6 +176,7 @@ type SchedulerCfg struct {
 	RuleStore     store.RuleStore
 	InstanceStore store.InstanceStore
 	Notifier      Notifier
+	Metrics       *metrics.Metrics
 }

 // NewScheduler returns a new schedule.
@@ -186,6 +196,7 @@ func NewScheduler(cfg SchedulerCfg, dataService *tsdb.Service) *schedule {
 		instanceStore: cfg.InstanceStore,
 		dataService:   dataService,
 		notifier:      cfg.Notifier,
+		metrics:       cfg.Metrics,
 	}
 	return &sch
 }
@@ -14,6 +14,7 @@ import (
 	"github.com/grafana/grafana/pkg/services/ngalert/eval"
 	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
 	"github.com/grafana/grafana/pkg/services/ngalert/tests"
+	"github.com/prometheus/client_golang/prometheus"

 	"github.com/grafana/grafana/pkg/services/ngalert/state"

@@ -104,6 +105,7 @@ func TestWarmStateCache(t *testing.T) {

 		RuleStore:     dbstore,
 		InstanceStore: dbstore,
+		Metrics:       metrics.NewMetrics(prometheus.NewRegistry()),
 	}
 	sched := schedule.NewScheduler(schedCfg, nil)
 	st := state.NewManager(schedCfg.Logger, nilMetrics)
@@ -151,6 +153,7 @@ func TestAlertingTicker(t *testing.T) {
 		RuleStore:     dbstore,
 		InstanceStore: dbstore,
 		Logger:        log.New("ngalert schedule test"),
+		Metrics:       metrics.NewMetrics(prometheus.NewRegistry()),
 	}
 	sched := schedule.NewScheduler(schedCfg, nil)
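Both tests build their SchedulerCfg with metrics.NewMetrics(prometheus.NewRegistry()) rather than sharing one instance; a collector can be registered only once per registry, so giving each test its own registry avoids duplicate-registration failures. A minimal demonstration of that behavior:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	g := prometheus.NewGauge(prometheus.GaugeOpts{Name: "demo_gauge"})

	r1 := prometheus.NewRegistry()
	fmt.Println(r1.Register(g)) // <nil>
	fmt.Println(r1.Register(g)) // duplicate metrics collector registration attempted

	// A fresh registry accepts the same collector again.
	r2 := prometheus.NewRegistry()
	fmt.Println(r2.Register(g)) // <nil>
}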
@@ -149,8 +149,9 @@ func (c *cache) trim() {
 		eval.Error:    0,
 	}

-	for _, org := range c.states {
-		for _, rule := range org {
+	for org, orgMap := range c.states {
+		c.metrics.GroupRules.WithLabelValues(fmt.Sprint(org)).Set(float64(len(orgMap)))
+		for _, rule := range orgMap {
 			for _, state := range rule {
 				if len(state.Results) > 100 {
 					newResults := make([]Evaluation, 100)
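Because trim already walks every org's rule map, it doubles as the collection point for the per-org rule-count gauge. One caveat worth noting: an org that disappears from c.states keeps its last reported series. The sketch below mirrors the loop with an illustrative map shape and adds a Reset() call, a hedged suggestion that this diff does not make, to clear stale series first:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	groupRules := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{Namespace: "grafana", Subsystem: "alerting", Name: "rule_group_rules", Help: "The number of rules."},
		[]string{"user"},
	)

	// states mirrors the cache shape used above: org ID -> rule key -> state.
	states := map[int64]map[string]struct{}{
		1: {"ruleA": {}, "ruleB": {}},
		2: {"ruleC": {}},
	}

	// Reset so orgs removed since the last pass do not keep stale series.
	groupRules.Reset()
	for org, orgMap := range states {
		groupRules.WithLabelValues(fmt.Sprint(org)).Set(float64(len(orgMap)))
	}
}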
@@ -13,29 +13,21 @@ var (
 	ErrNoAlertmanagerConfiguration = fmt.Errorf("could not find an Alertmanager configuration")
 )

-func getLatestAlertmanagerConfiguration(sess *sqlstore.DBSession) (*models.AlertConfiguration, error) {
-	c := &models.AlertConfiguration{}
-	// The ID is already an auto incremental column, using the ID as an order should guarantee the latest.
-	ok, err := sess.Desc("id").Limit(1).Get(c)
-	if err != nil {
-		return nil, err
-	}
-
-	if !ok {
-		return nil, ErrNoAlertmanagerConfiguration
-	}
-
-	return c, nil
-}
-
 // GetLatestAlertmanagerConfiguration returns the lastest version of the alertmanager configuration.
 // It returns ErrNoAlertmanagerConfiguration if no configuration is found.
 func (st *DBstore) GetLatestAlertmanagerConfiguration(query *models.GetLatestAlertmanagerConfigurationQuery) error {
 	return st.SQLStore.WithDbSession(context.Background(), func(sess *sqlstore.DBSession) error {
-		c, err := getLatestAlertmanagerConfiguration(sess)
+		c := &models.AlertConfiguration{}
+		// The ID is already an auto incremental column, using the ID as an order should guarantee the latest.
+		ok, err := sess.Desc("id").Limit(1).Get(c)
 		if err != nil {
 			return err
 		}
+
+		if !ok {
+			return ErrNoAlertmanagerConfiguration
+		}
+
 		query.Result = c
 		return nil
 	})
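The lookup leans on the auto-increment primary key as a proxy for insertion order: ORDER BY id DESC LIMIT 1 yields the most recently saved configuration. Below is a hedged standalone sketch of the same query against a plain xorm session (grafana's DBSession wraps one); the AlertConfiguration stand-in type and its fields are illustrative, not the full model:

package store

import (
	"errors"

	"xorm.io/xorm"
)

// AlertConfiguration is an illustrative stand-in for the model in the diff.
type AlertConfiguration struct {
	ID                        int64 `xorm:"pk autoincr 'id'"`
	AlertmanagerConfiguration string
	Default                   bool
}

// latestConfiguration fetches the newest row by ordering on the
// auto-increment ID, i.e. SELECT ... ORDER BY id DESC LIMIT 1.
func latestConfiguration(sess *xorm.Session) (*AlertConfiguration, error) {
	c := &AlertConfiguration{}
	ok, err := sess.Desc("id").Limit(1).Get(c)
	if err != nil {
		return nil, err
	}
	if !ok {
		return nil, errors.New("no alertmanager configuration present")
	}
	return c, nil
}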
@@ -27,5 +27,5 @@ type DBstore struct {
 	BaseInterval time.Duration
 	// default alert definiiton interval
 	DefaultIntervalSeconds int64
-	SQLStore *sqlstore.SQLStore `inject:""`
+	SQLStore *sqlstore.SQLStore
 }