grafana/pkg/services/ngalert/metrics/scheduler.go
Steve Simpson ad7f804255
Alerting: Fix evaluation metrics to not count retries (#85873)
* Change evaluation metrics to only count once per eval, and add new metrics.

* Cosmetic: Move eval total Inc() to orginal place.
2024-04-12 16:20:46 +02:00

188 lines
6.4 KiB
Go

package metrics
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/grafana/grafana/pkg/util/ticker"
)
const (
AlertRuleActiveLabelValue = "active"
AlertRulePausedLabelValue = "paused"
)
type Scheduler struct {
Registerer prometheus.Registerer
BehindSeconds prometheus.Gauge
EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec
EvalDuration *prometheus.HistogramVec
EvalAttemptTotal *prometheus.CounterVec
EvalAttemptFailures *prometheus.CounterVec
ProcessDuration *prometheus.HistogramVec
SendDuration *prometheus.HistogramVec
SimpleNotificationRules *prometheus.GaugeVec
GroupRules *prometheus.GaugeVec
Groups *prometheus.GaugeVec
SchedulePeriodicDuration prometheus.Histogram
SchedulableAlertRules prometheus.Gauge
SchedulableAlertRulesHash prometheus.Gauge
UpdateSchedulableAlertRulesDuration prometheus.Histogram
Ticker *ticker.Metrics
EvaluationMissed *prometheus.CounterVec
}
func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler {
return &Scheduler{
Registerer: r,
BehindSeconds: promauto.With(r).NewGauge(prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "scheduler_behind_seconds",
Help: "The total number of seconds the scheduler is behind.",
}),
// TODO: once rule groups support multiple rules, consider partitioning
// on rule group as well as tenant, similar to loki|cortex.
EvalTotal: promauto.With(r).NewCounterVec(
prometheus.CounterOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "rule_evaluations_total",
Help: "The total number of rule evaluations.",
},
[]string{"org"},
),
// TODO: once rule groups support multiple rules, consider partitioning
// on rule group as well as tenant, similar to loki|cortex.
EvalFailures: promauto.With(r).NewCounterVec(
prometheus.CounterOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "rule_evaluation_failures_total",
Help: "The total number of rule evaluation failures.",
},
[]string{"org"},
),
EvalDuration: promauto.With(r).NewHistogramVec(
prometheus.HistogramOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "rule_evaluation_duration_seconds",
Help: "The time to evaluate a rule.",
Buckets: []float64{.01, .1, .5, 1, 5, 10, 15, 30, 60, 120, 180, 240, 300},
},
[]string{"org"},
),
EvalAttemptTotal: promauto.With(r).NewCounterVec(
prometheus.CounterOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "rule_evaluation_attempts_total",
Help: "The total number of rule evaluation attempts.",
},
[]string{"org"},
),
EvalAttemptFailures: promauto.With(r).NewCounterVec(
prometheus.CounterOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "rule_evaluation_attempt_failures_total",
Help: "The total number of rule evaluation attempt failures.",
},
[]string{"org"},
),
ProcessDuration: promauto.With(r).NewHistogramVec(
prometheus.HistogramOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "rule_process_evaluation_duration_seconds",
Help: "The time to process the evaluation results for a rule.",
Buckets: []float64{.01, .1, .5, 1, 5, 10, 15, 30, 60, 120, 180, 240, 300},
},
[]string{"org"},
),
SendDuration: promauto.With(r).NewHistogramVec(
prometheus.HistogramOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "rule_send_alerts_duration_seconds",
Help: "The time to send the alerts to Alertmanager.",
Buckets: []float64{.01, .1, .5, 1, 5, 10, 15, 30, 60, 120, 180, 240, 300},
},
[]string{"org"},
),
SimpleNotificationRules: promauto.With(r).NewGaugeVec(
prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "simple_routing_rules",
Help: "The number of alert rules using simplified routing.",
},
[]string{"org"},
),
// TODO: partition on rule group as well as tenant, similar to loki|cortex.
GroupRules: promauto.With(r).NewGaugeVec(
prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "rule_group_rules",
Help: "The number of alert rules that are scheduled, both active and paused.",
},
[]string{"org", "state"},
),
Groups: promauto.With(r).NewGaugeVec(
prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "rule_groups",
Help: "The number of alert rule groups",
},
[]string{"org"},
),
SchedulePeriodicDuration: promauto.With(r).NewHistogram(
prometheus.HistogramOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "schedule_periodic_duration_seconds",
Help: "The time taken to run the scheduler.",
Buckets: []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
},
),
SchedulableAlertRules: promauto.With(r).NewGauge(
prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "schedule_alert_rules",
Help: "The number of alert rules that could be considered for evaluation at the next tick.",
},
),
SchedulableAlertRulesHash: promauto.With(r).NewGauge(
prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "schedule_alert_rules_hash",
Help: "A hash of the alert rules that could be considered for evaluation at the next tick.",
}),
UpdateSchedulableAlertRulesDuration: promauto.With(r).NewHistogram(
prometheus.HistogramOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "schedule_query_alert_rules_duration_seconds",
Help: "The time taken to fetch alert rules from the database.",
Buckets: []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
},
),
Ticker: ticker.NewMetrics(r, "alerting"),
EvaluationMissed: promauto.With(r).NewCounterVec(
prometheus.CounterOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "schedule_rule_evaluations_missed_total",
Help: "The total number of rule evaluations missed due to a slow rule evaluation.",
},
[]string{"org", "name"},
),
}
}