diff --git a/pkg/services/ngalert/metrics/scheduler.go b/pkg/services/ngalert/metrics/scheduler.go index d3f045a4640..172fda5902b 100644 --- a/pkg/services/ngalert/metrics/scheduler.go +++ b/pkg/services/ngalert/metrics/scheduler.go @@ -21,6 +21,7 @@ type Scheduler struct { ProcessDuration *prometheus.HistogramVec SendDuration *prometheus.HistogramVec GroupRules *prometheus.GaugeVec + Groups *prometheus.GaugeVec SchedulePeriodicDuration prometheus.Histogram SchedulableAlertRules prometheus.Gauge SchedulableAlertRulesHash prometheus.Gauge @@ -100,6 +101,15 @@ func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler { }, []string{"org", "state"}, ), + Groups: promauto.With(r).NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: Subsystem, + Name: "rule_groups", + Help: "The number of alert rule groups", + }, + []string{"org"}, + ), SchedulePeriodicDuration: promauto.With(r).NewHistogram( prometheus.HistogramOpts{ Namespace: Namespace, diff --git a/pkg/services/ngalert/schedule/schedule.go b/pkg/services/ngalert/schedule/schedule.go index f36ba8158c2..0104d3bcdad 100644 --- a/pkg/services/ngalert/schedule/schedule.go +++ b/pkg/services/ngalert/schedule/schedule.go @@ -204,21 +204,34 @@ type readyToRunItem struct { } func (sch *schedule) updateRulesMetrics(alertRules []*ngmodels.AlertRule) { - orgs := make(map[int64]int64, len(alertRules)) - orgsPaused := make(map[int64]int64, len(alertRules)) + rulesPerOrg := make(map[int64]int64) // orgID -> count + orgsPaused := make(map[int64]int64) // orgID -> count + groupsPerOrg := make(map[int64]map[string]struct{}) // orgID -> set of groups for _, rule := range alertRules { - orgs[rule.OrgID]++ + rulesPerOrg[rule.OrgID]++ + if rule.IsPaused { orgsPaused[rule.OrgID]++ } + + orgGroups, ok := groupsPerOrg[rule.OrgID] + if !ok { + orgGroups = make(map[string]struct{}) + groupsPerOrg[rule.OrgID] = orgGroups + } + orgGroups[rule.RuleGroup] = struct{}{} } - for orgID, numRules := range orgs { + for orgID, numRules := range rulesPerOrg { numRulesPaused := orgsPaused[orgID] sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(orgID), metrics.AlertRuleActiveLabelValue).Set(float64(numRules - numRulesPaused)) sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(orgID), metrics.AlertRulePausedLabelValue).Set(float64(numRulesPaused)) } + for orgID, groups := range groupsPerOrg { + sch.metrics.Groups.WithLabelValues(fmt.Sprint(orgID)).Set(float64(len(groups))) + } + // While these are the rules that we iterate over, at the moment there's no 100% guarantee that they'll be // scheduled as rules could be removed before we get a chance to evaluate them. sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))