Alerting: Add metric counting rule groups per org (#80669)

* Refactor, fix bad map hint

* Count groups per org
This commit is contained in:
Alexander Weaver 2024-01-16 16:35:56 -06:00 committed by GitHub
parent 6b37a887d5
commit 3c796ecc8f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 27 additions and 4 deletions

View File

@ -21,6 +21,7 @@ type Scheduler struct {
ProcessDuration *prometheus.HistogramVec
SendDuration *prometheus.HistogramVec
GroupRules *prometheus.GaugeVec
Groups *prometheus.GaugeVec
SchedulePeriodicDuration prometheus.Histogram
SchedulableAlertRules prometheus.Gauge
SchedulableAlertRulesHash prometheus.Gauge
@ -100,6 +101,15 @@ func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler {
},
[]string{"org", "state"},
),
Groups: promauto.With(r).NewGaugeVec(
prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "rule_groups",
Help: "The number of alert rule groups",
},
[]string{"org"},
),
SchedulePeriodicDuration: promauto.With(r).NewHistogram(
prometheus.HistogramOpts{
Namespace: Namespace,

View File

@ -204,21 +204,34 @@ type readyToRunItem struct {
}
func (sch *schedule) updateRulesMetrics(alertRules []*ngmodels.AlertRule) {
orgs := make(map[int64]int64, len(alertRules))
orgsPaused := make(map[int64]int64, len(alertRules))
rulesPerOrg := make(map[int64]int64) // orgID -> count
orgsPaused := make(map[int64]int64) // orgID -> count
groupsPerOrg := make(map[int64]map[string]struct{}) // orgID -> set of groups
for _, rule := range alertRules {
orgs[rule.OrgID]++
rulesPerOrg[rule.OrgID]++
if rule.IsPaused {
orgsPaused[rule.OrgID]++
}
orgGroups, ok := groupsPerOrg[rule.OrgID]
if !ok {
orgGroups = make(map[string]struct{})
groupsPerOrg[rule.OrgID] = orgGroups
}
orgGroups[rule.RuleGroup] = struct{}{}
}
for orgID, numRules := range orgs {
for orgID, numRules := range rulesPerOrg {
numRulesPaused := orgsPaused[orgID]
sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(orgID), metrics.AlertRuleActiveLabelValue).Set(float64(numRules - numRulesPaused))
sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(orgID), metrics.AlertRulePausedLabelValue).Set(float64(numRulesPaused))
}
for orgID, groups := range groupsPerOrg {
sch.metrics.Groups.WithLabelValues(fmt.Sprint(orgID)).Set(float64(len(groups)))
}
// While these are the rules that we iterate over, at the moment there's no 100% guarantee that they'll be
// scheduled, because rules could be removed before we get a chance to evaluate them.
sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))