Alerting: Move rule_group_rules metric from State to Scheduler. (#63144)

The `rule_group_rules` metric is currently defined and computed by `State`.
It makes more sense to compute this metric from the configured rule set
rather than from rule evaluation state: a rule that has not produced any
state yet would otherwise go uncounted.

Additionally, we would like this metric (and others) to carry a `rule_group`
label, which is much easier to achieve if the metric is produced from the
`Scheduler` package.
Steve Simpson, 2023-02-09 17:05:19 +01:00, committed by GitHub
parent 90f8959d3c
commit 4d1a2c3370
5 changed files with 64 additions and 22 deletions
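
To make the change concrete before the diffs: the new approach counts every configured rule per org, so a rule that has not evaluated yet is still reported. A minimal, self-contained sketch of that counting (the `AlertRule` type here is a simplified stand-in invented for illustration):

```go
package main

import "fmt"

// AlertRule is a simplified stand-in for ngmodels.AlertRule.
type AlertRule struct {
	OrgID int64
	UID   string
}

// countRulesPerOrg mirrors what updateRulesMetrics does below: it tallies
// configured rules by org, so every rule is counted whether or not it has
// produced any evaluation state yet.
func countRulesPerOrg(rules []*AlertRule) map[int64]int64 {
	counts := make(map[int64]int64)
	for _, r := range rules {
		counts[r.OrgID]++
	}
	return counts
}

func main() {
	rules := []*AlertRule{
		{OrgID: 1, UID: "a"}, // already evaluated at least once
		{OrgID: 1, UID: "b"}, // just created, no state yet: still counted
		{OrgID: 2, UID: "c"},
	}
	for org, n := range countRulesPerOrg(rules) {
		fmt.Printf("org=%d rules=%d\n", org, n)
	}
}
```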

@@ -13,6 +13,7 @@ type Scheduler struct {
 	EvalTotal                 *prometheus.CounterVec
 	EvalFailures              *prometheus.CounterVec
 	EvalDuration              *prometheus.HistogramVec
+	GroupRules                *prometheus.GaugeVec
 	SchedulePeriodicDuration  prometheus.Histogram
 	SchedulableAlertRules     prometheus.Gauge
 	SchedulableAlertRulesHash prometheus.Gauge
@@ -62,6 +63,16 @@ func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler {
 			},
 			[]string{"org"},
 		),
+		// TODO: partition on rule group as well as tenant, similar to loki|cortex.
+		GroupRules: promauto.With(r).NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace: Namespace,
+				Subsystem: Subsystem,
+				Name:      "rule_group_rules",
+				Help:      "The number of rules.",
+			},
+			[]string{"org"},
+		),
 		SchedulePeriodicDuration: promauto.With(r).NewHistogram(
 			prometheus.HistogramOpts{
 				Namespace: Namespace,
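
The TODO in this hunk anticipates partitioning per rule group as well as per tenant. A hypothetical sketch of what the gauge definition could look like with that extra label (`newGroupRulesVec` and its parameters are invented here; this is not part of the commit):

```go
package metrics

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// newGroupRulesVec sketches the follow-up the TODO describes: the same
// gauge, additionally partitioned by rule group.
func newGroupRulesVec(r prometheus.Registerer, namespace, subsystem string) *prometheus.GaugeVec {
	return promauto.With(r).NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "rule_group_rules",
			Help:      "The number of rules.",
		},
		[]string{"org", "rule_group"}, // per-tenant and per-group
	)
}
```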

@@ -6,23 +6,11 @@ import (
 )

 type State struct {
-	GroupRules *prometheus.GaugeVec
 	AlertState *prometheus.GaugeVec
 }

 func NewStateMetrics(r prometheus.Registerer) *State {
 	return &State{
-		// TODO: once rule groups support multiple rules, consider partitioning
-		// on rule group as well as tenant, similar to loki|cortex.
-		GroupRules: promauto.With(r).NewGaugeVec(
-			prometheus.GaugeOpts{
-				Namespace: Namespace,
-				Subsystem: Subsystem,
-				Name:      "rule_group_rules",
-				Help:      "The number of rules.",
-			},
-			[]string{"org"},
-		),
 		AlertState: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{
 			Namespace: Namespace,
 			Subsystem: Subsystem,
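
Removing the definition from `State` is not just cleanup: both metric structs appear to be built from the same `Registerer` (see `metrics.NewNGAlert(reg)` in the test below), and client_golang rejects a second collector with the same fully-qualified name. A standalone demonstration of that behavior (not Grafana code):

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	reg := prometheus.NewRegistry()
	opts := prometheus.GaugeOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "rule_group_rules",
		Help:      "The number of rules.",
	}

	// Registering the same fully-qualified name twice fails; promauto's
	// MustRegister-style helpers would panic instead of returning an error.
	fmt.Println(reg.Register(prometheus.NewGaugeVec(opts, []string{"org"}))) // <nil>
	fmt.Println(reg.Register(prometheus.NewGaugeVec(opts, []string{"org"}))) // duplicate registration error
}
```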

@@ -177,8 +177,7 @@ func (sch *schedule) DeleteAlertRule(keys ...ngmodels.AlertRuleKey) {
 	}
 	// Our best bet at this point is that we update the metrics with what we hope to schedule in the next tick.
 	alertRules, _ := sch.schedulableAlertRules.all()
-	sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))
-	sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(alertRules)))
+	sch.updateRulesMetrics(alertRules)
 }

 func (sch *schedule) schedulePeriodic(ctx context.Context, t *ticker.T) error {
@@ -209,6 +208,21 @@ type readyToRunItem struct {
 	evaluation
 }

+func (sch *schedule) updateRulesMetrics(alertRules []*ngmodels.AlertRule) {
+	orgs := make(map[int64]int64)
+	for _, rule := range alertRules {
+		orgs[rule.OrgID]++
+	}
+	for org, numRules := range orgs {
+		sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(org)).Set(float64(numRules))
+	}
+
+	// While these are the rules that we iterate over, at the moment there's no 100% guarantee that they'll be
+	// scheduled as rules could be removed before we get a chance to evaluate them.
+	sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))
+	sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(alertRules)))
+}
+
 func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.Group, tick time.Time) ([]readyToRunItem, map[ngmodels.AlertRuleKey]struct{}) {
 	tickNum := tick.Unix() / int64(sch.baseInterval.Seconds())
@@ -223,10 +237,7 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
 	// so, at the end, the remaining registered alert rules are the deleted ones
 	registeredDefinitions := sch.registry.keyMap()

-	// While these are the rules that we iterate over, at the moment there's no 100% guarantee that they'll be
-	// scheduled as rules could be removed before we get a chance to evaluate them.
-	sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))
-	sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(alertRules)))
+	sch.updateRulesMetrics(alertRules)

 	readyToRun := make([]readyToRunItem, 0)
 	missingFolder := make(map[string][]string)
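
One property of the Set-based update in updateRulesMetrics, as visible in the hunks above: series are only written for orgs that currently have rules, so an org whose last rule was deleted would keep exporting its previous count. A hypothetical mitigation (not in this commit) is to reset the vector before repopulating it:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// updateGroupRules is a hypothetical variant of updateRulesMetrics:
// Reset drops every previously written per-org series first, so orgs
// with zero remaining rules stop reporting a stale value.
func updateGroupRules(g *prometheus.GaugeVec, rulesPerOrg map[int64]int64) {
	g.Reset()
	for org, n := range rulesPerOrg {
		g.WithLabelValues(fmt.Sprint(org)).Set(float64(n))
	}
}

func main() {
	g := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "rule_group_rules",
		Help:      "The number of rules.",
	}, []string{"org"})

	updateGroupRules(g, map[int64]int64{1: 2, 2: 1})
	updateGroupRules(g, map[int64]int64{1: 2}) // org 2's series is removed rather than left stale
	fmt.Println("org 2 no longer exported")
}
```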

@@ -40,7 +40,8 @@ type evalAppliedInfo struct {

 func TestProcessTicks(t *testing.T) {
 	testTracer := tracing.InitializeTracerForTest()
-	testMetrics := metrics.NewNGAlert(prometheus.NewPedanticRegistry())
+	reg := prometheus.NewPedanticRegistry()
+	testMetrics := metrics.NewNGAlert(reg)

 	ctx := context.Background()
 	dispatcherGroup, ctx := errgroup.WithContext(ctx)
@@ -113,6 +114,17 @@ func TestProcessTicks(t *testing.T) {
 		assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
 	})

+	t.Run("after 1st tick rule metrics should report one rule", func(t *testing.T) {
+		expectedMetric := fmt.Sprintf(
+			`# HELP grafana_alerting_rule_group_rules The number of rules.
+# TYPE grafana_alerting_rule_group_rules gauge
+grafana_alerting_rule_group_rules{org="%[1]d"} 1
+`, alertRule1.OrgID)
+
+		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
+		require.NoError(t, err)
+	})
+
 	// add alert rule under main org with three base intervals
 	alertRule2 := models.AlertRuleGen(models.WithOrgID(mainOrgID), models.WithInterval(3*cfg.BaseInterval), models.WithTitle("rule-2"))()
 	ruleStore.PutRule(ctx, alertRule2)
@@ -128,6 +140,17 @@ func TestProcessTicks(t *testing.T) {
 		assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
 	})

+	t.Run("after 2nd tick rule metrics should report two rules", func(t *testing.T) {
+		expectedMetric := fmt.Sprintf(
+			`# HELP grafana_alerting_rule_group_rules The number of rules.
+# TYPE grafana_alerting_rule_group_rules gauge
+grafana_alerting_rule_group_rules{org="%[1]d"} 2
+`, alertRule1.OrgID)
+
+		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
+		require.NoError(t, err)
+	})
+
 	t.Run("on 3rd tick two alert rules should be evaluated", func(t *testing.T) {
 		tick = tick.Add(cfg.BaseInterval)
 		scheduled, stopped := sched.processTick(ctx, dispatcherGroup, tick)
@@ -172,6 +195,17 @@ func TestProcessTicks(t *testing.T) {
 		assertStopRun(t, stopAppliedCh, alertRule1.GetKey())
 	})

+	t.Run("after 5th tick rule metrics should report one rule", func(t *testing.T) {
+		expectedMetric := fmt.Sprintf(
+			`# HELP grafana_alerting_rule_group_rules The number of rules.
+# TYPE grafana_alerting_rule_group_rules gauge
+grafana_alerting_rule_group_rules{org="%[1]d"} 1
+`, alertRule1.OrgID)
+
+		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
+		require.NoError(t, err)
+	})
+
 	t.Run("on 6th tick one alert rule should be evaluated", func(t *testing.T) {
 		tick = tick.Add(cfg.BaseInterval)
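
The assertions added above follow a standard client_golang pattern: render the expected text exposition and diff it against a scrape of the registry, filtered to one metric family. A self-contained version of the same check (names simplified):

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	reg := prometheus.NewPedanticRegistry()
	g := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "rule_group_rules",
		Help:      "The number of rules.",
	}, []string{"org"})
	reg.MustRegister(g)
	g.WithLabelValues("1").Set(1)

	expected := `# HELP grafana_alerting_rule_group_rules The number of rules.
# TYPE grafana_alerting_rule_group_rules gauge
grafana_alerting_rule_group_rules{org="1"} 1
`
	// GatherAndCompare scrapes reg and compares the named metric family
	// against the expected exposition-format text.
	err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expected), "grafana_alerting_rule_group_rules")
	fmt.Println("metrics match:", err == nil)
}
```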

@@ -2,7 +2,6 @@ package state

 import (
 	"context"
-	"fmt"
 	"math"
 	"net/url"
 	"strings"
@@ -282,8 +281,7 @@ func (c *cache) recordMetrics(metrics *metrics.State) {
 		eval.Error: 0,
 	}

-	for org, orgMap := range c.states {
-		metrics.GroupRules.WithLabelValues(fmt.Sprint(org)).Set(float64(len(orgMap)))
+	for _, orgMap := range c.states {
 		for _, rule := range orgMap {
 			for _, state := range rule.states {
 				n := ct[state.State]