mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Move rule_group_rules
metric from State to Scheduler. (#63144)
The `rule_group_rules` metric is currently defined and computed by `State`. It makes more sense for this metric to be computed from the configured rule set rather than from the rule evaluation state. There could be an edge case where a rule does not have a state yet and so is not counted. Additionally, we would like this metric (and others) to have a `rule_group` label, and this is much easier to achieve if the metric is produced by the `Scheduler` package.
This commit is contained in:
parent
90f8959d3c
commit
4d1a2c3370
@ -13,6 +13,7 @@ type Scheduler struct {
|
||||
EvalTotal *prometheus.CounterVec
|
||||
EvalFailures *prometheus.CounterVec
|
||||
EvalDuration *prometheus.HistogramVec
|
||||
GroupRules *prometheus.GaugeVec
|
||||
SchedulePeriodicDuration prometheus.Histogram
|
||||
SchedulableAlertRules prometheus.Gauge
|
||||
SchedulableAlertRulesHash prometheus.Gauge
|
||||
@ -62,6 +63,16 @@ func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler {
|
||||
},
|
||||
[]string{"org"},
|
||||
),
|
||||
// TODO: partition on rule group as well as tenant, similar to loki|cortex.
|
||||
GroupRules: promauto.With(r).NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "rule_group_rules",
|
||||
Help: "The number of rules.",
|
||||
},
|
||||
[]string{"org"},
|
||||
),
|
||||
SchedulePeriodicDuration: promauto.With(r).NewHistogram(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: Namespace,
|
||||
|
@ -6,23 +6,11 @@ import (
|
||||
)
|
||||
|
||||
type State struct {
|
||||
GroupRules *prometheus.GaugeVec
|
||||
AlertState *prometheus.GaugeVec
|
||||
}
|
||||
|
||||
func NewStateMetrics(r prometheus.Registerer) *State {
|
||||
return &State{
|
||||
// TODO: once rule groups support multiple rules, consider partitioning
|
||||
// on rule group as well as tenant, similar to loki|cortex.
|
||||
GroupRules: promauto.With(r).NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "rule_group_rules",
|
||||
Help: "The number of rules.",
|
||||
},
|
||||
[]string{"org"},
|
||||
),
|
||||
AlertState: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
|
@ -177,8 +177,7 @@ func (sch *schedule) DeleteAlertRule(keys ...ngmodels.AlertRuleKey) {
|
||||
}
|
||||
// Our best bet at this point is that we update the metrics with what we hope to schedule in the next tick.
|
||||
alertRules, _ := sch.schedulableAlertRules.all()
|
||||
sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))
|
||||
sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(alertRules)))
|
||||
sch.updateRulesMetrics(alertRules)
|
||||
}
|
||||
|
||||
func (sch *schedule) schedulePeriodic(ctx context.Context, t *ticker.T) error {
|
||||
@ -209,6 +208,21 @@ type readyToRunItem struct {
|
||||
evaluation
|
||||
}
|
||||
|
||||
func (sch *schedule) updateRulesMetrics(alertRules []*ngmodels.AlertRule) {
|
||||
orgs := make(map[int64]int64)
|
||||
for _, rule := range alertRules {
|
||||
orgs[rule.OrgID]++
|
||||
}
|
||||
for org, numRules := range orgs {
|
||||
sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(org)).Set(float64(numRules))
|
||||
}
|
||||
|
||||
// While these are the rules that we iterate over, at the moment there's no 100% guarantee that they'll be
|
||||
// scheduled as rules could be removed before we get a chance to evaluate them.
|
||||
sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))
|
||||
sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(alertRules)))
|
||||
}
|
||||
|
||||
func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.Group, tick time.Time) ([]readyToRunItem, map[ngmodels.AlertRuleKey]struct{}) {
|
||||
tickNum := tick.Unix() / int64(sch.baseInterval.Seconds())
|
||||
|
||||
@ -223,10 +237,7 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
|
||||
// so, at the end, the remaining registered alert rules are the deleted ones
|
||||
registeredDefinitions := sch.registry.keyMap()
|
||||
|
||||
// While these are the rules that we iterate over, at the moment there's no 100% guarantee that they'll be
|
||||
// scheduled as rules could be removed before we get a chance to evaluate them.
|
||||
sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))
|
||||
sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(alertRules)))
|
||||
sch.updateRulesMetrics(alertRules)
|
||||
|
||||
readyToRun := make([]readyToRunItem, 0)
|
||||
missingFolder := make(map[string][]string)
|
||||
|
@ -40,7 +40,8 @@ type evalAppliedInfo struct {
|
||||
|
||||
func TestProcessTicks(t *testing.T) {
|
||||
testTracer := tracing.InitializeTracerForTest()
|
||||
testMetrics := metrics.NewNGAlert(prometheus.NewPedanticRegistry())
|
||||
reg := prometheus.NewPedanticRegistry()
|
||||
testMetrics := metrics.NewNGAlert(reg)
|
||||
ctx := context.Background()
|
||||
dispatcherGroup, ctx := errgroup.WithContext(ctx)
|
||||
|
||||
@ -113,6 +114,17 @@ func TestProcessTicks(t *testing.T) {
|
||||
assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
|
||||
})
|
||||
|
||||
t.Run("after 1st tick rule metrics should report one rule", func(t *testing.T) {
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_rule_group_rules The number of rules.
|
||||
# TYPE grafana_alerting_rule_group_rules gauge
|
||||
grafana_alerting_rule_group_rules{org="%[1]d"} 1
|
||||
`, alertRule1.OrgID)
|
||||
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
// add alert rule under main org with three base intervals
|
||||
alertRule2 := models.AlertRuleGen(models.WithOrgID(mainOrgID), models.WithInterval(3*cfg.BaseInterval), models.WithTitle("rule-2"))()
|
||||
ruleStore.PutRule(ctx, alertRule2)
|
||||
@ -128,6 +140,17 @@ func TestProcessTicks(t *testing.T) {
|
||||
assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
|
||||
})
|
||||
|
||||
t.Run("after 2nd tick rule metrics should report two rules", func(t *testing.T) {
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_rule_group_rules The number of rules.
|
||||
# TYPE grafana_alerting_rule_group_rules gauge
|
||||
grafana_alerting_rule_group_rules{org="%[1]d"} 2
|
||||
`, alertRule1.OrgID)
|
||||
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("on 3rd tick two alert rules should be evaluated", func(t *testing.T) {
|
||||
tick = tick.Add(cfg.BaseInterval)
|
||||
scheduled, stopped := sched.processTick(ctx, dispatcherGroup, tick)
|
||||
@ -172,6 +195,17 @@ func TestProcessTicks(t *testing.T) {
|
||||
assertStopRun(t, stopAppliedCh, alertRule1.GetKey())
|
||||
})
|
||||
|
||||
t.Run("after 5th tick rule metrics should report one rules", func(t *testing.T) {
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_rule_group_rules The number of rules.
|
||||
# TYPE grafana_alerting_rule_group_rules gauge
|
||||
grafana_alerting_rule_group_rules{org="%[1]d"} 1
|
||||
`, alertRule1.OrgID)
|
||||
|
||||
err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("on 6th tick one alert rule should be evaluated", func(t *testing.T) {
|
||||
tick = tick.Add(cfg.BaseInterval)
|
||||
|
||||
|
@ -2,7 +2,6 @@ package state
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"net/url"
|
||||
"strings"
|
||||
@ -282,8 +281,7 @@ func (c *cache) recordMetrics(metrics *metrics.State) {
|
||||
eval.Error: 0,
|
||||
}
|
||||
|
||||
for org, orgMap := range c.states {
|
||||
metrics.GroupRules.WithLabelValues(fmt.Sprint(org)).Set(float64(len(orgMap)))
|
||||
for _, orgMap := range c.states {
|
||||
for _, rule := range orgMap {
|
||||
for _, state := range rule.states {
|
||||
n := ct[state.State]
|
||||
|
Loading…
Reference in New Issue
Block a user