Alerting: Fix evaluation metrics to not count retries (#85873)
* Change evaluation metrics to only count once per eval, and add new metrics.
* Cosmetic: Move eval total Inc() to original place.
parent f1f02207f2
commit ad7f804255
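For context (not part of the patch): a minimal, self-contained Go sketch of the counting scheme the commit message describes, under the assumption of a simple retry loop. Evaluations are counted once per scheduled evaluation, attempts once per try (retries included), and an evaluation failure only when the final attempt fails. All identifiers below are illustrative, not Grafana code.

package main

import (
	"errors"
	"fmt"
)

type metrics struct {
	evaluations, attempts, failures, attemptFailures int
}

// runEvaluation retries up to maxAttempts times and updates the counters the
// way the patched scheduler does: attempts per try, evaluations only once.
func runEvaluation(m *metrics, maxAttempts int, eval func() error) {
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		if attempt == 1 {
			m.evaluations++ // counted once, regardless of retries
		}
		m.attempts++ // counted for every try
		if err := eval(); err != nil {
			m.attemptFailures++
			if attempt == maxAttempts {
				m.failures++ // only the final failed attempt counts as an evaluation failure
			}
			continue
		}
		return
	}
}

func main() {
	m := &metrics{}
	runEvaluation(m, 3, func() error { return errors.New("boom") })
	fmt.Printf("%+v\n", *m) // {evaluations:1 attempts:3 failures:1 attemptFailures:3}
}

With three failing attempts this yields evaluations=1, attempts=3, attemptFailures=3 and failures=1, which matches the expected values asserted in the test changes further below.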
@@ -18,6 +18,8 @@ type Scheduler struct {
 	EvalTotal               *prometheus.CounterVec
 	EvalFailures            *prometheus.CounterVec
 	EvalDuration            *prometheus.HistogramVec
+	EvalAttemptTotal        *prometheus.CounterVec
+	EvalAttemptFailures     *prometheus.CounterVec
 	ProcessDuration         *prometheus.HistogramVec
 	SendDuration            *prometheus.HistogramVec
 	SimpleNotificationRules *prometheus.GaugeVec
@@ -72,6 +74,24 @@ func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler {
 			},
 			[]string{"org"},
 		),
+		EvalAttemptTotal: promauto.With(r).NewCounterVec(
+			prometheus.CounterOpts{
+				Namespace: Namespace,
+				Subsystem: Subsystem,
+				Name:      "rule_evaluation_attempts_total",
+				Help:      "The total number of rule evaluation attempts.",
+			},
+			[]string{"org"},
+		),
+		EvalAttemptFailures: promauto.With(r).NewCounterVec(
+			prometheus.CounterOpts{
+				Namespace: Namespace,
+				Subsystem: Subsystem,
+				Name:      "rule_evaluation_attempt_failures_total",
+				Help:      "The total number of rule evaluation attempt failures.",
+			},
+			[]string{"org"},
+		),
 		ProcessDuration: promauto.With(r).NewHistogramVec(
 			prometheus.HistogramOpts{
 				Namespace: Namespace,
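As a side note on the registration above: promauto.With(r) binds the new CounterVecs to the scheduler's registry at construction time, and each org label value gets its own child counter. A hedged, standalone sketch (namespace, subsystem, and label values are hard-coded here purely for illustration):

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	reg := prometheus.NewRegistry()

	// promauto.With(reg) registers the vector on reg as it is created,
	// so a duplicate registration would panic at construction time.
	attempts := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "rule_evaluation_attempts_total",
		Help:      "The total number of rule evaluation attempts.",
	}, []string{"org"})

	// Each org ID gets its own child counter.
	attempts.WithLabelValues("1").Inc()
	attempts.WithLabelValues("1").Inc()
	attempts.WithLabelValues("2").Inc()

	fmt.Println(testutil.ToFloat64(attempts.WithLabelValues("1"))) // 2
	fmt.Println(testutil.ToFloat64(attempts.WithLabelValues("2"))) // 1
}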
@@ -227,10 +227,16 @@ func (a *alertRule) Run(key ngmodels.AlertRuleKey) error {
 			}
 
 			func() {
+				orgID := fmt.Sprint(key.OrgID)
+				evalDuration := a.metrics.EvalDuration.WithLabelValues(orgID)
+				evalTotal := a.metrics.EvalTotal.WithLabelValues(orgID)
+
 				evalRunning = true
+				evalStart := a.clock.Now()
 				defer func() {
 					evalRunning = false
 					a.evalApplied(key, ctx.scheduledAt)
+					evalDuration.Observe(a.clock.Now().Sub(evalStart).Seconds())
 				}()
 
 				for attempt := int64(1); attempt <= a.maxAttempts; attempt++ {
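Not part of the patch, but a sketch of the duration-observation pattern introduced above: the histogram is observed in a defer that wraps the whole evaluation, so it records one sample per scheduled evaluation regardless of how many retry attempts happen inside. Grafana injects a mockable clock; time.Now is used here only to keep the example self-contained.

package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// observeOnce mirrors the pattern added to Run above: the duration histogram
// records exactly one sample per scheduled evaluation, no matter how many
// retries happen inside work().
func observeOnce(hist prometheus.Observer, work func()) {
	start := time.Now()
	defer func() {
		hist.Observe(time.Since(start).Seconds())
	}()
	work()
}

func main() {
	hist := prometheus.NewHistogram(prometheus.HistogramOpts{
		Name: "example_evaluation_duration_seconds",
		Help: "Illustrative histogram, not a real Grafana metric.",
	})
	observeOnce(hist, func() { time.Sleep(10 * time.Millisecond) })
	fmt.Println("observed one sample")
}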
@@ -255,6 +261,11 @@ func (a *alertRule) Run(key ngmodels.AlertRuleKey) error {
 					return
 				}
 
+				// Only increment evaluation counter once, not per-retry.
+				if attempt == 1 {
+					evalTotal.Inc()
+				}
+
 				fpStr := currentFingerprint.String()
 				utcTick := ctx.scheduledAt.UTC().Format(time.RFC3339Nano)
 				tracingCtx, span := a.tracer.Start(grafanaCtx, "alert rule execution", trace.WithAttributes(
@@ -312,8 +323,8 @@ func (a *alertRule) Run(key ngmodels.AlertRuleKey) error {
 
 func (a *alertRule) evaluate(ctx context.Context, key ngmodels.AlertRuleKey, f fingerprint, attempt int64, e *Evaluation, span trace.Span, retry bool) error {
 	orgID := fmt.Sprint(key.OrgID)
-	evalTotal := a.metrics.EvalTotal.WithLabelValues(orgID)
-	evalDuration := a.metrics.EvalDuration.WithLabelValues(orgID)
+	evalAttemptTotal := a.metrics.EvalAttemptTotal.WithLabelValues(orgID)
+	evalAttemptFailures := a.metrics.EvalAttemptFailures.WithLabelValues(orgID)
 	evalTotalFailures := a.metrics.EvalFailures.WithLabelValues(orgID)
 	processDuration := a.metrics.ProcessDuration.WithLabelValues(orgID)
 	sendDuration := a.metrics.SendDuration.WithLabelValues(orgID)
@@ -336,8 +347,7 @@ func (a *alertRule) evaluate(ctx context.Context, key ngmodels.AlertRuleKey, f fingerprint, attempt int64, e *Evaluation, span trace.Span, retry bool) error {
 		}
 	}
 
-	evalTotal.Inc()
-	evalDuration.Observe(dur.Seconds())
+	evalAttemptTotal.Inc()
 
 	if ctx.Err() != nil { // check if the context is not cancelled. The evaluation can be a long-running task.
 		span.SetStatus(codes.Error, "rule evaluation cancelled")
@@ -346,7 +356,7 @@ func (a *alertRule) evaluate(ctx context.Context, key ngmodels.AlertRuleKey, f fingerprint, attempt int64, e *Evaluation, span trace.Span, retry bool) error {
 	}
 
 	if err != nil || results.HasErrors() {
-		evalTotalFailures.Inc()
+		evalAttemptFailures.Inc()
 
 		// Only retry (return errors) if this isn't the last attempt, otherwise skip these return operations.
 		if retry {
@@ -364,6 +374,9 @@ func (a *alertRule) evaluate(ctx context.Context, key ngmodels.AlertRuleKey, f fingerprint, attempt int64, e *Evaluation, span trace.Span, retry bool) error {
 				span.RecordError(err)
 				return fmt.Errorf("the result-set has errors that can be retried: %w", results.Error())
 			}
+		} else {
+			// Only count the final attempt as a failure.
+			evalTotalFailures.Inc()
 		}
 
 		// If results is nil, we assume that the error must be from the SSE pipeline (ruleEval.Evaluate) which is the only code that can actually return an `err`.
@@ -369,6 +369,13 @@ func TestRuleRoutine(t *testing.T) {
 # HELP grafana_alerting_rule_evaluations_total The total number of rule evaluations.
 # TYPE grafana_alerting_rule_evaluations_total counter
 grafana_alerting_rule_evaluations_total{org="%[1]d"} 1
+# HELP grafana_alerting_rule_evaluation_attempt_failures_total The total number of rule evaluation attempt failures.
+# TYPE grafana_alerting_rule_evaluation_attempt_failures_total counter
+grafana_alerting_rule_evaluation_attempt_failures_total{org="%[1]d"} 0
+# HELP grafana_alerting_rule_evaluation_attempts_total The total number of rule evaluation attempts.
+# TYPE grafana_alerting_rule_evaluation_attempts_total counter
+grafana_alerting_rule_evaluation_attempts_total{org="%[1]d"} 1
+
 # HELP grafana_alerting_rule_process_evaluation_duration_seconds The time to process the evaluation results for a rule.
 # TYPE grafana_alerting_rule_process_evaluation_duration_seconds histogram
 grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
@@ -407,7 +414,14 @@ func TestRuleRoutine(t *testing.T) {
 grafana_alerting_rule_send_alerts_duration_seconds_count{org="%[1]d"} 1
 `, rule.OrgID)
 
-		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_evaluation_duration_seconds", "grafana_alerting_rule_evaluations_total", "grafana_alerting_rule_evaluation_failures_total", "grafana_alerting_rule_process_evaluation_duration_seconds", "grafana_alerting_rule_send_alerts_duration_seconds")
+		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric),
+			"grafana_alerting_rule_evaluation_duration_seconds",
+			"grafana_alerting_rule_evaluations_total",
+			"grafana_alerting_rule_evaluation_failures_total",
+			"grafana_alerting_rule_evaluation_attempts_total",
+			"grafana_alerting_rule_evaluation_attempt_failures_total",
+			"grafana_alerting_rule_process_evaluation_duration_seconds",
+			"grafana_alerting_rule_send_alerts_duration_seconds")
 		require.NoError(t, err)
 		})
 	})
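For readers unfamiliar with the test helper used here: testutil.GatherAndCompare gathers from the given registry and compares only the named metric families against expected text in the Prometheus exposition format, so unrelated metrics on the registry do not break the assertion. A hedged, minimal sketch with an invented metric name:

package main

import (
	"bytes"
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	reg := prometheus.NewRegistry()
	evals := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
		Name: "example_rule_evaluations_total",
		Help: "The total number of rule evaluations.",
	}, []string{"org"})
	evals.WithLabelValues("1").Inc()

	expected := `# HELP example_rule_evaluations_total The total number of rule evaluations.
# TYPE example_rule_evaluations_total counter
example_rule_evaluations_total{org="1"} 1
`
	// Only the named families are compared against the expected text.
	err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expected), "example_rule_evaluations_total")
	fmt.Println(err) // <nil>
}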
@@ -570,33 +584,39 @@ func TestRuleRoutine(t *testing.T) {
 
 		waitForTimeChannel(t, evalAppliedChan)
 
-		t.Run("it should increase failure counter", func(t *testing.T) {
+		t.Run("it should increase failure counter by 1 and attempt failure counter by 3", func(t *testing.T) {
 			// duration metric has 0 values because of mocked clock that do not advance
 			expectedMetric := fmt.Sprintf(
 				`# HELP grafana_alerting_rule_evaluation_duration_seconds The time to evaluate a rule.
 # TYPE grafana_alerting_rule_evaluation_duration_seconds histogram
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 3
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 3
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 3
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 3
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 3
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 3
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="15"} 3
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="30"} 3
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="60"} 3
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="120"} 3
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="180"} 3
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="240"} 3
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="300"} 3
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 3
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="15"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="30"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="60"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="120"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="180"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="240"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="300"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
 grafana_alerting_rule_evaluation_duration_seconds_sum{org="%[1]d"} 0
-grafana_alerting_rule_evaluation_duration_seconds_count{org="%[1]d"} 3
+grafana_alerting_rule_evaluation_duration_seconds_count{org="%[1]d"} 1
 # HELP grafana_alerting_rule_evaluation_failures_total The total number of rule evaluation failures.
 # TYPE grafana_alerting_rule_evaluation_failures_total counter
-grafana_alerting_rule_evaluation_failures_total{org="%[1]d"} 3
+grafana_alerting_rule_evaluation_failures_total{org="%[1]d"} 1
 # HELP grafana_alerting_rule_evaluations_total The total number of rule evaluations.
 # TYPE grafana_alerting_rule_evaluations_total counter
-grafana_alerting_rule_evaluations_total{org="%[1]d"} 3
+grafana_alerting_rule_evaluations_total{org="%[1]d"} 1
+# HELP grafana_alerting_rule_evaluation_attempt_failures_total The total number of rule evaluation attempt failures.
+# TYPE grafana_alerting_rule_evaluation_attempt_failures_total counter
+grafana_alerting_rule_evaluation_attempt_failures_total{org="%[1]d"} 3
+# HELP grafana_alerting_rule_evaluation_attempts_total The total number of rule evaluation attempts.
+# TYPE grafana_alerting_rule_evaluation_attempts_total counter
+grafana_alerting_rule_evaluation_attempts_total{org="%[1]d"} 3
 # HELP grafana_alerting_rule_process_evaluation_duration_seconds The time to process the evaluation results for a rule.
 # TYPE grafana_alerting_rule_process_evaluation_duration_seconds histogram
 grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
@@ -635,7 +655,14 @@ func TestRuleRoutine(t *testing.T) {
 grafana_alerting_rule_send_alerts_duration_seconds_count{org="%[1]d"} 1
 `, rule.OrgID)
 
-			err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_evaluation_duration_seconds", "grafana_alerting_rule_evaluations_total", "grafana_alerting_rule_evaluation_failures_total", "grafana_alerting_rule_process_evaluation_duration_seconds", "grafana_alerting_rule_send_alerts_duration_seconds")
+			err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric),
+				"grafana_alerting_rule_evaluation_duration_seconds",
+				"grafana_alerting_rule_evaluations_total",
+				"grafana_alerting_rule_evaluation_failures_total",
+				"grafana_alerting_rule_evaluation_attempts_total",
+				"grafana_alerting_rule_evaluation_attempt_failures_total",
+				"grafana_alerting_rule_process_evaluation_duration_seconds",
+				"grafana_alerting_rule_send_alerts_duration_seconds")
 			require.NoError(t, err)
 		})
 