From ad7f8042553ac1a260f49907e4a42ade8c5eb741 Mon Sep 17 00:00:00 2001
From: Steve Simpson
Date: Fri, 12 Apr 2024 16:20:46 +0200
Subject: [PATCH] Alerting: Fix evaluation metrics to not count retries (#85873)

* Change evaluation metrics to count only once per evaluation, and add new per-attempt metrics.

* Cosmetic: Move eval total Inc() to original place.
---
 pkg/services/ngalert/metrics/scheduler.go   | 20 ++++++
 pkg/services/ngalert/schedule/alert_rule.go | 23 +++++--
 .../ngalert/schedule/alert_rule_test.go     | 67 +++++++++++++------
 3 files changed, 85 insertions(+), 25 deletions(-)

diff --git a/pkg/services/ngalert/metrics/scheduler.go b/pkg/services/ngalert/metrics/scheduler.go
index 35b56b95732..70056ea86c8 100644
--- a/pkg/services/ngalert/metrics/scheduler.go
+++ b/pkg/services/ngalert/metrics/scheduler.go
@@ -18,6 +18,8 @@ type Scheduler struct {
 	EvalTotal *prometheus.CounterVec
 	EvalFailures *prometheus.CounterVec
 	EvalDuration *prometheus.HistogramVec
+	EvalAttemptTotal *prometheus.CounterVec
+	EvalAttemptFailures *prometheus.CounterVec
 	ProcessDuration *prometheus.HistogramVec
 	SendDuration *prometheus.HistogramVec
 	SimpleNotificationRules *prometheus.GaugeVec
@@ -72,6 +74,24 @@ func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler {
 			},
 			[]string{"org"},
 		),
+		EvalAttemptTotal: promauto.With(r).NewCounterVec(
+			prometheus.CounterOpts{
+				Namespace: Namespace,
+				Subsystem: Subsystem,
+				Name: "rule_evaluation_attempts_total",
+				Help: "The total number of rule evaluation attempts.",
+			},
+			[]string{"org"},
+		),
+		EvalAttemptFailures: promauto.With(r).NewCounterVec(
+			prometheus.CounterOpts{
+				Namespace: Namespace,
+				Subsystem: Subsystem,
+				Name: "rule_evaluation_attempt_failures_total",
+				Help: "The total number of rule evaluation attempt failures.",
+			},
+			[]string{"org"},
+		),
 		ProcessDuration: promauto.With(r).NewHistogramVec(
 			prometheus.HistogramOpts{
 				Namespace: Namespace,
diff --git a/pkg/services/ngalert/schedule/alert_rule.go b/pkg/services/ngalert/schedule/alert_rule.go
index 3b9e4794846..6ba1d57d9ef 100644
--- a/pkg/services/ngalert/schedule/alert_rule.go
+++ b/pkg/services/ngalert/schedule/alert_rule.go
@@ -227,10 +227,16 @@ func (a *alertRule) Run(key ngmodels.AlertRuleKey) error {
 			}
 
 			func() {
+				orgID := fmt.Sprint(key.OrgID)
+				evalDuration := a.metrics.EvalDuration.WithLabelValues(orgID)
+				evalTotal := a.metrics.EvalTotal.WithLabelValues(orgID)
+
 				evalRunning = true
+				evalStart := a.clock.Now()
 				defer func() {
 					evalRunning = false
 					a.evalApplied(key, ctx.scheduledAt)
+					evalDuration.Observe(a.clock.Now().Sub(evalStart).Seconds())
 				}()
 
 				for attempt := int64(1); attempt <= a.maxAttempts; attempt++ {
@@ -255,6 +261,11 @@ func (a *alertRule) Run(key ngmodels.AlertRuleKey) error {
 						return
 					}
 
+					// Only increment the evaluation counter once per evaluation, not once per retry.
+ if attempt == 1 { + evalTotal.Inc() + } + fpStr := currentFingerprint.String() utcTick := ctx.scheduledAt.UTC().Format(time.RFC3339Nano) tracingCtx, span := a.tracer.Start(grafanaCtx, "alert rule execution", trace.WithAttributes( @@ -312,8 +323,8 @@ func (a *alertRule) Run(key ngmodels.AlertRuleKey) error { func (a *alertRule) evaluate(ctx context.Context, key ngmodels.AlertRuleKey, f fingerprint, attempt int64, e *Evaluation, span trace.Span, retry bool) error { orgID := fmt.Sprint(key.OrgID) - evalTotal := a.metrics.EvalTotal.WithLabelValues(orgID) - evalDuration := a.metrics.EvalDuration.WithLabelValues(orgID) + evalAttemptTotal := a.metrics.EvalAttemptTotal.WithLabelValues(orgID) + evalAttemptFailures := a.metrics.EvalAttemptFailures.WithLabelValues(orgID) evalTotalFailures := a.metrics.EvalFailures.WithLabelValues(orgID) processDuration := a.metrics.ProcessDuration.WithLabelValues(orgID) sendDuration := a.metrics.SendDuration.WithLabelValues(orgID) @@ -336,8 +347,7 @@ func (a *alertRule) evaluate(ctx context.Context, key ngmodels.AlertRuleKey, f f } } - evalTotal.Inc() - evalDuration.Observe(dur.Seconds()) + evalAttemptTotal.Inc() if ctx.Err() != nil { // check if the context is not cancelled. The evaluation can be a long-running task. span.SetStatus(codes.Error, "rule evaluation cancelled") @@ -346,7 +356,7 @@ func (a *alertRule) evaluate(ctx context.Context, key ngmodels.AlertRuleKey, f f } if err != nil || results.HasErrors() { - evalTotalFailures.Inc() + evalAttemptFailures.Inc() // Only retry (return errors) if this isn't the last attempt, otherwise skip these return operations. if retry { @@ -364,6 +374,9 @@ func (a *alertRule) evaluate(ctx context.Context, key ngmodels.AlertRuleKey, f f span.RecordError(err) return fmt.Errorf("the result-set has errors that can be retried: %w", results.Error()) } + } else { + // Only count the final attempt as a failure. + evalTotalFailures.Inc() } // If results is nil, we assume that the error must be from the SSE pipeline (ruleEval.Evaluate) which is the only code that can actually return an `err`. diff --git a/pkg/services/ngalert/schedule/alert_rule_test.go b/pkg/services/ngalert/schedule/alert_rule_test.go index 3fbff033dea..f910376cfba 100644 --- a/pkg/services/ngalert/schedule/alert_rule_test.go +++ b/pkg/services/ngalert/schedule/alert_rule_test.go @@ -369,6 +369,13 @@ func TestRuleRoutine(t *testing.T) { # HELP grafana_alerting_rule_evaluations_total The total number of rule evaluations. # TYPE grafana_alerting_rule_evaluations_total counter grafana_alerting_rule_evaluations_total{org="%[1]d"} 1 + # HELP grafana_alerting_rule_evaluation_attempt_failures_total The total number of rule evaluation attempt failures. + # TYPE grafana_alerting_rule_evaluation_attempt_failures_total counter + grafana_alerting_rule_evaluation_attempt_failures_total{org="%[1]d"} 0 + # HELP grafana_alerting_rule_evaluation_attempts_total The total number of rule evaluation attempts. + # TYPE grafana_alerting_rule_evaluation_attempts_total counter + grafana_alerting_rule_evaluation_attempts_total{org="%[1]d"} 1 + # HELP grafana_alerting_rule_process_evaluation_duration_seconds The time to process the evaluation results for a rule. 
# TYPE grafana_alerting_rule_process_evaluation_duration_seconds histogram grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1 @@ -407,7 +414,14 @@ func TestRuleRoutine(t *testing.T) { grafana_alerting_rule_send_alerts_duration_seconds_count{org="%[1]d"} 1 `, rule.OrgID) - err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_evaluation_duration_seconds", "grafana_alerting_rule_evaluations_total", "grafana_alerting_rule_evaluation_failures_total", "grafana_alerting_rule_process_evaluation_duration_seconds", "grafana_alerting_rule_send_alerts_duration_seconds") + err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), + "grafana_alerting_rule_evaluation_duration_seconds", + "grafana_alerting_rule_evaluations_total", + "grafana_alerting_rule_evaluation_failures_total", + "grafana_alerting_rule_evaluation_attempts_total", + "grafana_alerting_rule_evaluation_attempt_failures_total", + "grafana_alerting_rule_process_evaluation_duration_seconds", + "grafana_alerting_rule_send_alerts_duration_seconds") require.NoError(t, err) }) }) @@ -570,33 +584,39 @@ func TestRuleRoutine(t *testing.T) { waitForTimeChannel(t, evalAppliedChan) - t.Run("it should increase failure counter", func(t *testing.T) { + t.Run("it should increase failure counter by 1 and attempt failure counter by 3", func(t *testing.T) { // duration metric has 0 values because of mocked clock that do not advance expectedMetric := fmt.Sprintf( `# HELP grafana_alerting_rule_evaluation_duration_seconds The time to evaluate a rule. # TYPE grafana_alerting_rule_evaluation_duration_seconds histogram - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="15"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="30"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="60"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="120"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="180"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="240"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="300"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 3 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="15"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="30"} 1 + 
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="60"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="120"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="180"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="240"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="300"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1 grafana_alerting_rule_evaluation_duration_seconds_sum{org="%[1]d"} 0 - grafana_alerting_rule_evaluation_duration_seconds_count{org="%[1]d"} 3 + grafana_alerting_rule_evaluation_duration_seconds_count{org="%[1]d"} 1 # HELP grafana_alerting_rule_evaluation_failures_total The total number of rule evaluation failures. # TYPE grafana_alerting_rule_evaluation_failures_total counter - grafana_alerting_rule_evaluation_failures_total{org="%[1]d"} 3 + grafana_alerting_rule_evaluation_failures_total{org="%[1]d"} 1 # HELP grafana_alerting_rule_evaluations_total The total number of rule evaluations. # TYPE grafana_alerting_rule_evaluations_total counter - grafana_alerting_rule_evaluations_total{org="%[1]d"} 3 + grafana_alerting_rule_evaluations_total{org="%[1]d"} 1 + # HELP grafana_alerting_rule_evaluation_attempt_failures_total The total number of rule evaluation attempt failures. + # TYPE grafana_alerting_rule_evaluation_attempt_failures_total counter + grafana_alerting_rule_evaluation_attempt_failures_total{org="%[1]d"} 3 + # HELP grafana_alerting_rule_evaluation_attempts_total The total number of rule evaluation attempts. + # TYPE grafana_alerting_rule_evaluation_attempts_total counter + grafana_alerting_rule_evaluation_attempts_total{org="%[1]d"} 3 # HELP grafana_alerting_rule_process_evaluation_duration_seconds The time to process the evaluation results for a rule. # TYPE grafana_alerting_rule_process_evaluation_duration_seconds histogram grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1 @@ -635,7 +655,14 @@ func TestRuleRoutine(t *testing.T) { grafana_alerting_rule_send_alerts_duration_seconds_count{org="%[1]d"} 1 `, rule.OrgID) - err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_evaluation_duration_seconds", "grafana_alerting_rule_evaluations_total", "grafana_alerting_rule_evaluation_failures_total", "grafana_alerting_rule_process_evaluation_duration_seconds", "grafana_alerting_rule_send_alerts_duration_seconds") + err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), + "grafana_alerting_rule_evaluation_duration_seconds", + "grafana_alerting_rule_evaluations_total", + "grafana_alerting_rule_evaluation_failures_total", + "grafana_alerting_rule_evaluation_attempts_total", + "grafana_alerting_rule_evaluation_attempt_failures_total", + "grafana_alerting_rule_process_evaluation_duration_seconds", + "grafana_alerting_rule_send_alerts_duration_seconds") require.NoError(t, err) })
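Reviewer note (not part of the patch): the counting behaviour this change is after can be summarised in a small standalone Go sketch. The unlabelled counters, the evaluate stub, and the simplified retry loop below are illustrative assumptions, not Grafana code; in the patch itself evalTotal is incremented on the first attempt inside the retry loop, and evalTotalFailures only when the final attempt fails.

// retry_metrics_sketch.go - illustrative only, assumed stand-in names.
package main

import (
	"errors"
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	// Stand-in counters for the four metrics touched by the patch (no org label here).
	evalTotal := prometheus.NewCounter(prometheus.CounterOpts{Name: "rule_evaluations_total"})
	evalFailures := prometheus.NewCounter(prometheus.CounterOpts{Name: "rule_evaluation_failures_total"})
	attemptTotal := prometheus.NewCounter(prometheus.CounterOpts{Name: "rule_evaluation_attempts_total"})
	attemptFailures := prometheus.NewCounter(prometheus.CounterOpts{Name: "rule_evaluation_attempt_failures_total"})

	const maxAttempts = 3
	evaluate := func() error { return errors.New("datasource unavailable") } // every attempt fails in this example

	// One evaluation counts exactly once, no matter how many attempts it takes.
	evalTotal.Inc()
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		// Every attempt, including retries, is counted in the attempt metrics.
		attemptTotal.Inc()
		if err := evaluate(); err != nil {
			attemptFailures.Inc()
			if attempt == maxAttempts {
				// Only a failure of the final attempt marks the whole evaluation as failed.
				evalFailures.Inc()
			}
			continue
		}
		break
	}

	fmt.Println("evaluations:      ", testutil.ToFloat64(evalTotal))       // 1
	fmt.Println("failures:         ", testutil.ToFloat64(evalFailures))    // 1
	fmt.Println("attempts:         ", testutil.ToFloat64(attemptTotal))    // 3
	fmt.Println("attempt failures: ", testutil.ToFloat64(attemptFailures)) // 3
}

With these semantics, retries stay observable: the gap between rule_evaluation_attempts_total and rule_evaluations_total is the number of retried attempts, while rule_evaluations_total and rule_evaluation_failures_total count each scheduled evaluation once.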