From 07915703feee4066b5b925d7b1038900e3f482a8 Mon Sep 17 00:00:00 2001
From: gotjosh
Date: Wed, 6 Dec 2023 19:12:01 +0000
Subject: [PATCH] Revert "Alerting: Attempt to retry retryable errors" (#79158)

Revert "Alerting: Attempt to retry retryable errors (#79037)"

This reverts commit 3e51cf09491e19442cdceb8e84c5fb3b9ef17e2c.
---
 pkg/services/ngalert/eval/eval.go            | 29 --------
 pkg/services/ngalert/eval/eval_test.go       | 65 -----------------
 pkg/services/ngalert/schedule/schedule.go    | 70 +++++++------------
 .../ngalert/schedule/schedule_unit_test.go   | 35 +++++-----
 4 files changed, 43 insertions(+), 156 deletions(-)

diff --git a/pkg/services/ngalert/eval/eval.go b/pkg/services/ngalert/eval/eval.go
index 6766c2e4c58..d079022b6ac 100644
--- a/pkg/services/ngalert/eval/eval.go
+++ b/pkg/services/ngalert/eval/eval.go
@@ -157,23 +157,6 @@ func (evalResults Results) HasErrors() bool {
 	return false
 }
 
-// HasNonRetryableErrors returns true if we have at least 1 result with:
-// 1. A `State` of `Error`
-// 2. The `Error` attribute is not nil
-// 3. The `Error` type is of `&invalidEvalResultFormatError`
-// Our thinking with this approach, is that we don't want to retry errors that have relation with invalid alert definition format.
-func (evalResults Results) HasNonRetryableErrors() bool {
-	for _, r := range evalResults {
-		if r.State == Error && r.Error != nil {
-			var nonRetryableError *invalidEvalResultFormatError
-			if errors.As(r.Error, &nonRetryableError) {
-				return true
-			}
-		}
-	}
-	return false
-}
-
 // HasErrors returns true when Results contains at least one element and all elements are errors
 func (evalResults Results) IsError() bool {
 	for _, r := range evalResults {
@@ -194,18 +177,6 @@ func (evalResults Results) IsNoData() bool {
 	return true
 }
 
-// Error returns the aggregated `error` of all results of which state is `Error`.
-func (evalResults Results) Error() error {
-	var errs []error
-	for _, result := range evalResults {
-		if result.State == Error && result.Error != nil {
-			errs = append(errs, result.Error)
-		}
-	}
-
-	return errors.Join(errs...)
-}
-
 // Result contains the evaluated State of an alert instance
 // identified by its labels.
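For readers following the revert: the two helpers removed from eval.go above implemented a standard Go pattern, classifying errors with errors.As and aggregating them with errors.Join. The following is a minimal, self-contained sketch of that pattern under simplified stand-in types (result, formatError); it is illustrative only and is not the Grafana eval package.

// Sketch of the classify-then-aggregate pattern behind the reverted
// HasNonRetryableErrors and Error helpers. The types below are hypothetical
// stand-ins, not the Grafana eval types.
package main

import (
	"errors"
	"fmt"
)

// formatError stands in for invalidEvalResultFormatError: a permanent,
// definition-level problem that retrying will not fix.
type formatError struct {
	refID  string
	reason string
}

func (e *formatError) Error() string {
	return fmt.Sprintf("invalid format of evaluation results for %s: %s", e.refID, e.reason)
}

type result struct{ err error }

type results []result

// hasNonRetryableErrors reports whether any result failed with a format error.
func (rs results) hasNonRetryableErrors() bool {
	for _, r := range rs {
		var fe *formatError
		if r.err != nil && errors.As(r.err, &fe) {
			return true
		}
	}
	return false
}

// aggregate joins every per-result error into a single error, the way the
// reverted Results.Error method used errors.Join.
func (rs results) aggregate() error {
	var errs []error
	for _, r := range rs {
		if r.err != nil {
			errs = append(errs, r.err)
		}
	}
	return errors.Join(errs...)
}

func main() {
	rs := results{
		{err: &formatError{refID: "A", reason: "unable to get frame row length"}},
		{err: errors.New("transient network error")},
	}
	fmt.Println(rs.hasNonRetryableErrors()) // true: retrying cannot help
	fmt.Println(rs.aggregate())             // both errors, newline-separated
}

The design point, per the removed comment, is that a format error signals a broken alert definition, so retrying cannot help, while any other error is treated as potentially transient.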
type Result struct { diff --git a/pkg/services/ngalert/eval/eval_test.go b/pkg/services/ngalert/eval/eval_test.go index cb456be06cd..7820036edf6 100644 --- a/pkg/services/ngalert/eval/eval_test.go +++ b/pkg/services/ngalert/eval/eval_test.go @@ -2,7 +2,6 @@ package eval import ( "context" - "errors" "fmt" "math/rand" "testing" @@ -770,70 +769,6 @@ func TestEvaluateRaw(t *testing.T) { }) } -func TestResults_HasNonRetryableErrors(t *testing.T) { - tc := []struct { - name string - eval Results - expected bool - }{ - { - name: "with non-retryable errors", - eval: Results{ - { - State: Error, - Error: &invalidEvalResultFormatError{refID: "A", reason: "unable to get frame row length", err: errors.New("weird error")}, - }, - }, - expected: true, - }, - { - name: "with retryable errors", - eval: Results{ - { - State: Error, - Error: errors.New("some weird error"), - }, - }, - expected: false, - }, - } - - for _, tt := range tc { - t.Run(tt.name, func(t *testing.T) { - require.Equal(t, tt.expected, tt.eval.HasNonRetryableErrors()) - }) - } -} - -func TestResults_Error(t *testing.T) { - tc := []struct { - name string - eval Results - expected string - }{ - { - name: "with non-retryable errors", - eval: Results{ - { - State: Error, - Error: &invalidEvalResultFormatError{refID: "A", reason: "unable to get frame row length", err: errors.New("weird error")}, - }, - { - State: Error, - Error: errors.New("unable to get a data frame"), - }, - }, - expected: "invalid format of evaluation results for the alert definition A: unable to get frame row length: weird error\nunable to get a data frame", - }, - } - - for _, tt := range tc { - t.Run(tt.name, func(t *testing.T) { - require.Equal(t, tt.expected, tt.eval.Error().Error()) - }) - } -} - type fakeExpressionService struct { hook func(ctx context.Context, now time.Time, pipeline expr.DataPipeline) (*backend.QueryDataResponse, error) } diff --git a/pkg/services/ngalert/schedule/schedule.go b/pkg/services/ngalert/schedule/schedule.go index c7d9955272b..13ac4d04819 100644 --- a/pkg/services/ngalert/schedule/schedule.go +++ b/pkg/services/ngalert/schedule/schedule.go @@ -34,9 +34,6 @@ type ScheduleService interface { Run(context.Context) error } -// retryDelay represents how long to wait between each failed rule evaluation. -const retryDelay = 1 * time.Second - // AlertsSender is an interface for a service that is responsible for sending notifications to the end-user. // //go:generate mockery --name AlertsSender --structname AlertsSenderMock --inpackage --filename alerts_sender_mock.go --with-expecter @@ -377,7 +374,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR notify(states) } - evaluate := func(ctx context.Context, f fingerprint, attempt int64, e *evaluation, span trace.Span, retry bool) error { + evaluate := func(ctx context.Context, f fingerprint, attempt int64, e *evaluation, span trace.Span) { logger := logger.New("version", e.rule.Version, "fingerprint", f, "attempt", attempt, "now", e.scheduledAt).FromContext(ctx) start := sch.clock.Now() @@ -401,35 +398,16 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR if err != nil || results.HasErrors() { evalTotalFailures.Inc() - - // Only retry (return errors) if this isn't the last attempt, otherwise skip these return operations. - if retry { - // The only thing that can return non-nil `err` from ruleEval.Evaluate is the server side expression pipeline. - // This includes transport errors such as transient network errors. 
- if err != nil { - span.SetStatus(codes.Error, "rule evaluation failed") - span.RecordError(err) - return fmt.Errorf("server side expressions pipeline returned an error: %w", err) - } - - // If the pipeline executed successfully but have other types of errors that can be retryable, we should do so. - if !results.HasNonRetryableErrors() { - span.SetStatus(codes.Error, "rule evaluation failed") - span.RecordError(err) - return fmt.Errorf("the result-set has errors that can be retried: %w", results.Error()) - } - } - - // If results is nil, we assume that the error must be from the SSE pipeline (ruleEval.Evaluate) which is the only code that can actually return an `err`. if results == nil { results = append(results, eval.NewResultFromError(err, e.scheduledAt, dur)) } - - // If err is nil, we assume that the SSS pipeline succeeded and that the error must be embedded in the results. if err == nil { - err = results.Error() + for _, result := range results { + if result.Error != nil { + err = errors.Join(err, result.Error) + } + } } - span.SetStatus(codes.Error, "rule evaluation failed") span.RecordError(err) } else { @@ -440,7 +418,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR } if ctx.Err() != nil { // check if the context is not cancelled. The evaluation can be a long-running task. logger.Debug("Skip updating the state because the context has been cancelled") - return nil + return } start = sch.clock.Now() processedStates := sch.stateManager.ProcessEvalResults( @@ -462,8 +440,18 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR sch.alertsSender.Send(ctx, key, alerts) } sendDuration.Observe(sch.clock.Now().Sub(start).Seconds()) + } - return nil + retryIfError := func(f func(attempt int64) error) error { + var attempt int64 + var err error + for attempt = 0; attempt < sch.maxAttempts; attempt++ { + err = f(attempt) + if err == nil { + return nil + } + } + return err } evalRunning := false @@ -499,7 +487,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR sch.evalApplied(key, ctx.scheduledAt) }() - for attempt := int64(1); attempt <= sch.maxAttempts; attempt++ { + err := retryIfError(func(attempt int64) error { isPaused := ctx.rule.IsPaused f := ruleWithFolder{ctx.rule, ctx.folderTitle}.Fingerprint() // Do not clean up state if the eval loop has just started. @@ -518,7 +506,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR currentFingerprint = f if isPaused { logger.Debug("Skip rule evaluation because it is paused") - return + return nil } fpStr := currentFingerprint.String() @@ -530,21 +518,15 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR attribute.String("rule_fingerprint", fpStr), attribute.String("tick", utcTick), )) + defer span.End() - retry := attempt < sch.maxAttempts - err := evaluate(tracingCtx, f, attempt, ctx, span, retry) - // This is extremely confusing - when we exhaust all retry attempts, or we have no retryable errors - // we return nil - so technically, this is meaningless to know whether the evaluation has errors or not. 
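The hunks above drop the explicit attempt loop, which slept a fixed retryDelay of one second between failed evaluations and returned an error when an attempt failed in a retryable way, and restore the retryIfError helper, which simply re-invokes its callback up to sch.maxAttempts times with no delay. Below is a rough sketch of both shapes; the function name and the delay parameter are illustrative, not the scheduler's actual API.

// Sketch of a bounded retry helper in the shape of the restored retryIfError,
// plus an optional fixed delay like the reverted loop's retryDelay. Names and
// the delay parameter are assumptions for illustration only.
package main

import (
	"errors"
	"fmt"
	"time"
)

// retryIfError calls f up to maxAttempts times and returns the last error.
// A zero delay mirrors the restored helper, which retries immediately; the
// reverted loop instead slept a fixed retryDelay (1s) between attempts.
func retryIfError(maxAttempts int64, delay time.Duration, f func(attempt int64) error) error {
	var err error
	for attempt := int64(0); attempt < maxAttempts; attempt++ {
		if err = f(attempt); err == nil {
			return nil
		}
		if delay > 0 && attempt < maxAttempts-1 {
			time.Sleep(delay)
		}
	}
	return err
}

func main() {
	err := retryIfError(3, 0, func(attempt int64) error {
		fmt.Println("attempt", attempt)
		return errors.New("still failing")
	})
	fmt.Println(err) // "still failing" after three attempts
}

Note that after the revert the closure wrapped around evaluate always returns nil, so in practice only a single attempt runs per tick.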
- span.End() - if err == nil { - return - } - - logger.Error("Failed to evaluate rule", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt) - time.Sleep(retryDelay) + evaluate(tracingCtx, f, attempt, ctx, span) + return nil + }) + if err != nil { + logger.Error("Evaluation failed after all retries", "error", err) } }() - case <-grafanaCtx.Done(): // clean up the state only if the reason for stopping the evaluation loop is that the rule was deleted if errors.Is(grafanaCtx.Err(), errRuleDeleted) { diff --git a/pkg/services/ngalert/schedule/schedule_unit_test.go b/pkg/services/ngalert/schedule/schedule_unit_test.go index f0613be1b18..5e362580d1e 100644 --- a/pkg/services/ngalert/schedule/schedule_unit_test.go +++ b/pkg/services/ngalert/schedule/schedule_unit_test.go @@ -662,7 +662,6 @@ func TestSchedule_ruleRoutine(t *testing.T) { sender.EXPECT().Send(mock.Anything, rule.GetKey(), mock.Anything).Return() sch, ruleStore, _, reg := createSchedule(evalAppliedChan, &sender) - sch.maxAttempts = 3 ruleStore.PutRule(context.Background(), rule) go func() { @@ -683,28 +682,28 @@ func TestSchedule_ruleRoutine(t *testing.T) { expectedMetric := fmt.Sprintf( `# HELP grafana_alerting_rule_evaluation_duration_seconds The time to evaluate a rule. # TYPE grafana_alerting_rule_evaluation_duration_seconds histogram - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="15"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="30"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="60"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="120"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="180"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="240"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="300"} 3 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 3 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="15"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="30"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="60"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="120"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="180"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="240"} 1 + 
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="300"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1 grafana_alerting_rule_evaluation_duration_seconds_sum{org="%[1]d"} 0 - grafana_alerting_rule_evaluation_duration_seconds_count{org="%[1]d"} 3 + grafana_alerting_rule_evaluation_duration_seconds_count{org="%[1]d"} 1 # HELP grafana_alerting_rule_evaluation_failures_total The total number of rule evaluation failures. # TYPE grafana_alerting_rule_evaluation_failures_total counter - grafana_alerting_rule_evaluation_failures_total{org="%[1]d"} 3 + grafana_alerting_rule_evaluation_failures_total{org="%[1]d"} 1 # HELP grafana_alerting_rule_evaluations_total The total number of rule evaluations. # TYPE grafana_alerting_rule_evaluations_total counter - grafana_alerting_rule_evaluations_total{org="%[1]d"} 3 + grafana_alerting_rule_evaluations_total{org="%[1]d"} 1 # HELP grafana_alerting_rule_process_evaluation_duration_seconds The time to process the evaluation results for a rule. # TYPE grafana_alerting_rule_process_evaluation_duration_seconds histogram grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
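The changes to schedule_unit_test.go follow from the scheduler change: with the retry behaviour in place, the test set sch.maxAttempts = 3 and a failing rule incremented the evaluation metrics once per attempt, hence the expected value of 3; after the revert the closure handed to retryIfError always returns nil, so a failing rule is evaluated exactly once per tick and the expectations return to 1. As a hedged sketch of the assertion style seen in the expected metric text above, the example below registers a counter, increments it once, and compares it against exposition-format text with testutil.GatherAndCompare; the shortened metric name is illustrative, not the Grafana metric.

// Minimal sketch of asserting a Prometheus counter against expected
// exposition-format text. The metric name is a stand-in for illustration.
package main

import (
	"fmt"
	"strings"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	reg := prometheus.NewPedanticRegistry()
	evals := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "rule_evaluations_total",
		Help: "The total number of rule evaluations.",
	}, []string{"org"})
	reg.MustRegister(evals)

	// After the revert, a failing rule is evaluated once per tick rather than
	// once per retry attempt, so the expected count is 1 rather than 3.
	evals.WithLabelValues("1").Inc()

	expected := "# HELP rule_evaluations_total The total number of rule evaluations.\n" +
		"# TYPE rule_evaluations_total counter\n" +
		"rule_evaluations_total{org=\"1\"} 1\n"

	err := testutil.GatherAndCompare(reg, strings.NewReader(expected), "rule_evaluations_total")
	fmt.Println(err) // <nil> when the gathered value matches the expectation
}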