Mirror of https://github.com/grafana/grafana.git
Alerting: Extract large closures in ruleRoutine (#84035)
* extract notify
* extract resetState
* move evaluate metrics inside evaluate
* split out evaluate
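The change is mechanical: three large closures defined inside run, which implicitly captured grafanaCtx, logger, and key from the enclosing scope, become methods on alertRuleInfo that receive a context and the rule key explicitly. That shortens run, removes a layer of capture, and lets each helper be exercised on its own. A minimal sketch of the pattern, with hypothetical names rather than Grafana's actual types:

    package main

    import "fmt"

    type worker struct {
        appURL string
    }

    // Before: a closure inside the method captures its surroundings,
    // so it cannot be tested or reused on its own.
    func (w *worker) runBefore(items []string) {
        notify := func(item string) {
            fmt.Println("notify", item, "via", w.appURL)
        }
        for _, it := range items {
            notify(it)
        }
    }

    // After: the closure body is promoted to a method; everything it
    // captured is now either receiver state or an explicit parameter.
    func (w *worker) notify(item string) {
        fmt.Println("notify", item, "via", w.appURL)
    }

    func (w *worker) runAfter(items []string) {
        for _, it := range items {
            w.notify(it)
        }
    }

    func main() {
        w := &worker{appURL: "http://localhost:3000"}
        w.runAfter([]string{"ruleA", "ruleB"})
    }

The diff below follows exactly this shape: run keeps the control flow, while notify, resetState, and evaluate move out with ctx and key added to their signatures.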
@@ -180,12 +180,122 @@ func (a *alertRuleInfo) stop(reason error) {
 	a.stopFn(reason)
 }
 
 //nolint:gocyclo
 func (a *alertRuleInfo) run(key ngmodels.AlertRuleKey) error {
 	grafanaCtx := ngmodels.WithRuleKey(a.ctx, key)
 	logger := a.logger.FromContext(grafanaCtx)
 	logger.Debug("Alert rule routine started")
 
+	evalRunning := false
+	var currentFingerprint fingerprint
+	defer a.stopApplied(key)
+	for {
+		select {
+		// used by external services (API) to notify that rule is updated.
+		case ctx := <-a.updateCh:
+			if currentFingerprint == ctx.Fingerprint {
+				logger.Info("Rule's fingerprint has not changed. Skip resetting the state", "currentFingerprint", currentFingerprint)
+				continue
+			}
+
+			logger.Info("Clearing the state of the rule because it was updated", "isPaused", ctx.IsPaused, "fingerprint", ctx.Fingerprint)
+			// clear the state. So the next evaluation will start from the scratch.
+			a.resetState(grafanaCtx, key, ctx.IsPaused)
+			currentFingerprint = ctx.Fingerprint
+		// evalCh - used by the scheduler to signal that evaluation is needed.
+		case ctx, ok := <-a.evalCh:
+			if !ok {
+				logger.Debug("Evaluation channel has been closed. Exiting")
+				return nil
+			}
+			if evalRunning {
+				continue
+			}
+
+			func() {
+				evalRunning = true
+				defer func() {
+					evalRunning = false
+					a.evalApplied(key, ctx.scheduledAt)
+				}()
+
+				for attempt := int64(1); attempt <= a.maxAttempts; attempt++ {
+					isPaused := ctx.rule.IsPaused
+					f := ruleWithFolder{ctx.rule, ctx.folderTitle}.Fingerprint()
+					// Do not clean up state if the eval loop has just started.
+					var needReset bool
+					if currentFingerprint != 0 && currentFingerprint != f {
+						logger.Debug("Got a new version of alert rule. Clear up the state", "fingerprint", f)
+						needReset = true
+					}
+					// We need to reset state if the loop has started and the alert is already paused. It can happen,
+					// if we have an alert with state and we do file provision with stateful Grafana, that state
+					// lingers in DB and won't be cleaned up until next alert rule update.
+					needReset = needReset || (currentFingerprint == 0 && isPaused)
+					if needReset {
+						a.resetState(grafanaCtx, key, isPaused)
+					}
+					currentFingerprint = f
+					if isPaused {
+						logger.Debug("Skip rule evaluation because it is paused")
+						return
+					}
+
+					fpStr := currentFingerprint.String()
+					utcTick := ctx.scheduledAt.UTC().Format(time.RFC3339Nano)
+					tracingCtx, span := a.tracer.Start(grafanaCtx, "alert rule execution", trace.WithAttributes(
+						attribute.String("rule_uid", ctx.rule.UID),
+						attribute.Int64("org_id", ctx.rule.OrgID),
+						attribute.Int64("rule_version", ctx.rule.Version),
+						attribute.String("rule_fingerprint", fpStr),
+						attribute.String("tick", utcTick),
+					))
+
+					// Check before any execution if the context was cancelled so that we don't do any evaluations.
+					if tracingCtx.Err() != nil {
+						span.SetStatus(codes.Error, "rule evaluation cancelled")
+						span.End()
+						logger.Error("Skip evaluation and updating the state because the context has been cancelled", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt)
+						return
+					}
+
+					retry := attempt < a.maxAttempts
+					err := a.evaluate(tracingCtx, key, f, attempt, ctx, span, retry)
+					// This is extremely confusing - when we exhaust all retry attempts, or we have no retryable errors
+					// we return nil - so technically, this is meaningless to know whether the evaluation has errors or not.
+					span.End()
+					if err == nil {
+						return
+					}
+
+					logger.Error("Failed to evaluate rule", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt, "error", err)
+					select {
+					case <-tracingCtx.Done():
+						logger.Error("Context has been cancelled while backing off", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt)
+						return
+					case <-time.After(retryDelay):
+						continue
+					}
+				}
+			}()
+
+		case <-grafanaCtx.Done():
+			// clean up the state only if the reason for stopping the evaluation loop is that the rule was deleted
+			if errors.Is(grafanaCtx.Err(), errRuleDeleted) {
+				// We do not want a context to be unbounded which could potentially cause a go routine running
+				// indefinitely. 1 minute is an almost randomly chosen timeout, big enough to cover the majority of the
+				// cases.
+				ctx, cancelFunc := context.WithTimeout(context.Background(), time.Minute)
+				defer cancelFunc()
+				states := a.stateManager.DeleteStateByRuleUID(ngmodels.WithRuleKey(ctx, key), key, ngmodels.StateReasonRuleDeleted)
+				a.notify(grafanaCtx, key, states)
+			}
+			logger.Debug("Stopping alert rule routine")
+			return nil
+		}
+	}
+}
+
+func (a *alertRuleInfo) evaluate(ctx context.Context, key ngmodels.AlertRuleKey, f fingerprint, attempt int64, e *evaluation, span trace.Span, retry bool) error {
 	orgID := fmt.Sprint(key.OrgID)
 	evalTotal := a.metrics.EvalTotal.WithLabelValues(orgID)
 	evalDuration := a.metrics.EvalDuration.WithLabelValues(orgID)
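The first hunk is the new body of run. Two behaviors are worth noting: ticks that arrive on evalCh while an evaluation is in flight are dropped rather than queued (the evalRunning guard), and a failed evaluation is retried up to maxAttempts times with a fixed retryDelay between tries, giving up early if the context is cancelled. The retry shape in isolation, as a runnable sketch (maxAttempts and retryDelay here are hypothetical stand-ins for the scheduler's configuration):

    package main

    import (
        "context"
        "errors"
        "fmt"
        "time"
    )

    const (
        maxAttempts = 3
        retryDelay  = 100 * time.Millisecond
    )

    // evalWithRetry mirrors the loop above: try up to maxAttempts times,
    // back off between attempts, and give up early if the context is
    // cancelled while waiting.
    func evalWithRetry(ctx context.Context, eval func(ctx context.Context) error) error {
        var err error
        for attempt := 1; attempt <= maxAttempts; attempt++ {
            if ctx.Err() != nil {
                return ctx.Err() // cancelled before this attempt started
            }
            if err = eval(ctx); err == nil {
                return nil
            }
            select {
            case <-ctx.Done():
                return ctx.Err() // cancelled while backing off
            case <-time.After(retryDelay):
            }
        }
        return err
    }

    func main() {
        calls := 0
        err := evalWithRetry(context.Background(), func(context.Context) error {
            calls++
            if calls < 3 {
                return errors.New("transient failure")
            }
            return nil
        })
        fmt.Println("calls:", calls, "err:", err)
    }

As in the rule routine, waiting inside a select against ctx.Done() is what keeps a shutdown from being delayed by a backoff sleep.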
@@ -193,25 +303,7 @@ func (a *alertRuleInfo) run(key ngmodels.AlertRuleKey) error {
 	processDuration := a.metrics.ProcessDuration.WithLabelValues(orgID)
 	sendDuration := a.metrics.SendDuration.WithLabelValues(orgID)
 
-	notify := func(states []state.StateTransition) {
-		expiredAlerts := state.FromAlertsStateToStoppedAlert(states, a.appURL, a.clock)
-		if len(expiredAlerts.PostableAlerts) > 0 {
-			a.sender.Send(grafanaCtx, key, expiredAlerts)
-		}
-	}
-
-	resetState := func(ctx context.Context, isPaused bool) {
-		rule := a.ruleProvider.get(key)
-		reason := ngmodels.StateReasonUpdated
-		if isPaused {
-			reason = ngmodels.StateReasonPaused
-		}
-		states := a.stateManager.ResetStateByRuleUID(ctx, rule, reason)
-		notify(states)
-	}
-
-	evaluate := func(ctx context.Context, f fingerprint, attempt int64, e *evaluation, span trace.Span, retry bool) error {
-		logger := logger.New("version", e.rule.Version, "fingerprint", f, "attempt", attempt, "now", e.scheduledAt).FromContext(ctx)
+	logger := a.logger.FromContext(ctx).New("version", e.rule.Version, "fingerprint", f, "attempt", attempt, "now", e.scheduledAt).FromContext(ctx)
 	start := a.clock.Now()
 
 	evalCtx := eval.NewContextWithPreviousResults(ctx, SchedulerUserFor(e.rule.OrgID), a.newLoadedMetricsReader(e.rule))
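The second hunk deletes the closures from run; their bodies reappear as methods, and the per-org metric instruments (evalTotal, evalDuration, and the rest) are now bound at the top of evaluate instead of once in run. Binding a label value on each call is the standard prometheus/client_golang vector pattern and is cheap; a self-contained sketch with hypothetical metric names, not Grafana's actual registry:

    package main

    import (
        "fmt"
        "time"

        "github.com/prometheus/client_golang/prometheus"
    )

    func main() {
        // Vectors are declared once with a label schema...
        evalTotal := prometheus.NewCounterVec(
            prometheus.CounterOpts{Name: "eval_total", Help: "Total rule evaluations."},
            []string{"org"},
        )
        evalDuration := prometheus.NewHistogramVec(
            prometheus.HistogramOpts{Name: "eval_duration_seconds", Help: "Evaluation duration."},
            []string{"org"},
        )

        // ...and WithLabelValues binds them to one org on each call,
        // exactly as the extracted evaluate method does at its top.
        org := fmt.Sprint(1)
        start := time.Now()
        // ... do the work ...
        evalTotal.WithLabelValues(org).Inc()
        evalDuration.WithLabelValues(org).Observe(time.Since(start).Seconds())
    }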
@@ -299,116 +391,23 @@ func (a *alertRuleInfo) run(key ngmodels.AlertRuleKey) error {
 		sendDuration.Observe(a.clock.Now().Sub(start).Seconds())
 
 		return nil
-	}
-
-	evalRunning := false
-	var currentFingerprint fingerprint
-	defer a.stopApplied(key)
-	for {
-		select {
-		// used by external services (API) to notify that rule is updated.
-		case ctx := <-a.updateCh:
-			if currentFingerprint == ctx.Fingerprint {
-				logger.Info("Rule's fingerprint has not changed. Skip resetting the state", "currentFingerprint", currentFingerprint)
-				continue
-			}
-
-			logger.Info("Clearing the state of the rule because it was updated", "isPaused", ctx.IsPaused, "fingerprint", ctx.Fingerprint)
-			// clear the state. So the next evaluation will start from the scratch.
-			resetState(grafanaCtx, ctx.IsPaused)
-			currentFingerprint = ctx.Fingerprint
-		// evalCh - used by the scheduler to signal that evaluation is needed.
-		case ctx, ok := <-a.evalCh:
-			if !ok {
-				logger.Debug("Evaluation channel has been closed. Exiting")
-				return nil
-			}
-			if evalRunning {
-				continue
-			}
-
-			func() {
-				evalRunning = true
-				defer func() {
-					evalRunning = false
-					a.evalApplied(key, ctx.scheduledAt)
-				}()
-
-				for attempt := int64(1); attempt <= a.maxAttempts; attempt++ {
-					isPaused := ctx.rule.IsPaused
-					f := ruleWithFolder{ctx.rule, ctx.folderTitle}.Fingerprint()
-					// Do not clean up state if the eval loop has just started.
-					var needReset bool
-					if currentFingerprint != 0 && currentFingerprint != f {
-						logger.Debug("Got a new version of alert rule. Clear up the state", "fingerprint", f)
-						needReset = true
-					}
-					// We need to reset state if the loop has started and the alert is already paused. It can happen,
-					// if we have an alert with state and we do file provision with stateful Grafana, that state
-					// lingers in DB and won't be cleaned up until next alert rule update.
-					needReset = needReset || (currentFingerprint == 0 && isPaused)
-					if needReset {
-						resetState(grafanaCtx, isPaused)
-					}
-					currentFingerprint = f
-					if isPaused {
-						logger.Debug("Skip rule evaluation because it is paused")
-						return
-					}
-
-					fpStr := currentFingerprint.String()
-					utcTick := ctx.scheduledAt.UTC().Format(time.RFC3339Nano)
-					tracingCtx, span := a.tracer.Start(grafanaCtx, "alert rule execution", trace.WithAttributes(
-						attribute.String("rule_uid", ctx.rule.UID),
-						attribute.Int64("org_id", ctx.rule.OrgID),
-						attribute.Int64("rule_version", ctx.rule.Version),
-						attribute.String("rule_fingerprint", fpStr),
-						attribute.String("tick", utcTick),
-					))
-
-					// Check before any execution if the context was cancelled so that we don't do any evaluations.
-					if tracingCtx.Err() != nil {
-						span.SetStatus(codes.Error, "rule evaluation cancelled")
-						span.End()
-						logger.Error("Skip evaluation and updating the state because the context has been cancelled", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt)
-						return
-					}
-
-					retry := attempt < a.maxAttempts
-					err := evaluate(tracingCtx, f, attempt, ctx, span, retry)
-					// This is extremely confusing - when we exhaust all retry attempts, or we have no retryable errors
-					// we return nil - so technically, this is meaningless to know whether the evaluation has errors or not.
-					span.End()
-					if err == nil {
-						return
-					}
-
-					logger.Error("Failed to evaluate rule", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt, "error", err)
-					select {
-					case <-tracingCtx.Done():
-						logger.Error("Context has been cancelled while backing off", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt)
-						return
-					case <-time.After(retryDelay):
-						continue
-					}
-				}
-			}()
-
-		case <-grafanaCtx.Done():
-			// clean up the state only if the reason for stopping the evaluation loop is that the rule was deleted
-			if errors.Is(grafanaCtx.Err(), errRuleDeleted) {
-				// We do not want a context to be unbounded which could potentially cause a go routine running
-				// indefinitely. 1 minute is an almost randomly chosen timeout, big enough to cover the majority of the
-				// cases.
-				ctx, cancelFunc := context.WithTimeout(context.Background(), time.Minute)
-				defer cancelFunc()
-				states := a.stateManager.DeleteStateByRuleUID(ngmodels.WithRuleKey(ctx, key), key, ngmodels.StateReasonRuleDeleted)
-				notify(states)
-			}
-			logger.Debug("Stopping alert rule routine")
-			return nil
-		}
-	}
-}
+}
+
+func (a *alertRuleInfo) notify(ctx context.Context, key ngmodels.AlertRuleKey, states []state.StateTransition) {
+	expiredAlerts := state.FromAlertsStateToStoppedAlert(states, a.appURL, a.clock)
+	if len(expiredAlerts.PostableAlerts) > 0 {
+		a.sender.Send(ctx, key, expiredAlerts)
+	}
+}
+
+func (a *alertRuleInfo) resetState(ctx context.Context, key ngmodels.AlertRuleKey, isPaused bool) {
+	rule := a.ruleProvider.get(key)
+	reason := ngmodels.StateReasonUpdated
+	if isPaused {
+		reason = ngmodels.StateReasonPaused
+	}
+	states := a.stateManager.ResetStateByRuleUID(ctx, rule, reason)
+	a.notify(ctx, key, states)
+}
 
 	// evalApplied is only used on tests.
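The last hunk removes the old loop body and adds the extracted notify and resetState methods. One detail carried over from the original code: when the routine stops because the rule was deleted, the cleanup cannot reuse grafanaCtx, which is already cancelled, so a fresh context bounded to one minute is derived from context.Background(). The pattern generalizes; a minimal sketch (deleteStates is a hypothetical callback):

    package main

    import (
        "context"
        "fmt"
        "time"
    )

    // cleanup runs teardown work with its own deadline. The parent context
    // that drove the main loop is already cancelled by the time we get
    // here, so deriving from it would make every call fail immediately;
    // instead we start from context.Background() and bound it explicitly.
    func cleanup(deleteStates func(ctx context.Context) error) error {
        ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
        defer cancel()
        return deleteStates(ctx)
    }

    func main() {
        err := cleanup(func(ctx context.Context) error {
            deadline, _ := ctx.Deadline()
            fmt.Println("cleanup must finish by", deadline)
            return nil
        })
        fmt.Println("err:", err)
    }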