Alerting: Scheduler uses rule fingerprint instead of version (#66531)

* implement calculation of fingerprint for ruleWithFolder
* update scheduler to use fingerprint instead of rule's version
2025-02-25 18:55:37 -06:00 · 2023-04-28 10:42:16 -04:00
parent ffd83a027d
commit 9eb10bee1f
5 changed files with 367 additions and 83 deletions
--- a/pkg/services/ngalert/schedule/schedule.go
+++ b/pkg/services/ngalert/schedule/schedule.go
@@ -271,16 +271,18 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.

 		itemFrequency := item.IntervalSeconds / int64(sch.baseInterval.Seconds())
 		isReadyToRun := item.IntervalSeconds != 0 && tickNum%itemFrequency == 0
-		if isReadyToRun {
-			var folderTitle string
-			if !sch.disableGrafanaFolder {
-				title, ok := folderTitles[item.NamespaceUID]
-				if ok {
-					folderTitle = title
-				} else {
-					missingFolder[item.NamespaceUID] = append(missingFolder[item.NamespaceUID], item.UID)
-				}
+
+		var folderTitle string
+		if !sch.disableGrafanaFolder {
+			title, ok := folderTitles[item.NamespaceUID]
+			if ok {
+				folderTitle = title
+			} else {
+				missingFolder[item.NamespaceUID] = append(missingFolder[item.NamespaceUID], item.UID)
 			}
+		}
+
+		if isReadyToRun {
 			readyToRun = append(readyToRun, readyToRunItem{ruleInfo: ruleInfo, evaluation: evaluation{
 				scheduledAt: tick,
 				rule:        item,
@@ -292,8 +294,8 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
 			sch.log.Debug("Rule has been updated. Notifying evaluation routine", key.LogContext()...)
 			go func(ri *alertRuleInfo, rule *ngmodels.AlertRule) {
 				ri.update(ruleVersionAndPauseStatus{
-					Version:  ruleVersion(rule.Version),
-					IsPaused: rule.IsPaused,
+					Fingerprint: ruleWithFolder{rule: rule, folderTitle: folderTitle}.Fingerprint(),
+					IsPaused:    rule.IsPaused,
 				})
 			}(ruleInfo, item)
 			updatedRules = append(updatedRules, ngmodels.AlertRuleKeyWithVersion{
@@ -369,8 +371,8 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 		notify(states)
 	}

-	evaluate := func(ctx context.Context, attempt int64, e *evaluation, span tracing.Span) {
-		logger := logger.New("version", e.rule.Version, "attempt", attempt, "now", e.scheduledAt)
+	evaluate := func(ctx context.Context, f fingerprint, attempt int64, e *evaluation, span tracing.Span) {
+		logger := logger.New("version", e.rule.Version, "fingerprint", f, "attempt", attempt, "now", e.scheduledAt)
 		start := sch.clock.Now()

 		evalCtx := eval.NewContext(ctx, SchedulerUserFor(e.rule.OrgID))
@@ -450,24 +452,21 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 	}

 	evalRunning := false
-	var currentRuleVersion int64 = 0
+	var currentFingerprint fingerprint
 	defer sch.stopApplied(key)
 	for {
 		select {
 		// used by external services (API) to notify that rule is updated.
 		case ctx := <-updateCh:
-			// sometimes it can happen when, for example, the rule evaluation took so long,
-			// and there were two concurrent messages in updateCh and evalCh, and the eval's one got processed first.
-			// therefore, at the time when message from updateCh is processed the current rule will have
-			// at least the same version (or greater) and the state created for the new version of the rule.
-			if currentRuleVersion >= int64(ctx.Version) {
-				logger.Info("Skip updating rule because its current version is actual", "version", currentRuleVersion, "newVersion", ctx.Version)
+			if currentFingerprint == ctx.Fingerprint {
+				logger.Info("Rule's fingerprint has not changed. Skip resetting the state", "currentFingerprint", currentFingerprint)
 				continue
 			}

-			logger.Info("Clearing the state of the rule because it was updated", "version", currentRuleVersion, "newVersion", ctx.Version, "isPaused", ctx.IsPaused)
+			logger.Info("Clearing the state of the rule because it was updated", "isPaused", ctx.IsPaused, "fingerprint", ctx.Fingerprint)
 			// clear the state. So the next evaluation will start from the scratch.
 			resetState(grafanaCtx, ctx.IsPaused)
+			currentFingerprint = ctx.Fingerprint
 		// evalCh - used by the scheduler to signal that evaluation is needed.
 		case ctx, ok := <-evalCh:
 			if !ok {
@@ -486,21 +485,24 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 				}()

 				err := retryIfError(func(attempt int64) error {
-					newVersion := ctx.rule.Version
 					isPaused := ctx.rule.IsPaused
-					// fetch latest alert rule version
-					if currentRuleVersion != newVersion {
-						// Do not clean up state if the eval loop has just started.
-						// We need to reset state if the loop has started and the alert is already paused. It can happen,
-						// if we have an alert with state and we do file provision with stateful Grafana, that state
-						// lingers in DB and won't be cleaned up until next alert rule update.
-						if currentRuleVersion > 0 || isPaused {
-							logger.Debug("Got a new version of alert rule. Clear up the state and refresh extra labels", "version", currentRuleVersion, "newVersion", newVersion)
-							resetState(grafanaCtx, isPaused)
-						}
-						currentRuleVersion = newVersion
+					f := ruleWithFolder{ctx.rule, ctx.folderTitle}.Fingerprint()
+					// Do not clean up state if the eval loop has just started.
+					var needReset bool
+					if currentFingerprint != 0 && currentFingerprint != f {
+						logger.Debug("Got a new version of alert rule. Clear up the state", "fingerprint", f)
+						needReset = true
 					}
+					// We need to reset state if the loop has started and the alert is already paused. It can happen,
+					// if we have an alert with state and we do file provision with stateful Grafana, that state
+					// lingers in DB and won't be cleaned up until next alert rule update.
+					needReset = needReset || (currentFingerprint == 0 && isPaused)
+					if needReset {
+						resetState(grafanaCtx, isPaused)
+					}
+					currentFingerprint = f
 					if isPaused {
+						logger.Debug("Skip rule evaluation because it is paused")
 						return nil
 					}
 					tracingCtx, span := sch.tracer.Start(grafanaCtx, "alert rule execution")
@@ -509,10 +511,12 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 					span.SetAttributes("rule_uid", ctx.rule.UID, attribute.String("rule_uid", ctx.rule.UID))
 					span.SetAttributes("org_id", ctx.rule.OrgID, attribute.Int64("org_id", ctx.rule.OrgID))
 					span.SetAttributes("rule_version", ctx.rule.Version, attribute.Int64("rule_version", ctx.rule.Version))
+					fpStr := currentFingerprint.String()
+					span.SetAttributes("rule_fingerprint", fpStr, attribute.String("rule_fingerprint", fpStr))
 					utcTick := ctx.scheduledAt.UTC().Format(time.RFC3339Nano)
 					span.SetAttributes("tick", utcTick, attribute.String("tick", utcTick))

-					evaluate(tracingCtx, attempt, ctx, span)
+					evaluate(tracingCtx, f, attempt, ctx, span)
 					return nil
 				})
 				if err != nil {