Alerting: Scheduler use rule fingerprint instead of version (#66531)

* implement calculation of fingerprint for ruleWithFolder
* update scheduler to use fingerprint instead of rule's version
This commit is contained in:
Yuri Tseretyan
2023-04-28 10:42:16 -04:00
committed by GitHub
parent ffd83a027d
commit 9eb10bee1f
5 changed files with 367 additions and 83 deletions

View File

@@ -271,16 +271,18 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
itemFrequency := item.IntervalSeconds / int64(sch.baseInterval.Seconds())
isReadyToRun := item.IntervalSeconds != 0 && tickNum%itemFrequency == 0
if isReadyToRun {
var folderTitle string
if !sch.disableGrafanaFolder {
title, ok := folderTitles[item.NamespaceUID]
if ok {
folderTitle = title
} else {
missingFolder[item.NamespaceUID] = append(missingFolder[item.NamespaceUID], item.UID)
}
var folderTitle string
if !sch.disableGrafanaFolder {
title, ok := folderTitles[item.NamespaceUID]
if ok {
folderTitle = title
} else {
missingFolder[item.NamespaceUID] = append(missingFolder[item.NamespaceUID], item.UID)
}
}
if isReadyToRun {
readyToRun = append(readyToRun, readyToRunItem{ruleInfo: ruleInfo, evaluation: evaluation{
scheduledAt: tick,
rule: item,
@@ -292,8 +294,8 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
sch.log.Debug("Rule has been updated. Notifying evaluation routine", key.LogContext()...)
go func(ri *alertRuleInfo, rule *ngmodels.AlertRule) {
ri.update(ruleVersionAndPauseStatus{
Version: ruleVersion(rule.Version),
IsPaused: rule.IsPaused,
Fingerprint: ruleWithFolder{rule: rule, folderTitle: folderTitle}.Fingerprint(),
IsPaused: rule.IsPaused,
})
}(ruleInfo, item)
updatedRules = append(updatedRules, ngmodels.AlertRuleKeyWithVersion{
@@ -369,8 +371,8 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
notify(states)
}
evaluate := func(ctx context.Context, attempt int64, e *evaluation, span tracing.Span) {
logger := logger.New("version", e.rule.Version, "attempt", attempt, "now", e.scheduledAt)
evaluate := func(ctx context.Context, f fingerprint, attempt int64, e *evaluation, span tracing.Span) {
logger := logger.New("version", e.rule.Version, "fingerprint", f, "attempt", attempt, "now", e.scheduledAt)
start := sch.clock.Now()
evalCtx := eval.NewContext(ctx, SchedulerUserFor(e.rule.OrgID))
@@ -450,24 +452,21 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
}
evalRunning := false
var currentRuleVersion int64 = 0
var currentFingerprint fingerprint
defer sch.stopApplied(key)
for {
select {
// used by external services (API) to notify that rule is updated.
case ctx := <-updateCh:
// sometimes it can happen when, for example, the rule evaluation took so long,
// and there were two concurrent messages in updateCh and evalCh, and the eval's one got processed first.
// therefore, at the time when message from updateCh is processed the current rule will have
// at least the same version (or greater) and the state created for the new version of the rule.
if currentRuleVersion >= int64(ctx.Version) {
logger.Info("Skip updating rule because its current version is actual", "version", currentRuleVersion, "newVersion", ctx.Version)
if currentFingerprint == ctx.Fingerprint {
logger.Info("Rule's fingerprint has not changed. Skip resetting the state", "currentFingerprint", currentFingerprint)
continue
}
logger.Info("Clearing the state of the rule because it was updated", "version", currentRuleVersion, "newVersion", ctx.Version, "isPaused", ctx.IsPaused)
logger.Info("Clearing the state of the rule because it was updated", "isPaused", ctx.IsPaused, "fingerprint", ctx.Fingerprint)
// clear the state. So the next evaluation will start from the scratch.
resetState(grafanaCtx, ctx.IsPaused)
currentFingerprint = ctx.Fingerprint
// evalCh - used by the scheduler to signal that evaluation is needed.
case ctx, ok := <-evalCh:
if !ok {
@@ -486,21 +485,24 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
}()
err := retryIfError(func(attempt int64) error {
newVersion := ctx.rule.Version
isPaused := ctx.rule.IsPaused
// fetch latest alert rule version
if currentRuleVersion != newVersion {
// Do not clean up state if the eval loop has just started.
// We need to reset state if the loop has started and the alert is already paused. It can happen,
// if we have an alert with state and we do file provision with stateful Grafana, that state
// lingers in DB and won't be cleaned up until next alert rule update.
if currentRuleVersion > 0 || isPaused {
logger.Debug("Got a new version of alert rule. Clear up the state and refresh extra labels", "version", currentRuleVersion, "newVersion", newVersion)
resetState(grafanaCtx, isPaused)
}
currentRuleVersion = newVersion
f := ruleWithFolder{ctx.rule, ctx.folderTitle}.Fingerprint()
// Do not clean up state if the eval loop has just started.
var needReset bool
if currentFingerprint != 0 && currentFingerprint != f {
logger.Debug("Got a new version of alert rule. Clear up the state", "fingerprint", f)
needReset = true
}
// We need to reset state if the loop has started and the alert is already paused. It can happen,
// if we have an alert with state and we do file provision with stateful Grafana, that state
// lingers in DB and won't be cleaned up until next alert rule update.
needReset = needReset || (currentFingerprint == 0 && isPaused)
if needReset {
resetState(grafanaCtx, isPaused)
}
currentFingerprint = f
if isPaused {
logger.Debug("Skip rule evaluation because it is paused")
return nil
}
tracingCtx, span := sch.tracer.Start(grafanaCtx, "alert rule execution")
@@ -509,10 +511,12 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
span.SetAttributes("rule_uid", ctx.rule.UID, attribute.String("rule_uid", ctx.rule.UID))
span.SetAttributes("org_id", ctx.rule.OrgID, attribute.Int64("org_id", ctx.rule.OrgID))
span.SetAttributes("rule_version", ctx.rule.Version, attribute.Int64("rule_version", ctx.rule.Version))
fpStr := currentFingerprint.String()
span.SetAttributes("rule_fingerprint", fpStr, attribute.String("rule_fingerprint", fpStr))
utcTick := ctx.scheduledAt.UTC().Format(time.RFC3339Nano)
span.SetAttributes("tick", utcTick, attribute.String("tick", utcTick))
evaluate(tracingCtx, attempt, ctx, span)
evaluate(tracingCtx, f, attempt, ctx, span)
return nil
})
if err != nil {