mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Scheduler use rule fingerprint instead of version (#66531)
* implement calculation of fingerprint for ruleWithFolder * update scheduler to use fingerprint instead of rule's version
This commit is contained in:
@@ -271,16 +271,18 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
|
||||
|
||||
itemFrequency := item.IntervalSeconds / int64(sch.baseInterval.Seconds())
|
||||
isReadyToRun := item.IntervalSeconds != 0 && tickNum%itemFrequency == 0
|
||||
if isReadyToRun {
|
||||
var folderTitle string
|
||||
if !sch.disableGrafanaFolder {
|
||||
title, ok := folderTitles[item.NamespaceUID]
|
||||
if ok {
|
||||
folderTitle = title
|
||||
} else {
|
||||
missingFolder[item.NamespaceUID] = append(missingFolder[item.NamespaceUID], item.UID)
|
||||
}
|
||||
|
||||
var folderTitle string
|
||||
if !sch.disableGrafanaFolder {
|
||||
title, ok := folderTitles[item.NamespaceUID]
|
||||
if ok {
|
||||
folderTitle = title
|
||||
} else {
|
||||
missingFolder[item.NamespaceUID] = append(missingFolder[item.NamespaceUID], item.UID)
|
||||
}
|
||||
}
|
||||
|
||||
if isReadyToRun {
|
||||
readyToRun = append(readyToRun, readyToRunItem{ruleInfo: ruleInfo, evaluation: evaluation{
|
||||
scheduledAt: tick,
|
||||
rule: item,
|
||||
@@ -292,8 +294,8 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
|
||||
sch.log.Debug("Rule has been updated. Notifying evaluation routine", key.LogContext()...)
|
||||
go func(ri *alertRuleInfo, rule *ngmodels.AlertRule) {
|
||||
ri.update(ruleVersionAndPauseStatus{
|
||||
Version: ruleVersion(rule.Version),
|
||||
IsPaused: rule.IsPaused,
|
||||
Fingerprint: ruleWithFolder{rule: rule, folderTitle: folderTitle}.Fingerprint(),
|
||||
IsPaused: rule.IsPaused,
|
||||
})
|
||||
}(ruleInfo, item)
|
||||
updatedRules = append(updatedRules, ngmodels.AlertRuleKeyWithVersion{
|
||||
@@ -369,8 +371,8 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
|
||||
notify(states)
|
||||
}
|
||||
|
||||
evaluate := func(ctx context.Context, attempt int64, e *evaluation, span tracing.Span) {
|
||||
logger := logger.New("version", e.rule.Version, "attempt", attempt, "now", e.scheduledAt)
|
||||
evaluate := func(ctx context.Context, f fingerprint, attempt int64, e *evaluation, span tracing.Span) {
|
||||
logger := logger.New("version", e.rule.Version, "fingerprint", f, "attempt", attempt, "now", e.scheduledAt)
|
||||
start := sch.clock.Now()
|
||||
|
||||
evalCtx := eval.NewContext(ctx, SchedulerUserFor(e.rule.OrgID))
|
||||
@@ -450,24 +452,21 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
|
||||
}
|
||||
|
||||
evalRunning := false
|
||||
var currentRuleVersion int64 = 0
|
||||
var currentFingerprint fingerprint
|
||||
defer sch.stopApplied(key)
|
||||
for {
|
||||
select {
|
||||
// used by external services (API) to notify that rule is updated.
|
||||
case ctx := <-updateCh:
|
||||
// sometimes it can happen when, for example, the rule evaluation took so long,
|
||||
// and there were two concurrent messages in updateCh and evalCh, and the eval's one got processed first.
|
||||
// therefore, at the time when message from updateCh is processed the current rule will have
|
||||
// at least the same version (or greater) and the state created for the new version of the rule.
|
||||
if currentRuleVersion >= int64(ctx.Version) {
|
||||
logger.Info("Skip updating rule because its current version is actual", "version", currentRuleVersion, "newVersion", ctx.Version)
|
||||
if currentFingerprint == ctx.Fingerprint {
|
||||
logger.Info("Rule's fingerprint has not changed. Skip resetting the state", "currentFingerprint", currentFingerprint)
|
||||
continue
|
||||
}
|
||||
|
||||
logger.Info("Clearing the state of the rule because it was updated", "version", currentRuleVersion, "newVersion", ctx.Version, "isPaused", ctx.IsPaused)
|
||||
logger.Info("Clearing the state of the rule because it was updated", "isPaused", ctx.IsPaused, "fingerprint", ctx.Fingerprint)
|
||||
// clear the state. So the next evaluation will start from the scratch.
|
||||
resetState(grafanaCtx, ctx.IsPaused)
|
||||
currentFingerprint = ctx.Fingerprint
|
||||
// evalCh - used by the scheduler to signal that evaluation is needed.
|
||||
case ctx, ok := <-evalCh:
|
||||
if !ok {
|
||||
@@ -486,21 +485,24 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
|
||||
}()
|
||||
|
||||
err := retryIfError(func(attempt int64) error {
|
||||
newVersion := ctx.rule.Version
|
||||
isPaused := ctx.rule.IsPaused
|
||||
// fetch latest alert rule version
|
||||
if currentRuleVersion != newVersion {
|
||||
// Do not clean up state if the eval loop has just started.
|
||||
// We need to reset state if the loop has started and the alert is already paused. It can happen,
|
||||
// if we have an alert with state and we do file provision with stateful Grafana, that state
|
||||
// lingers in DB and won't be cleaned up until next alert rule update.
|
||||
if currentRuleVersion > 0 || isPaused {
|
||||
logger.Debug("Got a new version of alert rule. Clear up the state and refresh extra labels", "version", currentRuleVersion, "newVersion", newVersion)
|
||||
resetState(grafanaCtx, isPaused)
|
||||
}
|
||||
currentRuleVersion = newVersion
|
||||
f := ruleWithFolder{ctx.rule, ctx.folderTitle}.Fingerprint()
|
||||
// Do not clean up state if the eval loop has just started.
|
||||
var needReset bool
|
||||
if currentFingerprint != 0 && currentFingerprint != f {
|
||||
logger.Debug("Got a new version of alert rule. Clear up the state", "fingerprint", f)
|
||||
needReset = true
|
||||
}
|
||||
// We need to reset state if the loop has started and the alert is already paused. It can happen,
|
||||
// if we have an alert with state and we do file provision with stateful Grafana, that state
|
||||
// lingers in DB and won't be cleaned up until next alert rule update.
|
||||
needReset = needReset || (currentFingerprint == 0 && isPaused)
|
||||
if needReset {
|
||||
resetState(grafanaCtx, isPaused)
|
||||
}
|
||||
currentFingerprint = f
|
||||
if isPaused {
|
||||
logger.Debug("Skip rule evaluation because it is paused")
|
||||
return nil
|
||||
}
|
||||
tracingCtx, span := sch.tracer.Start(grafanaCtx, "alert rule execution")
|
||||
@@ -509,10 +511,12 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
|
||||
span.SetAttributes("rule_uid", ctx.rule.UID, attribute.String("rule_uid", ctx.rule.UID))
|
||||
span.SetAttributes("org_id", ctx.rule.OrgID, attribute.Int64("org_id", ctx.rule.OrgID))
|
||||
span.SetAttributes("rule_version", ctx.rule.Version, attribute.Int64("rule_version", ctx.rule.Version))
|
||||
fpStr := currentFingerprint.String()
|
||||
span.SetAttributes("rule_fingerprint", fpStr, attribute.String("rule_fingerprint", fpStr))
|
||||
utcTick := ctx.scheduledAt.UTC().Format(time.RFC3339Nano)
|
||||
span.SetAttributes("tick", utcTick, attribute.String("tick", utcTick))
|
||||
|
||||
evaluate(tracingCtx, attempt, ctx, span)
|
||||
evaluate(tracingCtx, f, attempt, ctx, span)
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
|
||||
Reference in New Issue
Block a user