mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Add alert pausing feature (#60734)
* Add field in alert_rule model, add state to alert_instance model, and state to eval * Remove paused state from eval package * Skip paused alert rules in scheduler * Add migration to add is_paused field to alert_rule table * Convert to postable alerts only if not normal, pernding, or paused * Handle paused eval results in state manager * Add Paused state to eval package * Add paused alerts logic in scheduler * Skip alert on scheduler * Remove paused status from eval package * Apply suggestions from code review Co-authored-by: George Robinson <george.robinson@grafana.com> * Remove state * Rethink schedule and manager for paused alerts * Change return to continue * Remove unused var * Rethink alert pausing * Paused alerts storing annotations * Only add one state transition * Revert boolean method renaming refactor * Revert take image refactor * Make registry errors public * Revert method extraction for getting a folder title * Revert variable renaming refactor * Undo unnecessary changes * Revert changes in test * Remove IsPause check in PatchPartiLAlertRule function * Use SetNormal to set state * Fix text by returning to old behaviour on alert rule deletion * Add test in schedule_unit_test.go to test ticks with paused alerts * Add coment to clarify usage of context.Background() * Add comment to clarify resetStateByRuleUID method usage * Move rule get to a more limited scope * Update pkg/services/ngalert/schedule/schedule.go Co-authored-by: George Robinson <george.robinson@grafana.com> * rum gofmt on pkg/services/ngalert/schedule/schedule.go * Remove defer cancel for context * Update pkg/services/ngalert/models/instance_test.go Co-authored-by: Santiago <santiagohernandez.1997@gmail.com> * Update pkg/services/ngalert/models/testing.go Co-authored-by: Santiago <santiagohernandez.1997@gmail.com> * Update pkg/services/ngalert/schedule/schedule_unit_test.go Co-authored-by: Santiago <santiagohernandez.1997@gmail.com> * Update pkg/services/ngalert/schedule/schedule_unit_test.go Co-authored-by: Santiago <santiagohernandez.1997@gmail.com> * Update pkg/services/ngalert/models/instance_test.go Co-authored-by: Santiago <santiagohernandez.1997@gmail.com> * skip scheduler rule state clean up on paused alert rule * Update pkg/services/ngalert/schedule/schedule.go Co-authored-by: Santiago <santiagohernandez.1997@gmail.com> * Fix mock in test * Add (hopefully) final suggestions * Use error channel from recordAnnotationsSync to cancel context * Run make gen-cue * Place pause alert check in channel update after version check * Reduce branching un update channel select * Add if for error and move code inside if in state manager ResetStateByRuleUID * Add reason to logs * Update pkg/services/ngalert/schedule/schedule.go Co-authored-by: George Robinson <george.robinson@grafana.com> * Do not delete alert rule routine, just exit on eval if is paused * Reduce branching and create-close a channel to avoid deadlocks * Separate state deletion and state reset (includes history saving) * Add current pause state in rule route in scheduler * Split clearState and bring errCh closer to RecordStatesAsync call * Change rule to ruleMeta in RecordStatesAsync * copy state to be able to modify it * Add timeout to context creation * Shorten the timeout * Use resetState is rule is paused and deleteState if rule is not paused * Remove Empty state reason * Save every rule change in historian * Add tests for DeleteStateByRuleUID and ResetStateByRuleUID * Remove useless line * Remove outdated comment Co-authored-by: George Robinson <george.robinson@grafana.com> Co-authored-by: Santiago <santiagohernandez.1997@gmail.com> Co-authored-by: Armand Grillet <2117580+armandgrillet@users.noreply.github.com>
This commit is contained in:
@@ -37,7 +37,7 @@ type ScheduleService interface {
|
||||
// an error. The scheduler is terminated when this function returns.
|
||||
Run(context.Context) error
|
||||
// UpdateAlertRule notifies scheduler that a rule has been changed
|
||||
UpdateAlertRule(key ngmodels.AlertRuleKey, lastVersion int64)
|
||||
UpdateAlertRule(key ngmodels.AlertRuleKey, lastVersion int64, isPaused bool)
|
||||
// DeleteAlertRule notifies scheduler that rules have been deleted
|
||||
DeleteAlertRule(keys ...ngmodels.AlertRuleKey)
|
||||
}
|
||||
@@ -150,12 +150,12 @@ func (sch *schedule) Run(ctx context.Context) error {
|
||||
}
|
||||
|
||||
// UpdateAlertRule looks for the active rule evaluation and commands it to update the rule
|
||||
func (sch *schedule) UpdateAlertRule(key ngmodels.AlertRuleKey, lastVersion int64) {
|
||||
func (sch *schedule) UpdateAlertRule(key ngmodels.AlertRuleKey, lastVersion int64, isPaused bool) {
|
||||
ruleInfo, err := sch.registry.get(key)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
ruleInfo.update(ruleVersion(lastVersion))
|
||||
ruleInfo.update(ruleVersionAndPauseStatus{ruleVersion(lastVersion), isPaused})
|
||||
}
|
||||
|
||||
// DeleteAlertRule stops evaluation of the rule, deletes it from active rules, and cleans up state cache.
|
||||
@@ -314,7 +314,7 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
|
||||
return readyToRun, registeredDefinitions
|
||||
}
|
||||
|
||||
func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertRuleKey, evalCh <-chan *evaluation, updateCh <-chan ruleVersion) error {
|
||||
func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertRuleKey, evalCh <-chan *evaluation, updateCh <-chan ruleVersionAndPauseStatus) error {
|
||||
grafanaCtx = ngmodels.WithRuleKey(grafanaCtx, key)
|
||||
logger := sch.log.FromContext(grafanaCtx)
|
||||
logger.Debug("Alert rule routine started")
|
||||
@@ -324,14 +324,23 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
|
||||
evalDuration := sch.metrics.EvalDuration.WithLabelValues(orgID)
|
||||
evalTotalFailures := sch.metrics.EvalFailures.WithLabelValues(orgID)
|
||||
|
||||
clearState := func() {
|
||||
states := sch.stateManager.ResetStateByRuleUID(grafanaCtx, key)
|
||||
notify := func(states []*state.State) {
|
||||
expiredAlerts := FromAlertsStateToStoppedAlert(states, sch.appURL, sch.clock)
|
||||
if len(expiredAlerts.PostableAlerts) > 0 {
|
||||
sch.alertsSender.Send(key, expiredAlerts)
|
||||
}
|
||||
}
|
||||
|
||||
resetState := func(ctx context.Context, isPaused bool) {
|
||||
rule := sch.schedulableAlertRules.get(key)
|
||||
reason := ngmodels.StateReasonUpdated
|
||||
if isPaused {
|
||||
reason = ngmodels.StateReasonPaused
|
||||
}
|
||||
states := sch.stateManager.ResetStateByRuleUID(ctx, rule, reason)
|
||||
notify(states)
|
||||
}
|
||||
|
||||
evaluate := func(ctx context.Context, attempt int64, e *evaluation, span tracing.Span) {
|
||||
logger := logger.New("version", e.rule.Version, "attempt", attempt, "now", e.scheduledAt)
|
||||
start := sch.clock.Now()
|
||||
@@ -431,18 +440,19 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
|
||||
for {
|
||||
select {
|
||||
// used by external services (API) to notify that rule is updated.
|
||||
case lastVersion := <-updateCh:
|
||||
case ctx := <-updateCh:
|
||||
// sometimes it can happen when, for example, the rule evaluation took so long,
|
||||
// and there were two concurrent messages in updateCh and evalCh, and the eval's one got processed first.
|
||||
// therefore, at the time when message from updateCh is processed the current rule will have
|
||||
// at least the same version (or greater) and the state created for the new version of the rule.
|
||||
if currentRuleVersion >= int64(lastVersion) {
|
||||
logger.Info("Skip updating rule because its current version is actual", "version", currentRuleVersion, "newVersion", lastVersion)
|
||||
if currentRuleVersion >= int64(ctx.Version) {
|
||||
logger.Info("Skip updating rule because its current version is actual", "version", currentRuleVersion, "newVersion", ctx.Version)
|
||||
continue
|
||||
}
|
||||
logger.Info("Clearing the state of the rule because version has changed", "version", currentRuleVersion, "newVersion", lastVersion)
|
||||
|
||||
logger.Info("Clearing the state of the rule because it was updated", "version", currentRuleVersion, "newVersion", ctx.Version, "isPaused", ctx.IsPaused)
|
||||
// clear the state. So the next evaluation will start from the scratch.
|
||||
clearState()
|
||||
resetState(grafanaCtx, ctx.IsPaused)
|
||||
// evalCh - used by the scheduler to signal that evaluation is needed.
|
||||
case ctx, ok := <-evalCh:
|
||||
if !ok {
|
||||
@@ -462,14 +472,18 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
|
||||
|
||||
err := retryIfError(func(attempt int64) error {
|
||||
newVersion := ctx.rule.Version
|
||||
isPaused := ctx.rule.IsPaused
|
||||
// fetch latest alert rule version
|
||||
if currentRuleVersion != newVersion {
|
||||
if currentRuleVersion > 0 { // do not clean up state if the eval loop has just started.
|
||||
logger.Debug("Got a new version of alert rule. Clear up the state and refresh extra labels", "version", currentRuleVersion, "newVersion", newVersion)
|
||||
clearState()
|
||||
resetState(grafanaCtx, isPaused)
|
||||
}
|
||||
currentRuleVersion = newVersion
|
||||
}
|
||||
if isPaused {
|
||||
return nil
|
||||
}
|
||||
tracingCtx, span := sch.tracer.Start(grafanaCtx, "alert rule execution")
|
||||
defer span.End()
|
||||
|
||||
@@ -489,7 +503,13 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
|
||||
case <-grafanaCtx.Done():
|
||||
// clean up the state only if the reason for stopping the evaluation loop is that the rule was deleted
|
||||
if errors.Is(grafanaCtx.Err(), errRuleDeleted) {
|
||||
clearState()
|
||||
// We do not want a context to be unbounded which could potentially cause a go routine running
|
||||
// indefinitely. 1 minute is an almost randomly chosen timeout, big enough to cover the majority of the
|
||||
// cases.
|
||||
ctx, cancelFunc := context.WithTimeout(context.Background(), time.Minute)
|
||||
defer cancelFunc()
|
||||
states := sch.stateManager.DeleteStateByRuleUID(ngmodels.WithRuleKey(ctx, key), key)
|
||||
notify(states)
|
||||
}
|
||||
logger.Debug("Stopping alert rule routine")
|
||||
return nil
|
||||
|
||||
Reference in New Issue
Block a user