Alerting: Add alert pausing feature (#60734)

* Add field in alert_rule model, add state to alert_instance model, and state to eval

* Remove paused state from eval package

* Skip paused alert rules in scheduler

* Add migration to add is_paused field to alert_rule table

* Convert to postable alerts only if not normal, pernding, or paused

* Handle paused eval results in state manager

* Add Paused state to eval package

* Add paused alerts logic in scheduler

* Skip alert on scheduler

* Remove paused status from eval package

* Apply suggestions from code review

Co-authored-by: George Robinson <george.robinson@grafana.com>

* Remove state

* Rethink schedule and manager for paused alerts

* Change return to continue

* Remove unused var

* Rethink alert pausing

* Paused alerts storing annotations

* Only add one state transition

* Revert boolean method renaming refactor

* Revert take image refactor

* Make registry errors public

* Revert method extraction for getting a folder title

* Revert variable renaming refactor

* Undo unnecessary changes

* Revert changes in test

* Remove IsPause check in PatchPartiLAlertRule function

* Use SetNormal to set state

* Fix text by returning to old behaviour on alert rule deletion

* Add test in schedule_unit_test.go to test ticks with paused alerts

* Add coment to clarify usage of context.Background()

* Add comment to clarify resetStateByRuleUID method usage

* Move rule get to a more limited scope

* Update pkg/services/ngalert/schedule/schedule.go

Co-authored-by: George Robinson <george.robinson@grafana.com>

* rum gofmt on pkg/services/ngalert/schedule/schedule.go

* Remove defer cancel for context

* Update pkg/services/ngalert/models/instance_test.go

Co-authored-by: Santiago <santiagohernandez.1997@gmail.com>

* Update pkg/services/ngalert/models/testing.go

Co-authored-by: Santiago <santiagohernandez.1997@gmail.com>

* Update pkg/services/ngalert/schedule/schedule_unit_test.go

Co-authored-by: Santiago <santiagohernandez.1997@gmail.com>

* Update pkg/services/ngalert/schedule/schedule_unit_test.go

Co-authored-by: Santiago <santiagohernandez.1997@gmail.com>

* Update pkg/services/ngalert/models/instance_test.go

Co-authored-by: Santiago <santiagohernandez.1997@gmail.com>

* skip scheduler rule state clean up on paused alert rule

* Update pkg/services/ngalert/schedule/schedule.go

Co-authored-by: Santiago <santiagohernandez.1997@gmail.com>

* Fix mock in test

* Add (hopefully) final suggestions

* Use error channel from recordAnnotationsSync to cancel context

* Run make gen-cue

* Place pause alert check in channel update after version check

* Reduce branching un update channel select

* Add if for error and move code inside if in state manager ResetStateByRuleUID

* Add reason to logs

* Update pkg/services/ngalert/schedule/schedule.go

Co-authored-by: George Robinson <george.robinson@grafana.com>

* Do not delete alert rule routine, just exit on eval if is paused

* Reduce branching and create-close a channel to avoid deadlocks

* Separate state deletion and state reset (includes history saving)

* Add current pause state in rule route in scheduler

* Split clearState and bring errCh closer to RecordStatesAsync call

* Change rule to ruleMeta in RecordStatesAsync

* copy state to be able to modify it

* Add timeout to context creation

* Shorten the timeout

* Use resetState is rule is paused and deleteState if rule is not paused

* Remove Empty state reason

* Save every rule change in historian

* Add tests for DeleteStateByRuleUID and ResetStateByRuleUID

* Remove useless line

* Remove outdated comment

Co-authored-by: George Robinson <george.robinson@grafana.com>
Co-authored-by: Santiago <santiagohernandez.1997@gmail.com>
Co-authored-by: Armand Grillet <2117580+armandgrillet@users.noreply.github.com>
This commit is contained in:
Alex Moreno
2023-01-26 18:29:10 +01:00
committed by GitHub
parent 039b30b1ea
commit 531b439cf1
19 changed files with 554 additions and 62 deletions

View File

@@ -37,7 +37,7 @@ type ScheduleService interface {
// an error. The scheduler is terminated when this function returns.
Run(context.Context) error
// UpdateAlertRule notifies scheduler that a rule has been changed
UpdateAlertRule(key ngmodels.AlertRuleKey, lastVersion int64)
UpdateAlertRule(key ngmodels.AlertRuleKey, lastVersion int64, isPaused bool)
// DeleteAlertRule notifies scheduler that rules have been deleted
DeleteAlertRule(keys ...ngmodels.AlertRuleKey)
}
@@ -150,12 +150,12 @@ func (sch *schedule) Run(ctx context.Context) error {
}
// UpdateAlertRule looks for the active rule evaluation and commands it to update the rule
func (sch *schedule) UpdateAlertRule(key ngmodels.AlertRuleKey, lastVersion int64) {
func (sch *schedule) UpdateAlertRule(key ngmodels.AlertRuleKey, lastVersion int64, isPaused bool) {
ruleInfo, err := sch.registry.get(key)
if err != nil {
return
}
ruleInfo.update(ruleVersion(lastVersion))
ruleInfo.update(ruleVersionAndPauseStatus{ruleVersion(lastVersion), isPaused})
}
// DeleteAlertRule stops evaluation of the rule, deletes it from active rules, and cleans up state cache.
@@ -314,7 +314,7 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
return readyToRun, registeredDefinitions
}
func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertRuleKey, evalCh <-chan *evaluation, updateCh <-chan ruleVersion) error {
func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertRuleKey, evalCh <-chan *evaluation, updateCh <-chan ruleVersionAndPauseStatus) error {
grafanaCtx = ngmodels.WithRuleKey(grafanaCtx, key)
logger := sch.log.FromContext(grafanaCtx)
logger.Debug("Alert rule routine started")
@@ -324,14 +324,23 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
evalDuration := sch.metrics.EvalDuration.WithLabelValues(orgID)
evalTotalFailures := sch.metrics.EvalFailures.WithLabelValues(orgID)
clearState := func() {
states := sch.stateManager.ResetStateByRuleUID(grafanaCtx, key)
notify := func(states []*state.State) {
expiredAlerts := FromAlertsStateToStoppedAlert(states, sch.appURL, sch.clock)
if len(expiredAlerts.PostableAlerts) > 0 {
sch.alertsSender.Send(key, expiredAlerts)
}
}
resetState := func(ctx context.Context, isPaused bool) {
rule := sch.schedulableAlertRules.get(key)
reason := ngmodels.StateReasonUpdated
if isPaused {
reason = ngmodels.StateReasonPaused
}
states := sch.stateManager.ResetStateByRuleUID(ctx, rule, reason)
notify(states)
}
evaluate := func(ctx context.Context, attempt int64, e *evaluation, span tracing.Span) {
logger := logger.New("version", e.rule.Version, "attempt", attempt, "now", e.scheduledAt)
start := sch.clock.Now()
@@ -431,18 +440,19 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
for {
select {
// used by external services (API) to notify that rule is updated.
case lastVersion := <-updateCh:
case ctx := <-updateCh:
// sometimes it can happen when, for example, the rule evaluation took so long,
// and there were two concurrent messages in updateCh and evalCh, and the eval's one got processed first.
// therefore, at the time when message from updateCh is processed the current rule will have
// at least the same version (or greater) and the state created for the new version of the rule.
if currentRuleVersion >= int64(lastVersion) {
logger.Info("Skip updating rule because its current version is actual", "version", currentRuleVersion, "newVersion", lastVersion)
if currentRuleVersion >= int64(ctx.Version) {
logger.Info("Skip updating rule because its current version is actual", "version", currentRuleVersion, "newVersion", ctx.Version)
continue
}
logger.Info("Clearing the state of the rule because version has changed", "version", currentRuleVersion, "newVersion", lastVersion)
logger.Info("Clearing the state of the rule because it was updated", "version", currentRuleVersion, "newVersion", ctx.Version, "isPaused", ctx.IsPaused)
// clear the state. So the next evaluation will start from the scratch.
clearState()
resetState(grafanaCtx, ctx.IsPaused)
// evalCh - used by the scheduler to signal that evaluation is needed.
case ctx, ok := <-evalCh:
if !ok {
@@ -462,14 +472,18 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
err := retryIfError(func(attempt int64) error {
newVersion := ctx.rule.Version
isPaused := ctx.rule.IsPaused
// fetch latest alert rule version
if currentRuleVersion != newVersion {
if currentRuleVersion > 0 { // do not clean up state if the eval loop has just started.
logger.Debug("Got a new version of alert rule. Clear up the state and refresh extra labels", "version", currentRuleVersion, "newVersion", newVersion)
clearState()
resetState(grafanaCtx, isPaused)
}
currentRuleVersion = newVersion
}
if isPaused {
return nil
}
tracingCtx, span := sch.tracer.Start(grafanaCtx, "alert rule execution")
defer span.End()
@@ -489,7 +503,13 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
case <-grafanaCtx.Done():
// clean up the state only if the reason for stopping the evaluation loop is that the rule was deleted
if errors.Is(grafanaCtx.Err(), errRuleDeleted) {
clearState()
// We do not want a context to be unbounded which could potentially cause a go routine running
// indefinitely. 1 minute is an almost randomly chosen timeout, big enough to cover the majority of the
// cases.
ctx, cancelFunc := context.WithTimeout(context.Background(), time.Minute)
defer cancelFunc()
states := sch.stateManager.DeleteStateByRuleUID(ngmodels.WithRuleKey(ctx, key), key)
notify(states)
}
logger.Debug("Stopping alert rule routine")
return nil