Alerting: Delete state from the database on reset (#53919)

* make ResetStateByRuleUID return the removed states
* delete rule states from the database when the state is reset
* rule evaluation routine cleans up the state only when the rule is deleted (see the cancel-with-cause sketch below)
Yuriy Tseretyan 2022-08-25 14:12:22 -04:00 committed by GitHub
parent 5c0bf2ba39
commit 03e746d9df
8 changed files with 101 additions and 44 deletions
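
The changes below replace the rule routine's plain context.CancelFunc with a stop function that records why the routine was stopped, so the shutdown path can tell "rule deleted" apart from an ordinary scheduler shutdown. As orientation only, a cancel-with-cause helper along the lines of the util.WithCancelCause used in the diff could look like the sketch below. The actual helper in the Grafana codebase may be implemented differently; the Err() behaviour (returning the recorded reason so that errors.Is(ctx.Err(), errRuleDeleted) matches) is an assumption read off the hunks that follow.

package util

import (
    "context"
    "sync"
)

// cancelCauseCtx is a context whose Err() reports the reason passed to the
// stop function, so callers can do errors.Is(ctx.Err(), someSentinel).
// Sketch only: the real util.WithCancelCause may differ.
type cancelCauseCtx struct {
    context.Context
    mu    sync.Mutex
    cause error
}

// WithCancelCause returns a child context and a stop function that accepts a
// reason. Calling stop(nil) behaves like a regular cancel.
func WithCancelCause(parent context.Context) (context.Context, func(reason error)) {
    inner, cancel := context.WithCancel(parent)
    c := &cancelCauseCtx{Context: inner}
    stop := func(reason error) {
        c.mu.Lock()
        // Record the reason only once, and only if nothing (including the
        // parent context) has cancelled the context yet.
        if c.cause == nil && inner.Err() == nil {
            c.cause = reason
        }
        c.mu.Unlock()
        cancel()
    }
    return c, stop
}

func (c *cancelCauseCtx) Err() error {
    c.mu.Lock()
    defer c.mu.Unlock()
    if c.Context.Err() != nil && c.cause != nil {
        return c.cause
    }
    return c.Context.Err()
}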


@@ -2,13 +2,17 @@ package schedule
import (
"context"
"errors"
"fmt"
"sync"
"time"
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/util"
)
var errRuleDeleted = errors.New("rule deleted")
type alertRuleInfoRegistry struct {
mu sync.Mutex
alertRuleInfo map[models.AlertRuleKey]*alertRuleInfo
@@ -78,12 +82,12 @@ type alertRuleInfo struct {
evalCh chan *evaluation
updateCh chan ruleVersion
ctx context.Context
stop context.CancelFunc
stop func(reason error)
}
func newAlertRuleInfo(parent context.Context) *alertRuleInfo {
ctx, cancel := context.WithCancel(parent)
return &alertRuleInfo{evalCh: make(chan *evaluation), updateCh: make(chan ruleVersion), ctx: ctx, stop: cancel}
ctx, stop := util.WithCancelCause(parent)
return &alertRuleInfo{evalCh: make(chan *evaluation), updateCh: make(chan ruleVersion), ctx: ctx, stop: stop}
}
// eval signals the rule evaluation routine to perform the evaluation of the rule. Does nothing if the loop is stopped.
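
The body of eval is outside this hunk. Conceptually, "does nothing if the loop is stopped" means the send on evalCh is raced against the routine's context; a simplified sketch of that pattern follows (the real method also reports a dropped, previously queued evaluation, which is omitted here, and the exact shape of the evaluation struct is an assumption):

// Simplified sketch of the eval pattern: deliver the evaluation to the
// routine unless the routine has already been stopped.
func (a *alertRuleInfo) eval(now time.Time, rule *models.AlertRule) (bool, *evaluation) {
    select {
    case a.evalCh <- &evaluation{scheduledAt: now, rule: rule}:
        return true, nil
    case <-a.ctx.Done():
        return false, nil
    }
}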


@@ -153,7 +153,7 @@ func TestSchedule_alertRuleInfo(t *testing.T) {
resultCh <- evalResponse{result, dropped}
}()
runtime.Gosched()
r.stop()
r.stop(nil)
select {
case result := <-resultCh:
require.False(t, result.success)
@@ -166,12 +166,13 @@ func TestSchedule_alertRuleInfo(t *testing.T) {
t.Run("when rule evaluation is stopped", func(t *testing.T) {
t.Run("Update should do nothing", func(t *testing.T) {
r := newAlertRuleInfo(context.Background())
r.stop()
r.stop(errRuleDeleted)
require.ErrorIs(t, r.ctx.Err(), errRuleDeleted)
require.False(t, r.update(ruleVersion(rand.Int63())))
})
t.Run("eval should do nothing", func(t *testing.T) {
r := newAlertRuleInfo(context.Background())
r.stop()
r.stop(nil)
rule := models.AlertRuleGen()()
success, dropped := r.eval(time.Now(), rule)
require.False(t, success)
@@ -179,8 +180,14 @@ func TestSchedule_alertRuleInfo(t *testing.T) {
})
t.Run("stop should do nothing", func(t *testing.T) {
r := newAlertRuleInfo(context.Background())
r.stop()
r.stop()
r.stop(nil)
r.stop(nil)
})
t.Run("stop should do nothing if parent context stopped", func(t *testing.T) {
ctx, cancelFn := context.WithCancel(context.Background())
r := newAlertRuleInfo(ctx)
cancelFn()
r.stop(nil)
})
})
t.Run("should be thread-safe", func(t *testing.T) {
@@ -213,7 +220,7 @@ func TestSchedule_alertRuleInfo(t *testing.T) {
case 2:
r.eval(time.Now(), models.AlertRuleGen()())
case 3:
r.stop()
r.stop(nil)
}
}
wg.Done()


@@ -2,6 +2,7 @@ package schedule
import (
"context"
"errors"
"fmt"
"net/url"
"time"
@@ -168,7 +169,7 @@ func (sch *schedule) DeleteAlertRule(keys ...ngmodels.AlertRuleKey) {
continue
}
// stop rule evaluation
ruleInfo.stop()
ruleInfo.stop(errRuleDeleted)
}
// Our best bet at this point is that we update the metrics with what we hope to schedule in the next tick.
alertRules := sch.schedulableAlertRules.all()
@@ -286,7 +287,7 @@ func (sch *schedule) schedulePeriodic(ctx context.Context) error {
}
func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertRuleKey, evalCh <-chan *evaluation, updateCh <-chan ruleVersion) error {
logger := sch.log.New("uid", key.UID, "org", key.OrgID)
logger := sch.log.New(key.LogContext()...)
logger.Debug("alert rule routine started")
orgID := fmt.Sprint(key.OrgID)
@@ -295,9 +296,8 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
evalTotalFailures := sch.metrics.EvalFailures.WithLabelValues(orgID)
clearState := func() {
states := sch.stateManager.GetStatesForRuleUID(key.OrgID, key.UID)
states := sch.stateManager.ResetStateByRuleUID(grafanaCtx, key)
expiredAlerts := FromAlertsStateToStoppedAlert(states, sch.appURL, sch.clock)
sch.stateManager.RemoveByRuleUID(key.OrgID, key.UID)
if len(expiredAlerts.PostableAlerts) > 0 {
sch.alertsSender.Send(key, expiredAlerts)
}
@@ -317,7 +317,10 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
} else {
logger.Debug("alert rule evaluated", "results", results, "duration", dur)
}
if ctx.Err() != nil { // check if the context is not cancelled. The evaluation can be a long-running task.
logger.Debug("skip updating the state because the context has been cancelled")
return
}
processedStates := sch.stateManager.ProcessEvalResults(ctx, e.scheduledAt, e.rule, results, extraLabels)
alerts := FromAlertStateToPostableAlerts(processedStates, sch.stateManager, sch.appURL)
if len(alerts.PostableAlerts) > 0 {
@@ -396,7 +399,10 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
}
}()
case <-grafanaCtx.Done():
clearState()
// clean up the state only if the reason for stopping the evaluation loop is that the rule was deleted
if errors.Is(grafanaCtx.Err(), errRuleDeleted) {
clearState()
}
logger.Debug("stopping alert rule routine")
return nil
}
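
The ctx.Err() check added before ProcessEvalResults follows a general pattern: after a potentially long-running step, re-check the context before committing any side effects. A small, self-contained illustration of that pattern (generic Go, not Grafana code):

package main

import (
    "context"
    "errors"
    "fmt"
    "time"
)

// evaluateAndStore stands in for a long-running evaluation. It re-checks the
// context after the slow step, mirroring the guard added to ruleRoutine above,
// so a cancelled rule never has stale results written on its behalf.
func evaluateAndStore(ctx context.Context) error {
    time.Sleep(50 * time.Millisecond) // the "evaluation": may outlive the caller

    if err := ctx.Err(); err != nil {
        return fmt.Errorf("skip storing results: %w", err)
    }
    fmt.Println("results stored")
    return nil
}

func main() {
    ctx, cancel := context.WithCancel(context.Background())
    cancel() // simulate the rule being stopped mid-evaluation

    if err := evaluateAndStore(ctx); errors.Is(err, context.Canceled) {
        fmt.Println("evaluation dropped:", err)
    }
}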


@@ -170,10 +170,15 @@ func TestSchedule_ruleRoutine(t *testing.T) {
}
t.Run("should exit", func(t *testing.T) {
t.Run("when context is cancelled", func(t *testing.T) {
t.Run("and not clear the state if parent context is cancelled", func(t *testing.T) {
stoppedChan := make(chan error)
sch, _, _, _ := createSchedule(make(chan time.Time), nil)
rule := models.AlertRuleGen()()
_ = sch.stateManager.ProcessEvalResults(context.Background(), sch.clock.Now(), rule, eval.GenerateResults(rand.Intn(5)+1, eval.ResultGen()), nil)
expectedStates := sch.stateManager.GetStatesForRuleUID(rule.OrgID, rule.UID)
require.NotEmpty(t, expectedStates)
ctx, cancel := context.WithCancel(context.Background())
go func() {
err := sch.ruleRoutine(ctx, models.AlertRuleKey{}, make(chan *evaluation), make(chan ruleVersion))
@@ -183,6 +188,27 @@ func TestSchedule_ruleRoutine(t *testing.T) {
cancel()
err := waitForErrChannel(t, stoppedChan)
require.NoError(t, err)
require.Equal(t, len(expectedStates), len(sch.stateManager.GetStatesForRuleUID(rule.OrgID, rule.UID)))
})
t.Run("and clean up the state if delete is cancellation reason ", func(t *testing.T) {
stoppedChan := make(chan error)
sch, _, _, _ := createSchedule(make(chan time.Time), nil)
rule := models.AlertRuleGen()()
_ = sch.stateManager.ProcessEvalResults(context.Background(), sch.clock.Now(), rule, eval.GenerateResults(rand.Intn(5)+1, eval.ResultGen()), nil)
require.NotEmpty(t, sch.stateManager.GetStatesForRuleUID(rule.OrgID, rule.UID))
ctx, cancel := util.WithCancelCause(context.Background())
go func() {
err := sch.ruleRoutine(ctx, rule.GetKey(), make(chan *evaluation), make(chan ruleVersion))
stoppedChan <- err
}()
cancel(errRuleDeleted)
err := waitForErrChannel(t, stoppedChan)
require.NoError(t, err)
require.Empty(t, sch.stateManager.GetStatesForRuleUID(rule.OrgID, rule.UID))
})
})
@@ -419,11 +445,11 @@ func TestSchedule_UpdateAlertRule(t *testing.T) {
t.Fatal("No message was received on update channel")
}
})
t.Run("should exit if it is closed", func(t *testing.T) {
t.Run("should exit if rule is being stopped", func(t *testing.T) {
sch := setupScheduler(t, nil, nil, nil, nil, nil)
key := models.GenerateRuleKey(rand.Int63())
info, _ := sch.registry.getOrCreateInfo(context.Background(), key)
info.stop()
info.stop(nil)
sch.UpdateAlertRule(key, rand.Int63())
})
})
@@ -444,23 +470,7 @@ func TestSchedule_DeleteAlertRule(t *testing.T) {
key := rule.GetKey()
info, _ := sch.registry.getOrCreateInfo(context.Background(), key)
sch.DeleteAlertRule(key)
require.False(t, info.update(ruleVersion(rand.Int63())))
success, dropped := info.eval(time.Now(), rule)
require.False(t, success)
require.Nilf(t, dropped, "expected no dropped evaluations but got one")
require.False(t, sch.registry.exists(key))
})
t.Run("should remove controller from registry", func(t *testing.T) {
sch := setupScheduler(t, nil, nil, nil, nil, nil)
rule := models.AlertRuleGen()()
key := rule.GetKey()
info, _ := sch.registry.getOrCreateInfo(context.Background(), key)
info.stop()
sch.DeleteAlertRule(key)
require.False(t, info.update(ruleVersion(rand.Int63())))
success, dropped := info.eval(time.Now(), rule)
require.False(t, success)
require.Nilf(t, dropped, "expected no dropped evaluations but got one")
require.ErrorIs(t, info.ctx.Err(), errRuleDeleted)
require.False(t, sch.registry.exists(key))
})
})


@@ -180,11 +180,20 @@ func (c *cache) getStatesForRuleUID(orgID int64, alertRuleUID string) []*State {
return ruleStates
}
// removeByRuleUID deletes all entries in the state cache that match the given UID.
func (c *cache) removeByRuleUID(orgID int64, uid string) {
// removeByRuleUID deletes all entries in the state cache that match the given UID. Returns removed states
func (c *cache) removeByRuleUID(orgID int64, uid string) []*State {
c.mtxStates.Lock()
defer c.mtxStates.Unlock()
statesMap := c.states[orgID][uid]
delete(c.states[orgID], uid)
if statesMap == nil {
return nil
}
states := make([]*State, 0, len(statesMap))
for _, state := range statesMap {
states = append(states, state)
}
return states
}
func (c *cache) reset() {


@@ -73,7 +73,7 @@ func (st *Manager) Close(ctx context.Context) {
func (st *Manager) Warm(ctx context.Context) {
st.log.Info("warming cache for startup")
st.ResetCache()
st.ResetAllStates()
orgIds, err := st.instanceStore.FetchOrgIds(ctx)
if err != nil {
@@ -149,20 +149,30 @@ func (st *Manager) Get(orgID int64, alertRuleUID, stateId string) (*State, error
return st.cache.get(orgID, alertRuleUID, stateId)
}
// ResetCache is used to ensure a clean cache on startup.
func (st *Manager) ResetCache() {
// ResetAllStates is used to ensure a clean cache on startup.
func (st *Manager) ResetAllStates() {
st.cache.reset()
}
// RemoveByRuleUID deletes all entries in the state manager that match the given rule UID.
func (st *Manager) RemoveByRuleUID(orgID int64, ruleUID string) {
st.cache.removeByRuleUID(orgID, ruleUID)
// ResetStateByRuleUID deletes all entries in the state manager that match the given rule UID.
func (st *Manager) ResetStateByRuleUID(ctx context.Context, ruleKey ngModels.AlertRuleKey) []*State {
logger := st.log.New(ruleKey.LogContext()...)
logger.Debug("resetting state of the rule")
states := st.cache.removeByRuleUID(ruleKey.OrgID, ruleKey.UID)
if len(states) > 0 {
err := st.instanceStore.DeleteAlertInstancesByRule(ctx, ruleKey)
if err != nil {
logger.Error("failed to delete states that belong to a rule from database", ruleKey.LogContext()...)
}
}
logger.Info("rules state was reset", "deleted_states", len(states))
return states
}
// ProcessEvalResults updates the current states that belong to a rule with the evaluation results.
// if extraLabels is not empty, those labels will be added to every state. The extraLabels take precedence over rule labels and result labels
func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, results eval.Results, extraLabels data.Labels) []*State {
logger := st.log.New(alertRule.GetKey().LogContext())
logger := st.log.New(alertRule.GetKey().LogContext()...)
logger.Debug("state manager processing evaluation results", "resultCount", len(results))
var states []*State
processedResults := make(map[string]*State, len(results))
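
As a side note on the ProcessEvalResults comment above: "extraLabels take precedence" means keys in later label sets overwrite keys in earlier ones on collision. A generic illustration of that merge behaviour (not the state manager's actual code; the relative order of result and rule labels here is only an assumption):

package main

import "fmt"

// mergeLabels merges label sets in increasing order of precedence:
// keys in later sets overwrite keys in earlier ones.
func mergeLabels(sets ...map[string]string) map[string]string {
    merged := map[string]string{}
    for _, set := range sets {
        for k, v := range set {
            merged[k] = v
        }
    }
    return merged
}

func main() {
    resultLabels := map[string]string{"severity": "warning", "instance": "a"}
    ruleLabels := map[string]string{"team": "db"}
    extraLabels := map[string]string{"severity": "critical"} // wins on collision

    fmt.Println(mergeLabels(resultLabels, ruleLabels, extraLabels))
    // Output: map[instance:a severity:critical team:db]
}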


@@ -15,6 +15,7 @@ type InstanceStore interface {
SaveAlertInstance(ctx context.Context, cmd *models.SaveAlertInstanceCommand) error
FetchOrgIds(ctx context.Context) ([]int64, error)
DeleteAlertInstance(ctx context.Context, orgID int64, ruleUID, labelsHash string) error
DeleteAlertInstancesByRule(ctx context.Context, key models.AlertRuleKey) error
}
// GetAlertInstance is a handler for retrieving an alert instance based on OrgId, AlertDefinitionID, and
@@ -158,3 +159,10 @@ func (st DBstore) DeleteAlertInstance(ctx context.Context, orgID int64, ruleUID,
return nil
})
}
func (st DBstore) DeleteAlertInstancesByRule(ctx context.Context, key models.AlertRuleKey) error {
return st.SQLStore.WithTransactionalDbSession(ctx, func(sess *sqlstore.DBSession) error {
_, err := sess.Exec("DELETE FROM alert_instance WHERE rule_org_id = ? AND rule_uid = ?", key.OrgID, key.UID)
return err
})
}


@@ -404,6 +404,9 @@ func (f *FakeInstanceStore) FetchOrgIds(_ context.Context) ([]int64, error) { re
func (f *FakeInstanceStore) DeleteAlertInstance(_ context.Context, _ int64, _, _ string) error {
return nil
}
func (f *FakeInstanceStore) DeleteAlertInstancesByRule(ctx context.Context, key models.AlertRuleKey) error {
return nil
}
func NewFakeAdminConfigStore(t *testing.T) *FakeAdminConfigStore {
t.Helper()