Alerting: Persist alerts on evaluation and shutdown. Warm cache from DB on startup (#32576)

* Initial commit for state tracking * basic state transition logic and tests * constructor. test and interface fixup * use new sig for sch.definitionRoutine() * test fixup * make the linter happy * more minor linting cleanup * Alerting: Send alerts from state tracker to notifier * Add evaluation time and test Add evaluation time and test * Add cleanup routine and logging * Pull in compact.go and reconcile differences * Save alert transitions and save all state on shutdown * pr feedback * WIP * WIP * Persist alerts on evaluation and shutdown. Warm cache on startup * Filter non-firing alerts before sending to notifier Co-authored-by: Josue Abreu <josue@grafana.com>
2025-02-25 18:55:37 -06:00 · 2021-04-02 08:11:33 -07:00
parent 698a1ee003
commit 2a8446e435
10 changed files with 468 additions and 150 deletions
--- a/pkg/services/ngalert/state/state_tracker.go
+++ b/pkg/services/ngalert/state/state_tracker.go
@@ -7,21 +7,26 @@ import (

 	"github.com/grafana/grafana/pkg/infra/log"

-	"github.com/go-openapi/strfmt"
 	"github.com/grafana/grafana-plugin-sdk-go/data"
 	"github.com/grafana/grafana/pkg/services/ngalert/eval"
 	ngModels "github.com/grafana/grafana/pkg/services/ngalert/models"
 )

+// AlertState is the cached state of a single alert instance, as stored in the
+// state tracker's cache map under CacheId.
 type AlertState struct {
-	UID         string
-	CacheId     string
-	Labels      data.Labels
-	State       eval.State
-	Results     []eval.State
-	StartsAt    strfmt.DateTime
-	EndsAt      strfmt.DateTime
-	EvaluatedAt strfmt.DateTime
+	// UID is the uid that was passed to ProcessEvalResults for this entry.
+	UID                string
+	// OrgID is copied from the OrgID of the condition being evaluated.
+	OrgID              int64
+	// CacheId is the key under which this entry is stored in the cache map.
+	CacheId            string
+	// Labels holds the Instance labels of the evaluation result that created the entry.
+	Labels             data.Labels
+	// State is the current evaluation state of the entry.
+	State              eval.State
+	// Results is the evaluation history; one element is appended per processed result.
+	Results            []StateEvaluation
+	// StartsAt is set to the evaluation time on a Normal -> Alerting transition.
+	StartsAt           time.Time
+	// EndsAt is the evaluation time plus 40 seconds while the entry is alerting,
+	// or the evaluation time of an Alerting -> Normal transition.
+	EndsAt             time.Time
+	// LastEvaluationTime is the EvaluatedAt of the most recently processed result.
+	LastEvaluationTime time.Time
+}
+
+// StateEvaluation is one element of an AlertState's Results history: the state
+// reported by a single evaluation result together with its timestamp.
+// Field order matters to callers that build it with a positional literal.
+type StateEvaluation struct {
+	// EvaluationTime is the EvaluatedAt timestamp of the evaluation result.
+	EvaluationTime  time.Time
+	// EvaluationState is the State reported by that evaluation result.
+	EvaluationState eval.State
 }

 type cache struct {
@@ -48,7 +53,7 @@ func NewStateTracker(logger log.Logger) *StateTracker {
 	return tracker
 }

-func (st *StateTracker) getOrCreate(uid string, result eval.Result) AlertState {
+func (st *StateTracker) getOrCreate(uid string, orgId int64, result eval.Result) AlertState {
 	st.stateCache.mu.Lock()
 	defer st.stateCache.mu.Unlock()

@@ -58,12 +63,12 @@ func (st *StateTracker) getOrCreate(uid string, result eval.Result) AlertState {
 	}
 	st.Log.Debug("adding new alert state cache entry", "cacheId", idString, "state", result.State.String(), "evaluatedAt", result.EvaluatedAt.String())
 	newState := AlertState{
-		UID:         uid,
-		CacheId:     idString,
-		Labels:      result.Instance,
-		State:       result.State,
-		Results:     []eval.State{},
-		EvaluatedAt: strfmt.DateTime(result.EvaluatedAt),
+		UID:     uid,
+		OrgID:   orgId,
+		CacheId: idString,
+		Labels:  result.Instance,
+		State:   result.State,
+		Results: []StateEvaluation{},
 	}
 	st.stateCache.cacheMap[idString] = newState
 	return newState
@@ -75,19 +80,25 @@ func (st *StateTracker) set(stateEntry AlertState) {
 	st.stateCache.cacheMap[stateEntry.CacheId] = stateEntry
 }

-func (st *StateTracker) get(stateId string) AlertState {
+// Get returns the cached AlertState stored under stateId, reading the cache
+// map while holding the cache mutex. When no entry exists for stateId the
+// zero-value AlertState is returned.
+func (st *StateTracker) Get(stateId string) AlertState {
 	st.stateCache.mu.Lock()
 	defer st.stateCache.mu.Unlock()
 	return st.stateCache.cacheMap[stateId]
 }

+// ResetCache drops every cached alert state by replacing the cache map with a
+// new, empty one while holding the cache mutex. It is used to ensure a clean
+// cache on startup.
+func (st *StateTracker) ResetCache() {
+	st.stateCache.mu.Lock()
+	defer st.stateCache.mu.Unlock()
+	st.stateCache.cacheMap = map[string]AlertState{}
+}
+
 func (st *StateTracker) ProcessEvalResults(uid string, results eval.Results, condition ngModels.Condition) []AlertState {
 	st.Log.Info("state tracker processing evaluation results", "uid", uid, "resultCount", len(results))
 	var changedStates []AlertState
 	for _, result := range results {
-		if s, ok := st.setNextState(uid, result); ok {
-			changedStates = append(changedStates, s)
-		}
+		s, _ := st.setNextState(uid, condition.OrgID, result)
+		changedStates = append(changedStates, s)
 	}
 	st.Log.Debug("returning changed states to scheduler", "count", len(changedStates))
 	return changedStates
@@ -99,34 +110,43 @@ func (st *StateTracker) ProcessEvalResults(uid string, results eval.Results, con
 // 3. The base interval defined by the scheduler - in the case where #2 is not yet an option we can use the base interval at which every alert runs.
 //Set the current state based on evaluation results
 //return the state and a bool indicating whether a state transition occurred
-func (st *StateTracker) setNextState(uid string, result eval.Result) (AlertState, bool) {
-	currentState := st.getOrCreate(uid, result)
+func (st *StateTracker) setNextState(uid string, orgId int64, result eval.Result) (AlertState, bool) {
+	currentState := st.getOrCreate(uid, orgId, result)
 	st.Log.Debug("setting alert state", "uid", uid)
 	switch {
 	case currentState.State == result.State:
 		st.Log.Debug("no state transition", "cacheId", currentState.CacheId, "state", currentState.State.String())
-		currentState.EvaluatedAt = strfmt.DateTime(result.EvaluatedAt)
-		currentState.Results = append(currentState.Results, result.State)
+		currentState.LastEvaluationTime = result.EvaluatedAt
+		currentState.Results = append(currentState.Results, StateEvaluation{
+			EvaluationTime:  result.EvaluatedAt,
+			EvaluationState: result.State,
+		})
 		if currentState.State == eval.Alerting {
-			currentState.EndsAt = strfmt.DateTime(result.EvaluatedAt.Add(40 * time.Second))
+			currentState.EndsAt = result.EvaluatedAt.Add(40 * time.Second)
 		}
 		st.set(currentState)
 		return currentState, false
 	case currentState.State == eval.Normal && result.State == eval.Alerting:
 		st.Log.Debug("state transition from normal to alerting", "cacheId", currentState.CacheId)
 		currentState.State = eval.Alerting
-		currentState.EvaluatedAt = strfmt.DateTime(result.EvaluatedAt)
-		currentState.StartsAt = strfmt.DateTime(result.EvaluatedAt)
-		currentState.EndsAt = strfmt.DateTime(result.EvaluatedAt.Add(40 * time.Second))
-		currentState.Results = append(currentState.Results, result.State)
+		currentState.LastEvaluationTime = result.EvaluatedAt
+		currentState.StartsAt = result.EvaluatedAt
+		currentState.EndsAt = result.EvaluatedAt.Add(40 * time.Second)
+		currentState.Results = append(currentState.Results, StateEvaluation{
+			EvaluationTime:  result.EvaluatedAt,
+			EvaluationState: result.State,
+		})
 		st.set(currentState)
 		return currentState, true
 	case currentState.State == eval.Alerting && result.State == eval.Normal:
 		st.Log.Debug("state transition from alerting to normal", "cacheId", currentState.CacheId)
 		currentState.State = eval.Normal
-		currentState.EvaluatedAt = strfmt.DateTime(result.EvaluatedAt)
-		currentState.EndsAt = strfmt.DateTime(result.EvaluatedAt)
-		currentState.Results = append(currentState.Results, result.State)
+		currentState.LastEvaluationTime = result.EvaluatedAt
+		currentState.EndsAt = result.EvaluatedAt
+		currentState.Results = append(currentState.Results, StateEvaluation{
+			EvaluationTime:  result.EvaluatedAt,
+			EvaluationState: result.State,
+		})
 		st.set(currentState)
 		return currentState, true
 	default:
@@ -134,6 +154,16 @@ func (st *StateTracker) setNextState(uid string, result eval.Result) (AlertState
 	}
 }

+// GetAll returns every AlertState currently held in the cache, collected while
+// holding the cache mutex. The order of the returned slice follows map
+// iteration and is therefore unspecified; the result is nil when the cache is
+// empty.
+func (st *StateTracker) GetAll() []AlertState {
+	st.stateCache.mu.Lock()
+	defer st.stateCache.mu.Unlock()
+	var snapshot []AlertState
+	for id := range st.stateCache.cacheMap {
+		snapshot = append(snapshot, st.stateCache.cacheMap[id])
+	}
+	return snapshot
+}
+
 func (st *StateTracker) cleanUp() {
 	ticker := time.NewTicker(time.Duration(60) * time.Minute)
 	st.Log.Debug("starting cleanup process", "intervalMinutes", 60)
@@ -150,16 +180,33 @@ func (st *StateTracker) cleanUp() {
 }

 func (st *StateTracker) trim() {
-	st.Log.Info("trimming alert state cache")
+	st.Log.Info("trimming alert state cache", "now", time.Now())
 	st.stateCache.mu.Lock()
 	defer st.stateCache.mu.Unlock()
 	for _, v := range st.stateCache.cacheMap {
 		if len(v.Results) > 100 {
 			st.Log.Debug("trimming result set", "cacheId", v.CacheId, "count", len(v.Results)-100)
-			newResults := make([]eval.State, 100)
+			newResults := make([]StateEvaluation, 100)
 			copy(newResults, v.Results[100:])
 			v.Results = newResults
 			st.set(v)
 		}
 	}
 }
+
+// Equals reports whether a and b carry the same UID, OrgID, CacheId, Labels,
+// State, StartsAt, EndsAt and LastEvaluationTime. The Results history is not
+// compared.
+//
+// Timestamps are compared with time.Time.Equal rather than ==, because ==
+// also compares the Location and monotonic clock reading, so two values for
+// the same instant can compare unequal.
+func (a AlertState) Equals(b AlertState) bool {
+	return a.UID == b.UID &&
+		a.OrgID == b.OrgID &&
+		a.CacheId == b.CacheId &&
+		a.Labels.Equals(b.Labels) &&
+		a.State == b.State &&
+		a.StartsAt.Equal(b.StartsAt) &&
+		a.EndsAt.Equal(b.EndsAt) &&
+		a.LastEvaluationTime.Equal(b.LastEvaluationTime)
+}
+
+// Put stores each of the given states in the cache by handing it to set, which
+// files it under the state's CacheId. An existing entry with the same CacheId
+// is overwritten.
+func (st *StateTracker) Put(states []AlertState) {
+	for i := range states {
+		st.set(states[i])
+	}
+}
--- a/pkg/services/ngalert/state/state_tracker_test.go
+++ b/pkg/services/ngalert/state/state_tracker_test.go
@@ -1,13 +1,12 @@
 package state

 import (
+	"fmt"
 	"testing"
 	"time"

 	"github.com/grafana/grafana/pkg/infra/log"

-	"github.com/go-openapi/strfmt"
-
 	"github.com/grafana/grafana-plugin-sdk-go/data"
 	"github.com/grafana/grafana/pkg/services/ngalert/eval"
 	"github.com/grafana/grafana/pkg/services/ngalert/models"
@@ -34,22 +33,31 @@ func TestProcessEvalResults(t *testing.T) {
 			uid:  "test_uid",
 			evalResults: eval.Results{
 				eval.Result{
-					Instance: data.Labels{"label1": "value1", "label2": "value2"},
+					Instance:    data.Labels{"label1": "value1", "label2": "value2"},
+					State:       eval.Normal,
+					EvaluatedAt: evaluationTime,
 				},
 			},
+			condition: models.Condition{
+				Condition: "A",
+				OrgID:     123,
+			},
 			expectedState:              eval.Normal,
 			expectedReturnedStateCount: 0,
 			expectedResultCount:        1,
 			expectedCacheEntries: []AlertState{
 				{
-					UID:         "test_uid",
-					CacheId:     "test_uid label1=value1, label2=value2",
-					Labels:      data.Labels{"label1": "value1", "label2": "value2"},
-					State:       eval.Normal,
-					Results:     []eval.State{eval.Normal},
-					StartsAt:    strfmt.DateTime{},
-					EndsAt:      strfmt.DateTime{},
-					EvaluatedAt: strfmt.DateTime(evaluationTime),
+					UID:     "test_uid",
+					OrgID:   123,
+					CacheId: "test_uid label1=value1, label2=value2",
+					Labels:  data.Labels{"label1": "value1", "label2": "value2"},
+					State:   eval.Normal,
+					Results: []StateEvaluation{
+						{EvaluationTime: evaluationTime, EvaluationState: eval.Normal},
+					},
+					StartsAt:           time.Time{},
+					EndsAt:             time.Time{},
+					LastEvaluationTime: evaluationTime,
 				},
 			},
 		},
@@ -58,27 +66,37 @@ func TestProcessEvalResults(t *testing.T) {
 			uid:  "test_uid",
 			evalResults: eval.Results{
 				eval.Result{
-					Instance: data.Labels{"label1": "value1", "label2": "value2"},
-					State:    eval.Normal,
+					Instance:    data.Labels{"label1": "value1", "label2": "value2"},
+					State:       eval.Normal,
+					EvaluatedAt: evaluationTime,
 				},
 				eval.Result{
-					Instance: data.Labels{"label1": "value1", "label2": "value2"},
-					State:    eval.Alerting,
+					Instance:    data.Labels{"label1": "value1", "label2": "value2"},
+					State:       eval.Alerting,
+					EvaluatedAt: evaluationTime.Add(1 * time.Minute),
 				},
 			},
+			condition: models.Condition{
+				Condition: "A",
+				OrgID:     123,
+			},
 			expectedState:              eval.Alerting,
 			expectedReturnedStateCount: 1,
 			expectedResultCount:        2,
 			expectedCacheEntries: []AlertState{
 				{
-					UID:         "test_uid",
-					CacheId:     "test_uid label1=value1, label2=value2",
-					Labels:      data.Labels{"label1": "value1", "label2": "value2"},
-					State:       eval.Alerting,
-					Results:     []eval.State{eval.Normal, eval.Alerting},
-					StartsAt:    strfmt.DateTime{},
-					EndsAt:      strfmt.DateTime{},
-					EvaluatedAt: strfmt.DateTime(evaluationTime),
+					UID:     "test_uid",
+					OrgID:   123,
+					CacheId: "test_uid label1=value1, label2=value2",
+					Labels:  data.Labels{"label1": "value1", "label2": "value2"},
+					State:   eval.Alerting,
+					Results: []StateEvaluation{
+						{EvaluationTime: evaluationTime, EvaluationState: eval.Normal},
+						{EvaluationTime: evaluationTime.Add(1 * time.Minute), EvaluationState: eval.Alerting},
+					},
+					StartsAt:           evaluationTime.Add(1 * time.Minute),
+					EndsAt:             evaluationTime.Add(100 * time.Second),
+					LastEvaluationTime: evaluationTime.Add(1 * time.Minute),
 				},
 			},
 		},
@@ -87,27 +105,37 @@ func TestProcessEvalResults(t *testing.T) {
 			uid:  "test_uid",
 			evalResults: eval.Results{
 				eval.Result{
-					Instance: data.Labels{"label1": "value1", "label2": "value2"},
-					State:    eval.Alerting,
+					Instance:    data.Labels{"label1": "value1", "label2": "value2"},
+					State:       eval.Alerting,
+					EvaluatedAt: evaluationTime,
 				},
 				eval.Result{
-					Instance: data.Labels{"label1": "value1", "label2": "value2"},
-					State:    eval.Normal,
+					Instance:    data.Labels{"label1": "value1", "label2": "value2"},
+					State:       eval.Normal,
+					EvaluatedAt: evaluationTime.Add(1 * time.Minute),
 				},
 			},
+			condition: models.Condition{
+				Condition: "A",
+				OrgID:     123,
+			},
 			expectedState:              eval.Normal,
 			expectedReturnedStateCount: 1,
 			expectedResultCount:        2,
 			expectedCacheEntries: []AlertState{
 				{
-					UID:         "test_uid",
-					CacheId:     "test_uid label1=value1, label2=value2",
-					Labels:      data.Labels{"label1": "value1", "label2": "value2"},
-					State:       eval.Normal,
-					Results:     []eval.State{eval.Alerting, eval.Normal},
-					StartsAt:    strfmt.DateTime{},
-					EndsAt:      strfmt.DateTime{},
-					EvaluatedAt: strfmt.DateTime(evaluationTime),
+					UID:     "test_uid",
+					OrgID:   123,
+					CacheId: "test_uid label1=value1, label2=value2",
+					Labels:  data.Labels{"label1": "value1", "label2": "value2"},
+					State:   eval.Normal,
+					Results: []StateEvaluation{
+						{EvaluationTime: evaluationTime, EvaluationState: eval.Alerting},
+						{EvaluationTime: evaluationTime.Add(1 * time.Minute), EvaluationState: eval.Normal},
+					},
+					StartsAt:           time.Time{},
+					EndsAt:             evaluationTime.Add(1 * time.Minute),
+					LastEvaluationTime: evaluationTime.Add(1 * time.Minute),
 				},
 			},
 		},
@@ -116,27 +144,37 @@ func TestProcessEvalResults(t *testing.T) {
 			uid:  "test_uid",
 			evalResults: eval.Results{
 				eval.Result{
-					Instance: data.Labels{"label1": "value1", "label2": "value2"},
-					State:    eval.Alerting,
+					Instance:    data.Labels{"label1": "value1", "label2": "value2"},
+					State:       eval.Alerting,
+					EvaluatedAt: evaluationTime,
 				},
 				eval.Result{
-					Instance: data.Labels{"label1": "value1", "label2": "value2"},
-					State:    eval.Alerting,
+					Instance:    data.Labels{"label1": "value1", "label2": "value2"},
+					State:       eval.Alerting,
+					EvaluatedAt: evaluationTime.Add(1 * time.Minute),
 				},
 			},
+			condition: models.Condition{
+				Condition: "A",
+				OrgID:     123,
+			},
 			expectedState:              eval.Alerting,
 			expectedReturnedStateCount: 0,
 			expectedResultCount:        2,
 			expectedCacheEntries: []AlertState{
 				{
-					UID:         "test_uid",
-					CacheId:     "test_uid label1=value1, label2=value2",
-					Labels:      data.Labels{"label1": "value1", "label2": "value2"},
-					State:       eval.Alerting,
-					Results:     []eval.State{eval.Alerting, eval.Alerting},
-					StartsAt:    strfmt.DateTime{},
-					EndsAt:      strfmt.DateTime{},
-					EvaluatedAt: strfmt.DateTime(evaluationTime),
+					UID:     "test_uid",
+					OrgID:   123,
+					CacheId: "test_uid label1=value1, label2=value2",
+					Labels:  data.Labels{"label1": "value1", "label2": "value2"},
+					State:   eval.Alerting,
+					Results: []StateEvaluation{
+						{EvaluationTime: evaluationTime, EvaluationState: eval.Alerting},
+						{EvaluationTime: evaluationTime.Add(1 * time.Minute), EvaluationState: eval.Alerting},
+					},
+					StartsAt:           time.Time{},
+					EndsAt:             evaluationTime.Add(100 * time.Second),
+					LastEvaluationTime: evaluationTime.Add(1 * time.Minute),
 				},
 			},
 		},
@@ -145,64 +183,103 @@ func TestProcessEvalResults(t *testing.T) {
 			uid:  "test_uid",
 			evalResults: eval.Results{
 				eval.Result{
-					Instance: data.Labels{"label1": "value1", "label2": "value2"},
-					State:    eval.Normal,
+					Instance:    data.Labels{"label1": "value1", "label2": "value2"},
+					State:       eval.Normal,
+					EvaluatedAt: evaluationTime,
 				},
 				eval.Result{
-					Instance: data.Labels{"label1": "value1", "label2": "value2"},
-					State:    eval.Normal,
+					Instance:    data.Labels{"label1": "value1", "label2": "value2"},
+					State:       eval.Normal,
+					EvaluatedAt: evaluationTime.Add(1 * time.Minute),
 				},
 			},
+			condition: models.Condition{
+				Condition: "A",
+				OrgID:     123,
+			},
 			expectedState:              eval.Normal,
 			expectedReturnedStateCount: 0,
 			expectedResultCount:        2,
 			expectedCacheEntries: []AlertState{
 				{
-					UID:         "test_uid",
-					CacheId:     "test_uid label1=value1, label2=value2",
-					Labels:      data.Labels{"label1": "value1", "label2": "value2"},
-					State:       eval.Normal,
-					Results:     []eval.State{eval.Normal, eval.Normal},
-					StartsAt:    strfmt.DateTime{},
-					EndsAt:      strfmt.DateTime{},
-					EvaluatedAt: strfmt.DateTime(evaluationTime),
+					UID:     "test_uid",
+					OrgID:   123,
+					CacheId: "test_uid label1=value1, label2=value2",
+					Labels:  data.Labels{"label1": "value1", "label2": "value2"},
+					State:   eval.Normal,
+					Results: []StateEvaluation{
+						{evaluationTime, eval.Normal},
+						{EvaluationTime: evaluationTime.Add(1 * time.Minute), EvaluationState: eval.Normal},
+					},
+					StartsAt:           time.Time{},
+					EndsAt:             time.Time{},
+					LastEvaluationTime: evaluationTime.Add(1 * time.Minute),
 				},
 			},
 		},
 	}

 	for _, tc := range testCases {
-		t.Run("the correct number of entries are added to the cache", func(t *testing.T) {
+		t.Run("all fields for a cache entry are set correctly", func(t *testing.T) {
+			st := NewStateTracker(log.New("test_state_tracker"))
+			_ = st.ProcessEvalResults(tc.uid, tc.evalResults, tc.condition)
+			for _, entry := range tc.expectedCacheEntries {
+				if !entry.Equals(st.Get(entry.CacheId)) {
+					t.Log(tc.desc)
+					printEntryDiff(entry, st.Get(entry.CacheId), t)
+				}
+				assert.True(t, entry.Equals(st.Get(entry.CacheId)))
+			}
+		})
+
+		t.Run("the expected number of entries are added to the cache", func(t *testing.T) {
 			st := NewStateTracker(log.New("test_state_tracker"))
 			st.ProcessEvalResults(tc.uid, tc.evalResults, tc.condition)
 			assert.Equal(t, len(tc.expectedCacheEntries), len(st.stateCache.cacheMap))
 		})

-		t.Run("the correct state is set for each evaluation result", func(t *testing.T) {
-			st := NewStateTracker(log.New("test_state_tracker"))
-			st.ProcessEvalResults(tc.uid, tc.evalResults, tc.condition)
-			for _, entry := range tc.expectedCacheEntries {
-				testState := st.get(entry.CacheId)
-				assert.Equal(t, tc.expectedState, testState.State)
-			}
-		})
-
-		t.Run("the correct number of states are returned to the caller", func(t *testing.T) {
+		//This test, as configured, does not quite represent the behavior of the system.
+		//It is expected that each batch of evaluation results will have only one result
+		//for a unique set of labels.
+		t.Run("the expected number of states are returned to the caller", func(t *testing.T) {
 			st := NewStateTracker(log.New("test_state_tracker"))
 			results := st.ProcessEvalResults(tc.uid, tc.evalResults, tc.condition)
-			assert.Equal(t, tc.expectedReturnedStateCount, len(results))
-		})
-
-		t.Run("the correct results are set for each cache entry", func(t *testing.T) {
-			st := NewStateTracker(log.New("test_state_tracker"))
-			_ = st.ProcessEvalResults(tc.uid, tc.evalResults, tc.condition)
-			for _, entry := range tc.expectedCacheEntries {
-				testState := st.get(entry.CacheId)
-				assert.Equal(t, len(entry.Results), len(testState.Results))
-				for i, res := range entry.Results {
-					assert.Equal(t, res, testState.Results[i])
-				}
-			}
+			assert.Equal(t, len(tc.evalResults), len(results))
 		})
 	}
 }
+
+// printEntryDiff logs, via t.Log, every field on which a and b differ so that
+// a failed Equals assertion can be diagnosed. Each line names the field and
+// shows a's value followed by b's value. State is included because Equals
+// compares it. The Results histories are dumped only when their lengths
+// differ.
+func printEntryDiff(a, b AlertState, t *testing.T) {
+	t.Helper()
+	logDiff := func(field string, av, bv interface{}) {
+		t.Helper()
+		t.Log(fmt.Sprintf("%s: %v \t %v", field, av, bv))
+	}
+	if a.UID != b.UID {
+		logDiff("UID", a.UID, b.UID)
+	}
+	if a.OrgID != b.OrgID {
+		logDiff("OrgID", a.OrgID, b.OrgID)
+	}
+	if a.CacheId != b.CacheId {
+		logDiff("CacheId", a.CacheId, b.CacheId)
+	}
+	if !a.Labels.Equals(b.Labels) {
+		logDiff("Labels", a.Labels, b.Labels)
+	}
+	if a.State != b.State {
+		logDiff("State", a.State, b.State)
+	}
+	if a.StartsAt != b.StartsAt {
+		logDiff("StartsAt", a.StartsAt, b.StartsAt)
+	}
+	if a.EndsAt != b.EndsAt {
+		logDiff("EndsAt", a.EndsAt, b.EndsAt)
+	}
+	if a.LastEvaluationTime != b.LastEvaluationTime {
+		logDiff("LastEvaluationTime", a.LastEvaluationTime, b.LastEvaluationTime)
+	}
+	if len(a.Results) != len(b.Results) {
+		t.Log(fmt.Sprintf("a: %d b: %d", len(a.Results), len(b.Results)))
+		t.Log("a")
+		for _, r := range a.Results {
+			t.Log(fmt.Sprintf("%v\n", r))
+		}
+		t.Log("b")
+		for _, r := range b.Results {
+			t.Log(fmt.Sprintf("%v\n", r))
+		}
+	}
+}