Alerting: Persist alerts on evaluation and shutdown. Warm cache from DB on startup (#32576)

* Initial commit for state tracking

* basic state transition logic and tests

* constructor, test and interface fixup

* use new sig for sch.definitionRoutine()

* test fixup

* make the linter happy

* more minor linting cleanup

* Alerting: Send alerts from state tracker to notifier

* Add evaluation time and test

Add evaluation time and test

* Add cleanup routine and logging

* Pull in compact.go and reconcile differences

* Save alert transitions and save all state on shutdown

* pr feedback

* WIP

* WIP

* Persist alerts on evaluation and shutdown. Warm cache on startup

* Filter non-firing alerts before sending to notifier

Co-authored-by: Josue Abreu <josue@grafana.com>
This commit is contained in:
David Parrott
2021-04-02 08:11:33 -07:00
committed by GitHub
parent 698a1ee003
commit 2a8446e435
10 changed files with 468 additions and 150 deletions

View File

@@ -7,21 +7,26 @@ import (
"github.com/grafana/grafana/pkg/infra/log"
"github.com/go-openapi/strfmt"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
ngModels "github.com/grafana/grafana/pkg/services/ngalert/models"
)
// AlertState is the per-instance alert state kept in the state tracker's
// cache, where it is stored under its CacheId.
//
// NOTE(review): this listing shows most fields twice - the strfmt.DateTime /
// []eval.State variants next to time.Time / []StateEvaluation variants, plus a
// new OrgID. That looks like removed and added diff lines rendered together;
// confirm which set is current before relying on it.
type AlertState struct {
UID string
CacheId string
Labels data.Labels
State eval.State
Results []eval.State
StartsAt strfmt.DateTime
EndsAt strfmt.DateTime
EvaluatedAt strfmt.DateTime
UID string
OrgID int64
CacheId string
Labels data.Labels
State eval.State
Results []StateEvaluation
StartsAt time.Time
EndsAt time.Time
LastEvaluationTime time.Time
}
// StateEvaluation is one entry in an AlertState's Results history: the state
// an evaluation produced together with the time of that evaluation.
type StateEvaluation struct {
// EvaluationTime is the evaluation's timestamp (filled from result.EvaluatedAt).
EvaluationTime time.Time
// EvaluationState is the state that evaluation reported (filled from result.State).
EvaluationState eval.State
}
type cache struct {
@@ -48,7 +53,7 @@ func NewStateTracker(logger log.Logger) *StateTracker {
return tracker
}
func (st *StateTracker) getOrCreate(uid string, result eval.Result) AlertState {
func (st *StateTracker) getOrCreate(uid string, orgId int64, result eval.Result) AlertState {
st.stateCache.mu.Lock()
defer st.stateCache.mu.Unlock()
@@ -58,12 +63,12 @@ func (st *StateTracker) getOrCreate(uid string, result eval.Result) AlertState {
}
st.Log.Debug("adding new alert state cache entry", "cacheId", idString, "state", result.State.String(), "evaluatedAt", result.EvaluatedAt.String())
newState := AlertState{
UID: uid,
CacheId: idString,
Labels: result.Instance,
State: result.State,
Results: []eval.State{},
EvaluatedAt: strfmt.DateTime(result.EvaluatedAt),
UID: uid,
OrgID: orgId,
CacheId: idString,
Labels: result.Instance,
State: result.State,
Results: []StateEvaluation{},
}
st.stateCache.cacheMap[idString] = newState
return newState
@@ -75,19 +80,25 @@ func (st *StateTracker) set(stateEntry AlertState) {
st.stateCache.cacheMap[stateEntry.CacheId] = stateEntry
}
// Get looks up the cached AlertState stored under stateId while holding the
// cache mutex. A missing key yields the zero AlertState, because the result
// comes straight from a map index expression.
//
// NOTE(review): two signatures (get and Get) appear back to back here, which
// looks like the removed and added lines of a rename rendered together.
func (st *StateTracker) get(stateId string) AlertState {
func (st *StateTracker) Get(stateId string) AlertState {
st.stateCache.mu.Lock()
defer st.stateCache.mu.Unlock()
return st.stateCache.cacheMap[stateId]
}
// ResetCache drops every cached alert state by installing a fresh, empty map
// while the cache mutex is held. Used to ensure a clean cache on startup.
func (st *StateTracker) ResetCache() {
	emptied := map[string]AlertState{}

	st.stateCache.mu.Lock()
	defer st.stateCache.mu.Unlock()
	st.stateCache.cacheMap = emptied
}
func (st *StateTracker) ProcessEvalResults(uid string, results eval.Results, condition ngModels.Condition) []AlertState {
st.Log.Info("state tracker processing evaluation results", "uid", uid, "resultCount", len(results))
var changedStates []AlertState
for _, result := range results {
if s, ok := st.setNextState(uid, result); ok {
changedStates = append(changedStates, s)
}
s, _ := st.setNextState(uid, condition.OrgID, result)
changedStates = append(changedStates, s)
}
st.Log.Debug("returning changed states to scheduler", "count", len(changedStates))
return changedStates
@@ -99,34 +110,43 @@ func (st *StateTracker) ProcessEvalResults(uid string, results eval.Results, con
// 3. The base interval defined by the scheduler - in the case where #2 is not yet an option we can use the base interval at which every alert runs.
//Set the current state based on evaluation results
//return the state and a bool indicating whether a state transition occurred
func (st *StateTracker) setNextState(uid string, result eval.Result) (AlertState, bool) {
currentState := st.getOrCreate(uid, result)
func (st *StateTracker) setNextState(uid string, orgId int64, result eval.Result) (AlertState, bool) {
currentState := st.getOrCreate(uid, orgId, result)
st.Log.Debug("setting alert state", "uid", uid)
switch {
case currentState.State == result.State:
st.Log.Debug("no state transition", "cacheId", currentState.CacheId, "state", currentState.State.String())
currentState.EvaluatedAt = strfmt.DateTime(result.EvaluatedAt)
currentState.Results = append(currentState.Results, result.State)
currentState.LastEvaluationTime = result.EvaluatedAt
currentState.Results = append(currentState.Results, StateEvaluation{
EvaluationTime: result.EvaluatedAt,
EvaluationState: result.State,
})
if currentState.State == eval.Alerting {
currentState.EndsAt = strfmt.DateTime(result.EvaluatedAt.Add(40 * time.Second))
currentState.EndsAt = result.EvaluatedAt.Add(40 * time.Second)
}
st.set(currentState)
return currentState, false
case currentState.State == eval.Normal && result.State == eval.Alerting:
st.Log.Debug("state transition from normal to alerting", "cacheId", currentState.CacheId)
currentState.State = eval.Alerting
currentState.EvaluatedAt = strfmt.DateTime(result.EvaluatedAt)
currentState.StartsAt = strfmt.DateTime(result.EvaluatedAt)
currentState.EndsAt = strfmt.DateTime(result.EvaluatedAt.Add(40 * time.Second))
currentState.Results = append(currentState.Results, result.State)
currentState.LastEvaluationTime = result.EvaluatedAt
currentState.StartsAt = result.EvaluatedAt
currentState.EndsAt = result.EvaluatedAt.Add(40 * time.Second)
currentState.Results = append(currentState.Results, StateEvaluation{
EvaluationTime: result.EvaluatedAt,
EvaluationState: result.State,
})
st.set(currentState)
return currentState, true
case currentState.State == eval.Alerting && result.State == eval.Normal:
st.Log.Debug("state transition from alerting to normal", "cacheId", currentState.CacheId)
currentState.State = eval.Normal
currentState.EvaluatedAt = strfmt.DateTime(result.EvaluatedAt)
currentState.EndsAt = strfmt.DateTime(result.EvaluatedAt)
currentState.Results = append(currentState.Results, result.State)
currentState.LastEvaluationTime = result.EvaluatedAt
currentState.EndsAt = result.EvaluatedAt
currentState.Results = append(currentState.Results, StateEvaluation{
EvaluationTime: result.EvaluatedAt,
EvaluationState: result.State,
})
st.set(currentState)
return currentState, true
default:
@@ -134,6 +154,16 @@ func (st *StateTracker) setNextState(uid string, result eval.Result) (AlertState
}
}
// GetAll returns every AlertState value currently held in the cache.
//
// The cache mutex is held while the entries are collected. The order of the
// returned slice is unspecified because it follows map iteration order. An
// empty cache yields a nil slice.
func (st *StateTracker) GetAll() []AlertState {
	st.stateCache.mu.Lock()
	defer st.stateCache.mu.Unlock()

	if len(st.stateCache.cacheMap) == 0 {
		// Keep returning nil (not an empty slice) for an empty cache so
		// callers observe the same value as before.
		return nil
	}

	// The final length is known under the lock, so allocate once instead of
	// growing the slice through repeated appends.
	states := make([]AlertState, 0, len(st.stateCache.cacheMap))
	for _, v := range st.stateCache.cacheMap {
		states = append(states, v)
	}
	return states
}
func (st *StateTracker) cleanUp() {
ticker := time.NewTicker(time.Duration(60) * time.Minute)
st.Log.Debug("starting cleanup process", "intervalMinutes", 60)
@@ -150,16 +180,33 @@ func (st *StateTracker) cleanUp() {
}
// trim walks the whole cache under the cache mutex and, for every entry whose
// Results history holds more than 100 items, replaces that history with a
// 100-element slice and writes the entry back through set.
//
// NOTE(review): the two Log.Info lines and the two newResults declarations
// look like removed and added diff lines rendered together.
func (st *StateTracker) trim() {
st.Log.Info("trimming alert state cache")
st.Log.Info("trimming alert state cache", "now", time.Now())
st.stateCache.mu.Lock()
defer st.stateCache.mu.Unlock()
for _, v := range st.stateCache.cacheMap {
if len(v.Results) > 100 {
// The logged count is the number of entries beyond the 100-item cap.
st.Log.Debug("trimming result set", "cacheId", v.CacheId, "count", len(v.Results)-100)
newResults := make([]eval.State, 100)
newResults := make([]StateEvaluation, 100)
// NOTE(review): v.Results[100:] is everything from index 100 onward, not the
// newest 100 entries. With 101-199 results the tail of newResults is left as
// zero values; with more than 200 the newest entries are dropped. Keeping the
// latest 100 would need v.Results[len(v.Results)-100:].
copy(newResults, v.Results[100:])
v.Results = newResults
// NOTE(review): set is called while stateCache.mu is still held by this
// function. set's locking is not visible in this view; if it also takes
// stateCache.mu and that mutex is not reentrant (e.g. sync.Mutex), this call
// would block forever - confirm.
st.set(v)
}
}
}
// Equals reports whether a and b describe the same alert state.
//
// It compares UID, OrgID, CacheId, the string forms of Labels and State, and
// the StartsAt, EndsAt and LastEvaluationTime timestamps. Results is not part
// of the comparison.
//
// Timestamps are compared with time.Time.Equal rather than ==. The == operator
// also compares the Location and any monotonic clock reading, so two values
// for the same instant could otherwise be reported as different.
func (a AlertState) Equals(b AlertState) bool {
	return a.UID == b.UID &&
		a.OrgID == b.OrgID &&
		a.CacheId == b.CacheId &&
		a.Labels.String() == b.Labels.String() &&
		a.State.String() == b.State.String() &&
		a.StartsAt.Equal(b.StartsAt) &&
		a.EndsAt.Equal(b.EndsAt) &&
		a.LastEvaluationTime.Equal(b.LastEvaluationTime)
}
// Put writes each of the given states into the cache by handing them to set
// one at a time, in slice order. A nil or empty slice is a no-op.
func (st *StateTracker) Put(states []AlertState) {
	for i := range states {
		st.set(states[i])
	}
}

View File

@@ -1,13 +1,12 @@
package state
import (
"fmt"
"testing"
"time"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/go-openapi/strfmt"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
"github.com/grafana/grafana/pkg/services/ngalert/models"
@@ -34,22 +33,31 @@ func TestProcessEvalResults(t *testing.T) {
uid: "test_uid",
evalResults: eval.Results{
eval.Result{
Instance: data.Labels{"label1": "value1", "label2": "value2"},
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
EvaluatedAt: evaluationTime,
},
},
condition: models.Condition{
Condition: "A",
OrgID: 123,
},
expectedState: eval.Normal,
expectedReturnedStateCount: 0,
expectedResultCount: 1,
expectedCacheEntries: []AlertState{
{
UID: "test_uid",
CacheId: "test_uid label1=value1, label2=value2",
Labels: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
Results: []eval.State{eval.Normal},
StartsAt: strfmt.DateTime{},
EndsAt: strfmt.DateTime{},
EvaluatedAt: strfmt.DateTime(evaluationTime),
UID: "test_uid",
OrgID: 123,
CacheId: "test_uid label1=value1, label2=value2",
Labels: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
Results: []StateEvaluation{
{EvaluationTime: evaluationTime, EvaluationState: eval.Normal},
},
StartsAt: time.Time{},
EndsAt: time.Time{},
LastEvaluationTime: evaluationTime,
},
},
},
@@ -58,27 +66,37 @@ func TestProcessEvalResults(t *testing.T) {
uid: "test_uid",
evalResults: eval.Results{
eval.Result{
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
EvaluatedAt: evaluationTime,
},
eval.Result{
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
EvaluatedAt: evaluationTime.Add(1 * time.Minute),
},
},
condition: models.Condition{
Condition: "A",
OrgID: 123,
},
expectedState: eval.Alerting,
expectedReturnedStateCount: 1,
expectedResultCount: 2,
expectedCacheEntries: []AlertState{
{
UID: "test_uid",
CacheId: "test_uid label1=value1, label2=value2",
Labels: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
Results: []eval.State{eval.Normal, eval.Alerting},
StartsAt: strfmt.DateTime{},
EndsAt: strfmt.DateTime{},
EvaluatedAt: strfmt.DateTime(evaluationTime),
UID: "test_uid",
OrgID: 123,
CacheId: "test_uid label1=value1, label2=value2",
Labels: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
Results: []StateEvaluation{
{EvaluationTime: evaluationTime, EvaluationState: eval.Normal},
{EvaluationTime: evaluationTime.Add(1 * time.Minute), EvaluationState: eval.Alerting},
},
StartsAt: evaluationTime.Add(1 * time.Minute),
EndsAt: evaluationTime.Add(100 * time.Second),
LastEvaluationTime: evaluationTime.Add(1 * time.Minute),
},
},
},
@@ -87,27 +105,37 @@ func TestProcessEvalResults(t *testing.T) {
uid: "test_uid",
evalResults: eval.Results{
eval.Result{
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
EvaluatedAt: evaluationTime,
},
eval.Result{
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
EvaluatedAt: evaluationTime.Add(1 * time.Minute),
},
},
condition: models.Condition{
Condition: "A",
OrgID: 123,
},
expectedState: eval.Normal,
expectedReturnedStateCount: 1,
expectedResultCount: 2,
expectedCacheEntries: []AlertState{
{
UID: "test_uid",
CacheId: "test_uid label1=value1, label2=value2",
Labels: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
Results: []eval.State{eval.Alerting, eval.Normal},
StartsAt: strfmt.DateTime{},
EndsAt: strfmt.DateTime{},
EvaluatedAt: strfmt.DateTime(evaluationTime),
UID: "test_uid",
OrgID: 123,
CacheId: "test_uid label1=value1, label2=value2",
Labels: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
Results: []StateEvaluation{
{EvaluationTime: evaluationTime, EvaluationState: eval.Alerting},
{EvaluationTime: evaluationTime.Add(1 * time.Minute), EvaluationState: eval.Normal},
},
StartsAt: time.Time{},
EndsAt: evaluationTime.Add(1 * time.Minute),
LastEvaluationTime: evaluationTime.Add(1 * time.Minute),
},
},
},
@@ -116,27 +144,37 @@ func TestProcessEvalResults(t *testing.T) {
uid: "test_uid",
evalResults: eval.Results{
eval.Result{
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
EvaluatedAt: evaluationTime,
},
eval.Result{
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
EvaluatedAt: evaluationTime.Add(1 * time.Minute),
},
},
condition: models.Condition{
Condition: "A",
OrgID: 123,
},
expectedState: eval.Alerting,
expectedReturnedStateCount: 0,
expectedResultCount: 2,
expectedCacheEntries: []AlertState{
{
UID: "test_uid",
CacheId: "test_uid label1=value1, label2=value2",
Labels: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
Results: []eval.State{eval.Alerting, eval.Alerting},
StartsAt: strfmt.DateTime{},
EndsAt: strfmt.DateTime{},
EvaluatedAt: strfmt.DateTime(evaluationTime),
UID: "test_uid",
OrgID: 123,
CacheId: "test_uid label1=value1, label2=value2",
Labels: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
Results: []StateEvaluation{
{EvaluationTime: evaluationTime, EvaluationState: eval.Alerting},
{EvaluationTime: evaluationTime.Add(1 * time.Minute), EvaluationState: eval.Alerting},
},
StartsAt: time.Time{},
EndsAt: evaluationTime.Add(100 * time.Second),
LastEvaluationTime: evaluationTime.Add(1 * time.Minute),
},
},
},
@@ -145,64 +183,103 @@ func TestProcessEvalResults(t *testing.T) {
uid: "test_uid",
evalResults: eval.Results{
eval.Result{
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
EvaluatedAt: evaluationTime,
},
eval.Result{
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
EvaluatedAt: evaluationTime.Add(1 * time.Minute),
},
},
condition: models.Condition{
Condition: "A",
OrgID: 123,
},
expectedState: eval.Normal,
expectedReturnedStateCount: 0,
expectedResultCount: 2,
expectedCacheEntries: []AlertState{
{
UID: "test_uid",
CacheId: "test_uid label1=value1, label2=value2",
Labels: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
Results: []eval.State{eval.Normal, eval.Normal},
StartsAt: strfmt.DateTime{},
EndsAt: strfmt.DateTime{},
EvaluatedAt: strfmt.DateTime(evaluationTime),
UID: "test_uid",
OrgID: 123,
CacheId: "test_uid label1=value1, label2=value2",
Labels: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
Results: []StateEvaluation{
{evaluationTime, eval.Normal},
{EvaluationTime: evaluationTime.Add(1 * time.Minute), EvaluationState: eval.Normal},
},
StartsAt: time.Time{},
EndsAt: time.Time{},
LastEvaluationTime: evaluationTime.Add(1 * time.Minute),
},
},
},
}
for _, tc := range testCases {
t.Run("the correct number of entries are added to the cache", func(t *testing.T) {
t.Run("all fields for a cache entry are set correctly", func(t *testing.T) {
st := NewStateTracker(log.New("test_state_tracker"))
_ = st.ProcessEvalResults(tc.uid, tc.evalResults, tc.condition)
for _, entry := range tc.expectedCacheEntries {
if !entry.Equals(st.Get(entry.CacheId)) {
t.Log(tc.desc)
printEntryDiff(entry, st.Get(entry.CacheId), t)
}
assert.True(t, entry.Equals(st.Get(entry.CacheId)))
}
})
t.Run("the expected number of entries are added to the cache", func(t *testing.T) {
st := NewStateTracker(log.New("test_state_tracker"))
st.ProcessEvalResults(tc.uid, tc.evalResults, tc.condition)
assert.Equal(t, len(tc.expectedCacheEntries), len(st.stateCache.cacheMap))
})
t.Run("the correct state is set for each evaluation result", func(t *testing.T) {
st := NewStateTracker(log.New("test_state_tracker"))
st.ProcessEvalResults(tc.uid, tc.evalResults, tc.condition)
for _, entry := range tc.expectedCacheEntries {
testState := st.get(entry.CacheId)
assert.Equal(t, tc.expectedState, testState.State)
}
})
t.Run("the correct number of states are returned to the caller", func(t *testing.T) {
//This test, as configured, does not quite represent the behavior of the system.
//It is expected that each batch of evaluation results will have only one result
//for a unique set of labels.
t.Run("the expected number of states are returned to the caller", func(t *testing.T) {
st := NewStateTracker(log.New("test_state_tracker"))
results := st.ProcessEvalResults(tc.uid, tc.evalResults, tc.condition)
assert.Equal(t, tc.expectedReturnedStateCount, len(results))
})
t.Run("the correct results are set for each cache entry", func(t *testing.T) {
st := NewStateTracker(log.New("test_state_tracker"))
_ = st.ProcessEvalResults(tc.uid, tc.evalResults, tc.condition)
for _, entry := range tc.expectedCacheEntries {
testState := st.get(entry.CacheId)
assert.Equal(t, len(entry.Results), len(testState.Results))
for i, res := range entry.Results {
assert.Equal(t, res, testState.Results[i])
}
}
assert.Equal(t, len(tc.evalResults), len(results))
})
}
}
// printEntryDiff logs, through t, every field on which a and b differ, one
// "a-value <tab> b-value" line per field. It is a diagnostic aid for failed
// AlertState comparisons and never fails the test itself.
//
// State is reported as well: AlertState.Equals compares it, so a State-only
// mismatch would otherwise fail the assertion without any explanation in the
// log. Results are dumped in full only when the two histories differ in length.
func printEntryDiff(a, b AlertState, t *testing.T) {
	// Attribute the log lines to the calling test rather than to this helper.
	t.Helper()

	if a.UID != b.UID {
		t.Log(fmt.Sprintf("%v \t %v\n", a.UID, b.UID))
	}
	if a.OrgID != b.OrgID {
		t.Log(fmt.Sprintf("%v \t %v\n", a.OrgID, b.OrgID))
	}
	if a.CacheId != b.CacheId {
		t.Log(fmt.Sprintf("%v \t %v\n", a.CacheId, b.CacheId))
	}
	if !a.Labels.Equals(b.Labels) {
		t.Log(fmt.Sprintf("%v \t %v\n", a.Labels, b.Labels))
	}
	// Mirrors the State check in Equals, which compares the string forms.
	if a.State.String() != b.State.String() {
		t.Log(fmt.Sprintf("%v \t %v\n", a.State.String(), b.State.String()))
	}
	if a.StartsAt != b.StartsAt {
		t.Log(fmt.Sprintf("%v \t %v\n", a.StartsAt, b.StartsAt))
	}
	if a.EndsAt != b.EndsAt {
		t.Log(fmt.Sprintf("%v \t %v\n", a.EndsAt, b.EndsAt))
	}
	if a.LastEvaluationTime != b.LastEvaluationTime {
		t.Log(fmt.Sprintf("%v \t %v\n", a.LastEvaluationTime, b.LastEvaluationTime))
	}
	if len(a.Results) != len(b.Results) {
		t.Log(fmt.Sprintf("a: %d b: %d", len(a.Results), len(b.Results)))
		t.Log("a")
		for _, res := range a.Results {
			t.Log(fmt.Sprintf("%v\n", res))
		}
		t.Log("b")
		for _, res := range b.Results {
			t.Log(fmt.Sprintf("%v\n", res))
		}
	}
}