Alerting: Resolve stale state + add state reason to notifications (#49352)

* adds a new reserved annotation `grafana_state_reason`
* explicitly resolves stale states (a consumer-side sketch follows the changed-files summary below)
Yuriy Tseretyan 2022-09-21 13:24:47 -04:00 committed by GitHub
parent 9c2f3045b4
commit 199996cbf9
5 changed files with 133 additions and 5 deletions
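
Context for reviewers: the new annotation travels with every notification, so a receiver can tell a stale-series resolution apart from an ordinary recovery. A minimal runnable sketch of that consumer-side check — the helper name and payload shape are hypothetical, not part of this change; only the annotation key and value below are introduced by this commit:

```go
package main

import "fmt"

// resolvedDueToMissingSeries is a hypothetical consumer-side helper:
// given the annotations of an alert in a notification payload, it
// reports whether the alert was resolved because its series went stale.
func resolvedDueToMissingSeries(annotations map[string]string) bool {
	return annotations["grafana_state_reason"] == "MissingSeries"
}

func main() {
	ann := map[string]string{"grafana_state_reason": "MissingSeries"}
	fmt.Println(resolvedDueToMissingSeries(ann)) // true
}
```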


@@ -98,6 +98,13 @@ const (
 	// FolderTitleLabel is the label that will contain the title of an alert's folder/namespace.
 	FolderTitleLabel = GrafanaReservedLabelPrefix + "folder"
+	// StateReasonAnnotation is the name of the annotation that explains the difference between evaluation state and alert state (i.e. changing state when NoData or Error).
+	StateReasonAnnotation = GrafanaReservedLabelPrefix + "state_reason"
 )
+
+var (
+	StateReasonMissingSeries = "MissingSeries"
+)
+
 var (
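
For reference, the commit message gives the annotation as `grafana_state_reason`, which implies `GrafanaReservedLabelPrefix` resolves to `"grafana_"`. A self-contained sketch of how the new constants evaluate (not part of the diff):

```go
package main

import "fmt"

// Values as implied by the commit message; the prefix constant itself
// is defined elsewhere in the real codebase.
const GrafanaReservedLabelPrefix = "grafana_"

const StateReasonAnnotation = GrafanaReservedLabelPrefix + "state_reason"

var StateReasonMissingSeries = "MissingSeries"

func main() {
	fmt.Println(StateReasonAnnotation)    // grafana_state_reason
	fmt.Println(StateReasonMissingSeries) // MissingSeries
}
```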


@@ -43,6 +43,10 @@ func stateToPostableAlert(alertState *state.State, appURL *url.URL) *models.Post
 		nA[ngModels.ImageTokenAnnotation] = alertState.Image.Token
 	}
 
+	if alertState.StateReason != "" {
+		nA[ngModels.StateReasonAnnotation] = alertState.StateReason
+	}
+
 	var urlStr string
 	if uid := nL[ngModels.RuleUIDLabel]; len(uid) > 0 && appURL != nil {
 		u := *appURL
@@ -124,6 +128,9 @@ func FromAlertStateToPostableAlerts(firingStates []*state.State, stateManager *s
 		}
 		alert := stateToPostableAlert(alertState, appURL)
 		alerts.PostableAlerts = append(alerts.PostableAlerts, *alert)
+		if alertState.StateReason == ngModels.StateReasonMissingSeries { // do not put stale state back to state manager
+			continue
+		}
 		alertState.LastSentAt = ts
 		sentAlerts = append(sentAlerts, alertState)
 	}
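
The hunk above sends stale-resolved states to the notifier but deliberately skips the `LastSentAt` bookkeeping for them, since the state manager is about to evict those entries anyway. A runnable sketch of that control flow, using simplified stand-in types — `alertState`, `sendAndRecord`, and the println stand-in are hypothetical, not Grafana code:

```go
package main

import "fmt"

type alertState struct {
	StateReason string
	LastSentAt  int64
}

// sendAndRecord mirrors the loop above: every state, stale or not, is
// turned into a notification, but stale ones are not recorded as sent.
func sendAndRecord(states []*alertState, now int64) []*alertState {
	var sent []*alertState
	for _, s := range states {
		fmt.Println("posted alert, reason:", s.StateReason) // stands in for the Alertmanager call
		if s.StateReason == "MissingSeries" {
			continue // do not put stale state back to the state manager
		}
		s.LastSentAt = now
		sent = append(sent, s)
	}
	return sent
}

func main() {
	states := []*alertState{{StateReason: ""}, {StateReason: "MissingSeries"}}
	fmt.Println(len(sendAndRecord(states, 42))) // 1: the stale state was sent but not recorded
}
```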


@@ -135,6 +135,13 @@ func Test_stateToPostableAlert(t *testing.T) {
 			})
 		})
 
+		t.Run("should add state reason annotation if not empty", func(t *testing.T) {
+			alertState := randomState(tc.state)
+			alertState.StateReason = "TEST_STATE_REASON"
+			result := stateToPostableAlert(alertState, appURL)
+			require.Equal(t, alertState.StateReason, result.Annotations[ngModels.StateReasonAnnotation])
+		})
+
 		switch tc.state {
 		case eval.NoData:
 			t.Run("should keep existing labels and change name", func(t *testing.T) {


@@ -182,7 +182,7 @@ func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time
 		states = append(states, s)
 		processedResults[s.CacheId] = s
 	}
-	st.staleResultsHandler(ctx, evaluatedAt, alertRule, processedResults)
+	resolvedStates := st.staleResultsHandler(ctx, evaluatedAt, alertRule, processedResults)
 	if len(states) > 0 {
 		logger.Debug("saving new states to the database", "count", len(states))
 		for _, state := range states {
@@ -191,7 +191,7 @@ func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time
 			}
 		}
 	}
-	return states
+	return append(states, resolvedStates...)
 }
 
 // Maybe take a screenshot. Do it if:
@@ -404,7 +404,8 @@ func (st *Manager) annotateState(ctx context.Context, alertRule *ngModels.AlertR
 	}
 }
 
-func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, states map[string]*State) {
+func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, states map[string]*State) []*State {
+	var resolvedStates []*State
 	allStates := st.GetStatesForRuleUID(alertRule.OrgID, alertRule.UID)
 	for _, s := range allStates {
 		_, ok := states[s.CacheId]
@@ -422,12 +423,20 @@ func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Tim
 			}
 			if s.State == eval.Alerting {
+				previousState := InstanceStateAndReason{State: s.State, Reason: s.StateReason}
+				s.State = eval.Normal
+				s.StateReason = ngModels.StateReasonMissingSeries
+				s.EndsAt = evaluatedAt
+				s.Resolved = true
 				st.annotateState(ctx, alertRule, s.Labels, evaluatedAt,
-					InstanceStateAndReason{State: eval.Normal, Reason: ""},
-					InstanceStateAndReason{State: s.State, Reason: s.StateReason})
+					InstanceStateAndReason{State: eval.Normal, Reason: s.StateReason},
+					previousState,
+				)
+				resolvedStates = append(resolvedStates, s)
 			}
 		}
 	}
+	return resolvedStates
 }
 
 func isItStale(evaluatedAt time.Time, lastEval time.Time, intervalSeconds int64) bool {
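
Only the signature of `isItStale` appears in this hunk; its body is not part of the diff. From the test below (the clock advances two full evaluation intervals plus a nanosecond), a plausible reading is that a series counts as stale once two rule intervals have elapsed since its last evaluation. A self-contained sketch under that assumption — the body and the boundary behavior at exactly two intervals are inferred, not copied:

```go
package main

import (
	"fmt"
	"time"
)

// Sketch only: body inferred from TestStaleResults, not from the diff.
func isItStale(evaluatedAt time.Time, lastEval time.Time, intervalSeconds int64) bool {
	return !lastEval.Add(2 * time.Duration(intervalSeconds) * time.Second).After(evaluatedAt)
}

func main() {
	last := time.Now()
	fmt.Println(isItStale(last.Add(19*time.Second), last, 10)) // false: under two 10s intervals
	fmt.Println(isItStale(last.Add(21*time.Second), last, 10)) // true: past two intervals
}
```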


@@ -2156,3 +2156,101 @@ func TestStaleResultsHandler(t *testing.T) {
 		assert.Equal(t, tc.finalStateCount, len(existingStatesForRule))
 	}
 }
+
+func TestStaleResults(t *testing.T) {
+	getCacheID := func(t *testing.T, rule *models.AlertRule, result eval.Result) string {
+		t.Helper()
+		labels := data.Labels{}
+		for key, value := range rule.Labels {
+			labels[key] = value
+		}
+		for key, value := range result.Instance {
+			labels[key] = value
+		}
+		lbls := models.InstanceLabels(labels)
+		key, err := lbls.StringKey()
+		require.NoError(t, err)
+		return key
+	}
+
+	checkExpectedStates := func(t *testing.T, actual []*state.State, expected map[string]struct{}) {
+		t.Helper()
+		require.Len(t, actual, len(expected))
+		for _, currentState := range actual {
+			_, ok := expected[currentState.CacheId]
+			require.Truef(t, ok, "State %s is not expected. States: %v", currentState.CacheId, expected)
+		}
+	}
+
+	t.Run("should mark missing states as stale", func(t *testing.T) {
+		// init
+		ctx := context.Background()
+		_, dbstore := tests.SetupTestEnv(t, 1)
+		clk := clock.NewMock()
+		clk.Set(time.Now())
+		st := state.NewManager(log.New("test_stale_results_handler"), testMetrics.GetStateMetrics(), nil, dbstore, dbstore, &dashboards.FakeDashboardService{}, &image.NoopImageService{}, clk)
+		orgID := rand.Int63()
+		rule := tests.CreateTestAlertRule(t, ctx, dbstore, 10, orgID)
+
+		initResults := eval.Results{
+			eval.Result{
+				Instance:    data.Labels{"test1": "testValue1"},
+				State:       eval.Alerting,
+				EvaluatedAt: clk.Now(),
+			},
+			eval.Result{
+				Instance:    data.Labels{"test1": "testValue2"},
+				State:       eval.Alerting,
+				EvaluatedAt: clk.Now(),
+			},
+			eval.Result{
+				Instance:    data.Labels{"test1": "testValue3"},
+				State:       eval.Normal,
+				EvaluatedAt: clk.Now(),
+			},
+		}
+		initStates := map[string]struct{}{
+			getCacheID(t, rule, initResults[0]): {},
+			getCacheID(t, rule, initResults[1]): {},
+			getCacheID(t, rule, initResults[2]): {},
+		}
+
+		// Init
+		processed := st.ProcessEvalResults(ctx, clk.Now(), rule, initResults, nil)
+		checkExpectedStates(t, processed, initStates)
+		currentStates := st.GetStatesForRuleUID(orgID, rule.UID)
+		checkExpectedStates(t, currentStates, initStates)
+
+		staleDuration := 2 * time.Duration(rule.IntervalSeconds) * time.Second
+		clk.Add(staleDuration)
+		results := eval.Results{
+			eval.Result{
+				Instance:    data.Labels{"test1": "testValue1"},
+				State:       eval.Alerting,
+				EvaluatedAt: clk.Now(),
+			},
+		}
+		clk.Add(time.Nanosecond) // staleness is checked against the current time; the evaluation tick and real time are never exactly equal, and the difference is usually far greater than a nanosecond
+		expectedStaleReturned := getCacheID(t, rule, initResults[1])
+		processed = st.ProcessEvalResults(ctx, clk.Now(), rule, results, nil)
+		checkExpectedStates(t, processed, map[string]struct{}{
+			getCacheID(t, rule, results[0]): {},
+			expectedStaleReturned:           {},
+		})
+		for _, s := range processed {
+			if s.CacheId == expectedStaleReturned {
+				assert.Truef(t, s.Resolved, "Returned stale state should have Resolved set to true")
+				assert.Equal(t, eval.Normal, s.State)
+				assert.Equal(t, models.StateReasonMissingSeries, s.StateReason)
+				break
+			}
+		}
+		currentStates = st.GetStatesForRuleUID(orgID, rule.UID)
+		checkExpectedStates(t, currentStates, map[string]struct{}{
+			getCacheID(t, rule, results[0]): {},
+		})
+	})
+}