mirror of
https://github.com/grafana/grafana.git
synced 2024-11-26 02:40:26 -06:00
Alerting: Resolve stale state + add state reason to notifications (#49352)
* adds a new reserved annotation `grafana_state_reason` * explicitly resolve stale states
This commit is contained in:
parent
9c2f3045b4
commit
199996cbf9
@ -98,6 +98,13 @@ const (
|
||||
|
||||
// FolderTitleLabel is the label that will contain the title of an alert's folder/namespace.
|
||||
FolderTitleLabel = GrafanaReservedLabelPrefix + "folder"
|
||||
|
||||
// StateReasonAnnotation is the name of the annotation that explains the difference between evaluation state and alert state (i.e. changing state when NoData or Error).
|
||||
StateReasonAnnotation = GrafanaReservedLabelPrefix + "state_reason"
|
||||
)
|
||||
|
||||
var (
|
||||
StateReasonMissingSeries = "MissingSeries"
|
||||
)
|
||||
|
||||
var (
|
||||
|
@ -43,6 +43,10 @@ func stateToPostableAlert(alertState *state.State, appURL *url.URL) *models.Post
|
||||
nA[ngModels.ImageTokenAnnotation] = alertState.Image.Token
|
||||
}
|
||||
|
||||
if alertState.StateReason != "" {
|
||||
nA[ngModels.StateReasonAnnotation] = alertState.StateReason
|
||||
}
|
||||
|
||||
var urlStr string
|
||||
if uid := nL[ngModels.RuleUIDLabel]; len(uid) > 0 && appURL != nil {
|
||||
u := *appURL
|
||||
@ -124,6 +128,9 @@ func FromAlertStateToPostableAlerts(firingStates []*state.State, stateManager *s
|
||||
}
|
||||
alert := stateToPostableAlert(alertState, appURL)
|
||||
alerts.PostableAlerts = append(alerts.PostableAlerts, *alert)
|
||||
if alertState.StateReason == ngModels.StateReasonMissingSeries { // do not put stale state back to state manager
|
||||
continue
|
||||
}
|
||||
alertState.LastSentAt = ts
|
||||
sentAlerts = append(sentAlerts, alertState)
|
||||
}
|
||||
|
@ -135,6 +135,13 @@ func Test_stateToPostableAlert(t *testing.T) {
|
||||
})
|
||||
})
|
||||
|
||||
t.Run("should add state reason annotation if not empty", func(t *testing.T) {
|
||||
alertState := randomState(tc.state)
|
||||
alertState.StateReason = "TEST_STATE_REASON"
|
||||
result := stateToPostableAlert(alertState, appURL)
|
||||
require.Equal(t, alertState.StateReason, result.Annotations[ngModels.StateReasonAnnotation])
|
||||
})
|
||||
|
||||
switch tc.state {
|
||||
case eval.NoData:
|
||||
t.Run("should keep existing labels and change name", func(t *testing.T) {
|
||||
|
@ -182,7 +182,7 @@ func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time
|
||||
states = append(states, s)
|
||||
processedResults[s.CacheId] = s
|
||||
}
|
||||
st.staleResultsHandler(ctx, evaluatedAt, alertRule, processedResults)
|
||||
resolvedStates := st.staleResultsHandler(ctx, evaluatedAt, alertRule, processedResults)
|
||||
if len(states) > 0 {
|
||||
logger.Debug("saving new states to the database", "count", len(states))
|
||||
for _, state := range states {
|
||||
@ -191,7 +191,7 @@ func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time
|
||||
}
|
||||
}
|
||||
}
|
||||
return states
|
||||
return append(states, resolvedStates...)
|
||||
}
|
||||
|
||||
// Maybe take a screenshot. Do it if:
|
||||
@ -404,7 +404,8 @@ func (st *Manager) annotateState(ctx context.Context, alertRule *ngModels.AlertR
|
||||
}
|
||||
}
|
||||
|
||||
func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, states map[string]*State) {
|
||||
func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, states map[string]*State) []*State {
|
||||
var resolvedStates []*State
|
||||
allStates := st.GetStatesForRuleUID(alertRule.OrgID, alertRule.UID)
|
||||
for _, s := range allStates {
|
||||
_, ok := states[s.CacheId]
|
||||
@ -422,12 +423,20 @@ func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Tim
|
||||
}
|
||||
|
||||
if s.State == eval.Alerting {
|
||||
previousState := InstanceStateAndReason{State: s.State, Reason: s.StateReason}
|
||||
s.State = eval.Normal
|
||||
s.StateReason = ngModels.StateReasonMissingSeries
|
||||
s.EndsAt = evaluatedAt
|
||||
s.Resolved = true
|
||||
st.annotateState(ctx, alertRule, s.Labels, evaluatedAt,
|
||||
InstanceStateAndReason{State: eval.Normal, Reason: ""},
|
||||
InstanceStateAndReason{State: s.State, Reason: s.StateReason})
|
||||
InstanceStateAndReason{State: eval.Normal, Reason: s.StateReason},
|
||||
previousState,
|
||||
)
|
||||
resolvedStates = append(resolvedStates, s)
|
||||
}
|
||||
}
|
||||
}
|
||||
return resolvedStates
|
||||
}
|
||||
|
||||
func isItStale(evaluatedAt time.Time, lastEval time.Time, intervalSeconds int64) bool {
|
||||
|
@ -2156,3 +2156,101 @@ func TestStaleResultsHandler(t *testing.T) {
|
||||
assert.Equal(t, tc.finalStateCount, len(existingStatesForRule))
|
||||
}
|
||||
}
|
||||
|
||||
func TestStaleResults(t *testing.T) {
|
||||
getCacheID := func(t *testing.T, rule *models.AlertRule, result eval.Result) string {
|
||||
t.Helper()
|
||||
labels := data.Labels{}
|
||||
for key, value := range rule.Labels {
|
||||
labels[key] = value
|
||||
}
|
||||
for key, value := range result.Instance {
|
||||
labels[key] = value
|
||||
}
|
||||
lbls := models.InstanceLabels(labels)
|
||||
key, err := lbls.StringKey()
|
||||
require.NoError(t, err)
|
||||
return key
|
||||
}
|
||||
|
||||
checkExpectedStates := func(t *testing.T, actual []*state.State, expected map[string]struct{}) {
|
||||
t.Helper()
|
||||
require.Len(t, actual, len(expected))
|
||||
for _, currentState := range actual {
|
||||
_, ok := expected[currentState.CacheId]
|
||||
require.Truef(t, ok, "State %s is not expected. States: %v", currentState.CacheId, expected)
|
||||
}
|
||||
}
|
||||
|
||||
t.Run("should mark missing states as stale", func(t *testing.T) {
|
||||
// init
|
||||
ctx := context.Background()
|
||||
_, dbstore := tests.SetupTestEnv(t, 1)
|
||||
clk := clock.NewMock()
|
||||
clk.Set(time.Now())
|
||||
|
||||
st := state.NewManager(log.New("test_stale_results_handler"), testMetrics.GetStateMetrics(), nil, dbstore, dbstore, &dashboards.FakeDashboardService{}, &image.NoopImageService{}, clk)
|
||||
|
||||
orgID := rand.Int63()
|
||||
rule := tests.CreateTestAlertRule(t, ctx, dbstore, 10, orgID)
|
||||
|
||||
initResults := eval.Results{
|
||||
eval.Result{
|
||||
Instance: data.Labels{"test1": "testValue1"},
|
||||
State: eval.Alerting,
|
||||
EvaluatedAt: clk.Now(),
|
||||
},
|
||||
eval.Result{
|
||||
Instance: data.Labels{"test1": "testValue2"},
|
||||
State: eval.Alerting,
|
||||
EvaluatedAt: clk.Now(),
|
||||
},
|
||||
eval.Result{
|
||||
Instance: data.Labels{"test1": "testValue3"},
|
||||
State: eval.Normal,
|
||||
EvaluatedAt: clk.Now(),
|
||||
},
|
||||
}
|
||||
|
||||
initStates := map[string]struct{}{
|
||||
getCacheID(t, rule, initResults[0]): {},
|
||||
getCacheID(t, rule, initResults[1]): {},
|
||||
getCacheID(t, rule, initResults[2]): {},
|
||||
}
|
||||
|
||||
// Init
|
||||
processed := st.ProcessEvalResults(ctx, clk.Now(), rule, initResults, nil)
|
||||
checkExpectedStates(t, processed, initStates)
|
||||
currentStates := st.GetStatesForRuleUID(orgID, rule.UID)
|
||||
checkExpectedStates(t, currentStates, initStates)
|
||||
|
||||
staleDuration := 2 * time.Duration(rule.IntervalSeconds) * time.Second
|
||||
clk.Add(staleDuration)
|
||||
results := eval.Results{
|
||||
eval.Result{
|
||||
Instance: data.Labels{"test1": "testValue1"},
|
||||
State: eval.Alerting,
|
||||
EvaluatedAt: clk.Now(),
|
||||
},
|
||||
}
|
||||
clk.Add(time.Nanosecond) // we use time now when calculate stale states. Evaluation tick and real time are not the same. usually, difference is way greater than nanosecond.
|
||||
expectedStaleReturned := getCacheID(t, rule, initResults[1])
|
||||
processed = st.ProcessEvalResults(ctx, clk.Now(), rule, results, nil)
|
||||
checkExpectedStates(t, processed, map[string]struct{}{
|
||||
getCacheID(t, rule, results[0]): {},
|
||||
expectedStaleReturned: {},
|
||||
})
|
||||
for _, s := range processed {
|
||||
if s.CacheId == expectedStaleReturned {
|
||||
assert.Truef(t, s.Resolved, "Returned stale state should have Resolved set to true")
|
||||
assert.Equal(t, eval.Normal, s.State)
|
||||
assert.Equal(t, models.StateReasonMissingSeries, s.StateReason)
|
||||
break
|
||||
}
|
||||
}
|
||||
currentStates = st.GetStatesForRuleUID(orgID, rule.UID)
|
||||
checkExpectedStates(t, currentStates, map[string]struct{}{
|
||||
getCacheID(t, rule, results[0]): {},
|
||||
})
|
||||
})
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user