mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Resolve stale state + add state reason to notifications (#49352)
* Adds a new reserved annotation `grafana_state_reason`.
* Explicitly resolves stale states.
This commit is contained in:
parent
9c2f3045b4
commit
199996cbf9
@ -98,6 +98,13 @@ const (
|
|||||||
|
|
||||||
// FolderTitleLabel is the label that will contain the title of an alert's folder/namespace.
|
// FolderTitleLabel is the label that will contain the title of an alert's folder/namespace.
|
||||||
FolderTitleLabel = GrafanaReservedLabelPrefix + "folder"
|
FolderTitleLabel = GrafanaReservedLabelPrefix + "folder"
|
||||||
|
|
||||||
|
// StateReasonAnnotation is the name of the annotation that explains why the alert state differs from the evaluation state (e.g. when the state is changed because of NoData or Error).
|
||||||
|
StateReasonAnnotation = GrafanaReservedLabelPrefix + "state_reason"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
StateReasonMissingSeries = "MissingSeries"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
|
@ -43,6 +43,10 @@ func stateToPostableAlert(alertState *state.State, appURL *url.URL) *models.Post
|
|||||||
nA[ngModels.ImageTokenAnnotation] = alertState.Image.Token
|
nA[ngModels.ImageTokenAnnotation] = alertState.Image.Token
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if alertState.StateReason != "" {
|
||||||
|
nA[ngModels.StateReasonAnnotation] = alertState.StateReason
|
||||||
|
}
|
||||||
|
|
||||||
var urlStr string
|
var urlStr string
|
||||||
if uid := nL[ngModels.RuleUIDLabel]; len(uid) > 0 && appURL != nil {
|
if uid := nL[ngModels.RuleUIDLabel]; len(uid) > 0 && appURL != nil {
|
||||||
u := *appURL
|
u := *appURL
|
||||||
@ -124,6 +128,9 @@ func FromAlertStateToPostableAlerts(firingStates []*state.State, stateManager *s
|
|||||||
}
|
}
|
||||||
alert := stateToPostableAlert(alertState, appURL)
|
alert := stateToPostableAlert(alertState, appURL)
|
||||||
alerts.PostableAlerts = append(alerts.PostableAlerts, *alert)
|
alerts.PostableAlerts = append(alerts.PostableAlerts, *alert)
|
||||||
|
if alertState.StateReason == ngModels.StateReasonMissingSeries { // do not put stale state back to state manager
|
||||||
|
continue
|
||||||
|
}
|
||||||
alertState.LastSentAt = ts
|
alertState.LastSentAt = ts
|
||||||
sentAlerts = append(sentAlerts, alertState)
|
sentAlerts = append(sentAlerts, alertState)
|
||||||
}
|
}
|
||||||
|
@ -135,6 +135,13 @@ func Test_stateToPostableAlert(t *testing.T) {
|
|||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
|
t.Run("should add state reason annotation if not empty", func(t *testing.T) {
|
||||||
|
alertState := randomState(tc.state)
|
||||||
|
alertState.StateReason = "TEST_STATE_REASON"
|
||||||
|
result := stateToPostableAlert(alertState, appURL)
|
||||||
|
require.Equal(t, alertState.StateReason, result.Annotations[ngModels.StateReasonAnnotation])
|
||||||
|
})
|
||||||
|
|
||||||
switch tc.state {
|
switch tc.state {
|
||||||
case eval.NoData:
|
case eval.NoData:
|
||||||
t.Run("should keep existing labels and change name", func(t *testing.T) {
|
t.Run("should keep existing labels and change name", func(t *testing.T) {
|
||||||
|
@ -182,7 +182,7 @@ func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time
|
|||||||
states = append(states, s)
|
states = append(states, s)
|
||||||
processedResults[s.CacheId] = s
|
processedResults[s.CacheId] = s
|
||||||
}
|
}
|
||||||
st.staleResultsHandler(ctx, evaluatedAt, alertRule, processedResults)
|
resolvedStates := st.staleResultsHandler(ctx, evaluatedAt, alertRule, processedResults)
|
||||||
if len(states) > 0 {
|
if len(states) > 0 {
|
||||||
logger.Debug("saving new states to the database", "count", len(states))
|
logger.Debug("saving new states to the database", "count", len(states))
|
||||||
for _, state := range states {
|
for _, state := range states {
|
||||||
@ -191,7 +191,7 @@ func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return states
|
return append(states, resolvedStates...)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Maybe take a screenshot. Do it if:
|
// Maybe take a screenshot. Do it if:
|
||||||
@ -404,7 +404,8 @@ func (st *Manager) annotateState(ctx context.Context, alertRule *ngModels.AlertR
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, states map[string]*State) {
|
func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, states map[string]*State) []*State {
|
||||||
|
var resolvedStates []*State
|
||||||
allStates := st.GetStatesForRuleUID(alertRule.OrgID, alertRule.UID)
|
allStates := st.GetStatesForRuleUID(alertRule.OrgID, alertRule.UID)
|
||||||
for _, s := range allStates {
|
for _, s := range allStates {
|
||||||
_, ok := states[s.CacheId]
|
_, ok := states[s.CacheId]
|
||||||
@ -422,12 +423,20 @@ func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Tim
|
|||||||
}
|
}
|
||||||
|
|
||||||
if s.State == eval.Alerting {
|
if s.State == eval.Alerting {
|
||||||
|
previousState := InstanceStateAndReason{State: s.State, Reason: s.StateReason}
|
||||||
|
s.State = eval.Normal
|
||||||
|
s.StateReason = ngModels.StateReasonMissingSeries
|
||||||
|
s.EndsAt = evaluatedAt
|
||||||
|
s.Resolved = true
|
||||||
st.annotateState(ctx, alertRule, s.Labels, evaluatedAt,
|
st.annotateState(ctx, alertRule, s.Labels, evaluatedAt,
|
||||||
InstanceStateAndReason{State: eval.Normal, Reason: ""},
|
InstanceStateAndReason{State: eval.Normal, Reason: s.StateReason},
|
||||||
InstanceStateAndReason{State: s.State, Reason: s.StateReason})
|
previousState,
|
||||||
|
)
|
||||||
|
resolvedStates = append(resolvedStates, s)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return resolvedStates
|
||||||
}
|
}
|
||||||
|
|
||||||
func isItStale(evaluatedAt time.Time, lastEval time.Time, intervalSeconds int64) bool {
|
func isItStale(evaluatedAt time.Time, lastEval time.Time, intervalSeconds int64) bool {
|
||||||
|
@ -2156,3 +2156,101 @@ func TestStaleResultsHandler(t *testing.T) {
|
|||||||
assert.Equal(t, tc.finalStateCount, len(existingStatesForRule))
|
assert.Equal(t, tc.finalStateCount, len(existingStatesForRule))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestStaleResults(t *testing.T) {
|
||||||
|
getCacheID := func(t *testing.T, rule *models.AlertRule, result eval.Result) string {
|
||||||
|
t.Helper()
|
||||||
|
labels := data.Labels{}
|
||||||
|
for key, value := range rule.Labels {
|
||||||
|
labels[key] = value
|
||||||
|
}
|
||||||
|
for key, value := range result.Instance {
|
||||||
|
labels[key] = value
|
||||||
|
}
|
||||||
|
lbls := models.InstanceLabels(labels)
|
||||||
|
key, err := lbls.StringKey()
|
||||||
|
require.NoError(t, err)
|
||||||
|
return key
|
||||||
|
}
|
||||||
|
|
||||||
|
checkExpectedStates := func(t *testing.T, actual []*state.State, expected map[string]struct{}) {
|
||||||
|
t.Helper()
|
||||||
|
require.Len(t, actual, len(expected))
|
||||||
|
for _, currentState := range actual {
|
||||||
|
_, ok := expected[currentState.CacheId]
|
||||||
|
require.Truef(t, ok, "State %s is not expected. States: %v", currentState.CacheId, expected)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Run("should mark missing states as stale", func(t *testing.T) {
|
||||||
|
// init
|
||||||
|
ctx := context.Background()
|
||||||
|
_, dbstore := tests.SetupTestEnv(t, 1)
|
||||||
|
clk := clock.NewMock()
|
||||||
|
clk.Set(time.Now())
|
||||||
|
|
||||||
|
st := state.NewManager(log.New("test_stale_results_handler"), testMetrics.GetStateMetrics(), nil, dbstore, dbstore, &dashboards.FakeDashboardService{}, &image.NoopImageService{}, clk)
|
||||||
|
|
||||||
|
orgID := rand.Int63()
|
||||||
|
rule := tests.CreateTestAlertRule(t, ctx, dbstore, 10, orgID)
|
||||||
|
|
||||||
|
initResults := eval.Results{
|
||||||
|
eval.Result{
|
||||||
|
Instance: data.Labels{"test1": "testValue1"},
|
||||||
|
State: eval.Alerting,
|
||||||
|
EvaluatedAt: clk.Now(),
|
||||||
|
},
|
||||||
|
eval.Result{
|
||||||
|
Instance: data.Labels{"test1": "testValue2"},
|
||||||
|
State: eval.Alerting,
|
||||||
|
EvaluatedAt: clk.Now(),
|
||||||
|
},
|
||||||
|
eval.Result{
|
||||||
|
Instance: data.Labels{"test1": "testValue3"},
|
||||||
|
State: eval.Normal,
|
||||||
|
EvaluatedAt: clk.Now(),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
initStates := map[string]struct{}{
|
||||||
|
getCacheID(t, rule, initResults[0]): {},
|
||||||
|
getCacheID(t, rule, initResults[1]): {},
|
||||||
|
getCacheID(t, rule, initResults[2]): {},
|
||||||
|
}
|
||||||
|
|
||||||
|
// Init
|
||||||
|
processed := st.ProcessEvalResults(ctx, clk.Now(), rule, initResults, nil)
|
||||||
|
checkExpectedStates(t, processed, initStates)
|
||||||
|
currentStates := st.GetStatesForRuleUID(orgID, rule.UID)
|
||||||
|
checkExpectedStates(t, currentStates, initStates)
|
||||||
|
|
||||||
|
staleDuration := 2 * time.Duration(rule.IntervalSeconds) * time.Second
|
||||||
|
clk.Add(staleDuration)
|
||||||
|
results := eval.Results{
|
||||||
|
eval.Result{
|
||||||
|
Instance: data.Labels{"test1": "testValue1"},
|
||||||
|
State: eval.Alerting,
|
||||||
|
EvaluatedAt: clk.Now(),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
clk.Add(time.Nanosecond) // we use time now when calculate stale states. Evaluation tick and real time are not the same. usually, difference is way greater than nanosecond.
|
||||||
|
expectedStaleReturned := getCacheID(t, rule, initResults[1])
|
||||||
|
processed = st.ProcessEvalResults(ctx, clk.Now(), rule, results, nil)
|
||||||
|
checkExpectedStates(t, processed, map[string]struct{}{
|
||||||
|
getCacheID(t, rule, results[0]): {},
|
||||||
|
expectedStaleReturned: {},
|
||||||
|
})
|
||||||
|
for _, s := range processed {
|
||||||
|
if s.CacheId == expectedStaleReturned {
|
||||||
|
assert.Truef(t, s.Resolved, "Returned stale state should have Resolved set to true")
|
||||||
|
assert.Equal(t, eval.Normal, s.State)
|
||||||
|
assert.Equal(t, models.StateReasonMissingSeries, s.StateReason)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
currentStates = st.GetStatesForRuleUID(orgID, rule.UID)
|
||||||
|
checkExpectedStates(t, currentStates, map[string]struct{}{
|
||||||
|
getCacheID(t, rule, results[0]): {},
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user