mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: fix updating error in the alert rule state during error to error transitions and restarts (#89557)
Alerting: fix preserving errors in the alert rule state during error-to-error transitions. An alert state transition from one error to another did not update state.Error correctly: state.Error kept the first error that was encountered. This led to a second issue: after a Grafana restart the error was lost, because the state of the alert rule had not changed and the Error is not preserved in the database between restarts. This could happen if the expression service returned an error or the alert routine panicked during querying.
This commit is contained in:
parent
3a29f68d0c
commit
2035814059
@ -3521,6 +3521,123 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "t1[QueryError] t2[GenericError] at t1, t2",
|
||||
|
||||
results: map[time.Time]eval.Results{
|
||||
t1: {
|
||||
newResult(eval.WithError(datasourceError)),
|
||||
},
|
||||
t2: {
|
||||
newResult(eval.WithError(genericError)),
|
||||
},
|
||||
},
|
||||
expectedTransitions: map[ngmodels.ExecutionErrorState]map[time.Time][]StateTransition{
|
||||
ngmodels.ErrorErrState: {
|
||||
t1: {
|
||||
{
|
||||
PreviousState: eval.Normal,
|
||||
State: &State{
|
||||
CacheID: labels["system + rule"].Fingerprint(),
|
||||
Labels: labels["system + rule + datasource-error"],
|
||||
State: eval.Error,
|
||||
Error: datasourceError,
|
||||
LatestResult: newEvaluation(t1, eval.Error),
|
||||
StartsAt: t1,
|
||||
EndsAt: t1.Add(ResendDelay * 4),
|
||||
LastEvaluationTime: t1,
|
||||
Annotations: mergeLabels(baseRule.Annotations, data.Labels{
|
||||
"Error": datasourceError.Error(),
|
||||
}),
|
||||
},
|
||||
},
|
||||
},
|
||||
t2: {
|
||||
{
|
||||
PreviousState: eval.Error,
|
||||
State: &State{
|
||||
CacheID: labels["system + rule"].Fingerprint(),
|
||||
Labels: labels["system + rule + datasource-error"],
|
||||
State: eval.Error,
|
||||
Error: genericError,
|
||||
LatestResult: newEvaluation(t2, eval.Error),
|
||||
StartsAt: t1,
|
||||
EndsAt: t2.Add(ResendDelay * 4),
|
||||
LastEvaluationTime: t2,
|
||||
Annotations: mergeLabels(baseRule.Annotations, data.Labels{
|
||||
"Error": genericError.Error(),
|
||||
}),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
ngmodels.AlertingErrState: {
|
||||
t1: {
|
||||
{
|
||||
PreviousState: eval.Normal,
|
||||
State: &State{
|
||||
Labels: labels["system + rule"],
|
||||
State: eval.Alerting,
|
||||
StateReason: eval.Error.String(),
|
||||
Error: datasourceError,
|
||||
LatestResult: newEvaluation(t1, eval.Error),
|
||||
StartsAt: t1,
|
||||
EndsAt: t1.Add(ResendDelay * 4),
|
||||
LastEvaluationTime: t1,
|
||||
},
|
||||
},
|
||||
},
|
||||
t2: {
|
||||
{
|
||||
PreviousState: eval.Alerting,
|
||||
PreviousStateReason: eval.Error.String(),
|
||||
State: &State{
|
||||
Labels: labels["system + rule"],
|
||||
State: eval.Alerting,
|
||||
StateReason: eval.Error.String(),
|
||||
Error: genericError,
|
||||
LatestResult: newEvaluation(t2, eval.Error),
|
||||
StartsAt: t1,
|
||||
EndsAt: t2.Add(ResendDelay * 4),
|
||||
LastEvaluationTime: t2,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
ngmodels.OkErrState: {
|
||||
t1: {
|
||||
{
|
||||
PreviousState: eval.Normal,
|
||||
State: &State{
|
||||
Labels: labels["system + rule"],
|
||||
State: eval.Normal,
|
||||
StateReason: eval.Error.String(),
|
||||
LatestResult: newEvaluation(t1, eval.Error),
|
||||
StartsAt: t1,
|
||||
EndsAt: t1,
|
||||
LastEvaluationTime: t1,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
ngmodels.KeepLastErrState: {
|
||||
t1: {
|
||||
{
|
||||
PreviousState: eval.Normal,
|
||||
State: &State{
|
||||
Labels: labels["system + rule"],
|
||||
State: eval.Normal,
|
||||
StateReason: ngmodels.ConcatReasons(eval.Error.String(), ngmodels.StateReasonKeepLast),
|
||||
LatestResult: newEvaluation(t1, eval.Error),
|
||||
StartsAt: t1,
|
||||
EndsAt: t1,
|
||||
LastEvaluationTime: t1,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
|
@ -1244,6 +1244,7 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
EndsAt: tn(6).Add(state.ResendDelay * 4),
|
||||
LastEvaluationTime: tn(6),
|
||||
LastSentAt: util.Pointer(tn(6)), // After 30s resend delay, last sent at is t6.
|
||||
Annotations: map[string]string{"annotation": "test", "Error": "with_state_error"},
|
||||
},
|
||||
},
|
||||
},
|
||||
|
@ -141,6 +141,28 @@ func (a *State) Maintain(interval int64, evaluatedAt time.Time) {
|
||||
a.EndsAt = nextEndsTime(interval, evaluatedAt)
|
||||
}
|
||||
|
||||
// AddErrorAnnotations adds annotations to the state to indicate that an error occurred.
|
||||
func (a *State) AddErrorAnnotations(err error, rule *models.AlertRule) {
|
||||
if err == nil {
|
||||
return
|
||||
}
|
||||
|
||||
a.Annotations["Error"] = err.Error()
|
||||
// If the evaluation failed because a query returned an error then add the Ref ID and
|
||||
// Datasource UID as labels
|
||||
var utilError errutil.Error
|
||||
if errors.As(a.Error, &utilError) &&
|
||||
(errors.Is(a.Error, expr.QueryError) || errors.Is(a.Error, expr.ConversionError)) {
|
||||
for _, next := range rule.Data {
|
||||
if next.RefID == utilError.PublicPayload["refId"].(string) {
|
||||
a.Labels["ref_id"] = next.RefID
|
||||
a.Labels["datasource_uid"] = next.DatasourceUID
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// IsNormalStateWithNoReason returns true if the state is Normal and reason is empty
|
||||
func IsNormalStateWithNoReason(s *State) bool {
|
||||
return s.State == eval.Normal && s.StateReason == ""
|
||||
@ -272,6 +294,8 @@ func resultError(state *State, rule *models.AlertRule, result eval.Result, logge
|
||||
case models.ErrorErrState:
|
||||
if state.State == eval.Error {
|
||||
prevEndsAt := state.EndsAt
|
||||
state.Error = result.Error
|
||||
state.AddErrorAnnotations(result.Error, rule)
|
||||
state.Maintain(rule.IntervalSeconds, result.EvaluatedAt)
|
||||
logger.Debug("Keeping state",
|
||||
"state",
|
||||
@ -293,23 +317,7 @@ func resultError(state *State, rule *models.AlertRule, result eval.Result, logge
|
||||
"next_ends_at",
|
||||
nextEndsAt)
|
||||
state.SetError(result.Error, result.EvaluatedAt, nextEndsAt)
|
||||
|
||||
if result.Error != nil {
|
||||
state.Annotations["Error"] = result.Error.Error()
|
||||
// If the evaluation failed because a query returned an error then add the Ref ID and
|
||||
// Datasource UID as labels
|
||||
var utilError errutil.Error
|
||||
if errors.As(state.Error, &utilError) &&
|
||||
(errors.Is(state.Error, expr.QueryError) || errors.Is(state.Error, expr.ConversionError)) {
|
||||
for _, next := range rule.Data {
|
||||
if next.RefID == utilError.PublicPayload["refId"].(string) {
|
||||
state.Labels["ref_id"] = next.RefID
|
||||
state.Labels["datasource_uid"] = next.DatasourceUID
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
state.AddErrorAnnotations(result.Error, rule)
|
||||
}
|
||||
case models.OkErrState:
|
||||
logger.Debug("Execution error state is Normal", "handler", "resultNormal", "previous_handler", handlerStr)
|
||||
|
Loading…
Reference in New Issue
Block a user