Alerting: fix updating the error in the alert rule state during error-to-error transitions and restarts (#89557)

Alerting: fix preserving errors in the alert rule state during error-to-error transitions

A state transition from one error to another did not update state.Error correctly:
the error in state.Error remained the first one encountered. This led to a second
issue: after a Grafana restart the error was lost entirely, because the state of
the alert rule had not changed, yet the Error field is not preserved in the database
between restarts.

This could happen if the expression service returned an error or the alert routine panicked
during querying.
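
For illustration, a minimal, runnable sketch of the behavior this fix restores; the types and the handleError function below are simplified stand-ins, not the real ngalert state APIs:

```go
package main

import (
	"errors"
	"fmt"
)

// State is a simplified stand-in for the alert rule state; field names only
// loosely mirror the real struct.
type State struct {
	Current     string
	Error       error
	Annotations map[string]string
}

// handleError mirrors the fixed error-to-error branch: even when the rule is
// already in the Error state, the latest error and its "Error" annotation are
// recorded instead of keeping the first error that was seen.
func handleError(s *State, resultErr error) {
	if s.Current == "Error" {
		// Before the fix this branch only refreshed timestamps, so s.Error
		// and the "Error" annotation kept pointing at the initial error.
		s.Error = resultErr
		s.Annotations["Error"] = resultErr.Error()
		return
	}
	s.Current = "Error"
	s.Error = resultErr
	s.Annotations["Error"] = resultErr.Error()
}

func main() {
	s := &State{Current: "Normal", Annotations: map[string]string{}}
	handleError(s, errors.New("query error"))    // Normal -> Error
	handleError(s, errors.New("generic error"))  // Error -> Error transition
	fmt.Println(s.Error, s.Annotations["Error"]) // both now report "generic error"
}
```

In the actual change below, the annotation logic is also factored into a State.AddErrorAnnotations helper so that both branches of the error handler share it.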
Alexander Akhmetov 2024-06-25 09:42:00 +02:00 committed by GitHub
parent 3a29f68d0c
commit 2035814059
3 changed files with 143 additions and 17 deletions


@@ -3521,6 +3521,123 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
				},
			},
		},
		{
			desc: "t1[QueryError] t2[GenericError] at t1, t2",
			results: map[time.Time]eval.Results{
				t1: {
					newResult(eval.WithError(datasourceError)),
				},
				t2: {
					newResult(eval.WithError(genericError)),
				},
			},
			expectedTransitions: map[ngmodels.ExecutionErrorState]map[time.Time][]StateTransition{
				ngmodels.ErrorErrState: {
					t1: {
						{
							PreviousState: eval.Normal,
							State: &State{
								CacheID: labels["system + rule"].Fingerprint(),
								Labels: labels["system + rule + datasource-error"],
								State: eval.Error,
								Error: datasourceError,
								LatestResult: newEvaluation(t1, eval.Error),
								StartsAt: t1,
								EndsAt: t1.Add(ResendDelay * 4),
								LastEvaluationTime: t1,
								Annotations: mergeLabels(baseRule.Annotations, data.Labels{
									"Error": datasourceError.Error(),
								}),
							},
						},
					},
					t2: {
						{
							PreviousState: eval.Error,
							State: &State{
								CacheID: labels["system + rule"].Fingerprint(),
								Labels: labels["system + rule + datasource-error"],
								State: eval.Error,
								Error: genericError,
								LatestResult: newEvaluation(t2, eval.Error),
								StartsAt: t1,
								EndsAt: t2.Add(ResendDelay * 4),
								LastEvaluationTime: t2,
								Annotations: mergeLabels(baseRule.Annotations, data.Labels{
									"Error": genericError.Error(),
								}),
							},
						},
					},
				},
				ngmodels.AlertingErrState: {
					t1: {
						{
							PreviousState: eval.Normal,
							State: &State{
								Labels: labels["system + rule"],
								State: eval.Alerting,
								StateReason: eval.Error.String(),
								Error: datasourceError,
								LatestResult: newEvaluation(t1, eval.Error),
								StartsAt: t1,
								EndsAt: t1.Add(ResendDelay * 4),
								LastEvaluationTime: t1,
							},
						},
					},
					t2: {
						{
							PreviousState: eval.Alerting,
							PreviousStateReason: eval.Error.String(),
							State: &State{
								Labels: labels["system + rule"],
								State: eval.Alerting,
								StateReason: eval.Error.String(),
								Error: genericError,
								LatestResult: newEvaluation(t2, eval.Error),
								StartsAt: t1,
								EndsAt: t2.Add(ResendDelay * 4),
								LastEvaluationTime: t2,
							},
						},
					},
				},
				ngmodels.OkErrState: {
					t1: {
						{
							PreviousState: eval.Normal,
							State: &State{
								Labels: labels["system + rule"],
								State: eval.Normal,
								StateReason: eval.Error.String(),
								LatestResult: newEvaluation(t1, eval.Error),
								StartsAt: t1,
								EndsAt: t1,
								LastEvaluationTime: t1,
							},
						},
					},
				},
				ngmodels.KeepLastErrState: {
					t1: {
						{
							PreviousState: eval.Normal,
							State: &State{
								Labels: labels["system + rule"],
								State: eval.Normal,
								StateReason: ngmodels.ConcatReasons(eval.Error.String(), ngmodels.StateReasonKeepLast),
								LatestResult: newEvaluation(t1, eval.Error),
								StartsAt: t1,
								EndsAt: t1,
								LastEvaluationTime: t1,
							},
						},
					},
				},
			},
		},
	}
	for _, tc := range testCases {


@@ -1244,6 +1244,7 @@ func TestProcessEvalResults(t *testing.T) {
					EndsAt: tn(6).Add(state.ResendDelay * 4),
					LastEvaluationTime: tn(6),
					LastSentAt: util.Pointer(tn(6)), // After 30s resend delay, last sent at is t6.
					Annotations: map[string]string{"annotation": "test", "Error": "with_state_error"},
				},
			},
		},


@@ -141,6 +141,28 @@ func (a *State) Maintain(interval int64, evaluatedAt time.Time) {
	a.EndsAt = nextEndsTime(interval, evaluatedAt)
}

// AddErrorAnnotations adds annotations to the state to indicate that an error occurred.
func (a *State) AddErrorAnnotations(err error, rule *models.AlertRule) {
	if err == nil {
		return
	}
	a.Annotations["Error"] = err.Error()
	// If the evaluation failed because a query returned an error then add the Ref ID and
	// Datasource UID as labels
	var utilError errutil.Error
	if errors.As(a.Error, &utilError) &&
		(errors.Is(a.Error, expr.QueryError) || errors.Is(a.Error, expr.ConversionError)) {
		for _, next := range rule.Data {
			if next.RefID == utilError.PublicPayload["refId"].(string) {
				a.Labels["ref_id"] = next.RefID
				a.Labels["datasource_uid"] = next.DatasourceUID
				break
			}
		}
	}
}

// IsNormalStateWithNoReason returns true if the state is Normal and reason is empty
func IsNormalStateWithNoReason(s *State) bool {
	return s.State == eval.Normal && s.StateReason == ""
@@ -272,6 +294,8 @@ func resultError(state *State, rule *models.AlertRule, result eval.Result, logge
	case models.ErrorErrState:
		if state.State == eval.Error {
			prevEndsAt := state.EndsAt
			state.Error = result.Error
			state.AddErrorAnnotations(result.Error, rule)
			state.Maintain(rule.IntervalSeconds, result.EvaluatedAt)
			logger.Debug("Keeping state",
				"state",
@@ -293,23 +317,7 @@ func resultError(state *State, rule *models.AlertRule, result eval.Result, logge
				"next_ends_at",
				nextEndsAt)
			state.SetError(result.Error, result.EvaluatedAt, nextEndsAt)
			if result.Error != nil {
				state.Annotations["Error"] = result.Error.Error()
				// If the evaluation failed because a query returned an error then add the Ref ID and
				// Datasource UID as labels
				var utilError errutil.Error
				if errors.As(state.Error, &utilError) &&
					(errors.Is(state.Error, expr.QueryError) || errors.Is(state.Error, expr.ConversionError)) {
					for _, next := range rule.Data {
						if next.RefID == utilError.PublicPayload["refId"].(string) {
							state.Labels["ref_id"] = next.RefID
							state.Labels["datasource_uid"] = next.DatasourceUID
							break
						}
					}
				}
			}
			state.AddErrorAnnotations(result.Error, rule)
		}
	case models.OkErrState:
		logger.Debug("Execution error state is Normal", "handler", "resultNormal", "previous_handler", handlerStr)