Alerting: Clear alerting rule evaluation errors after intermittent failures (#42386)

* Alerting: Clear alerting rule evaluation errors after intermittent failures

When an alert transitioned through `alerting -> error -> (alerting|nodata)`, the error recorded by the `error` state was never cleared, so the API and UI kept showing the rule's health as an error.
This commit is contained in:
gotjosh 2021-11-26 17:58:19 +00:00 committed by GitHub
parent 725dbf8d95
commit dd5a2e5128
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 188 additions and 1 deletions

View File

@ -3,6 +3,7 @@ package state_test
import (
"context"
"errors"
"fmt"
"testing"
"time"
@ -1254,6 +1255,187 @@ func TestProcessEvalResults(t *testing.T) {
},
},
},
{
desc: "normal -> alerting -> error -> alerting - it should clear the error",
alertRule: &models.AlertRule{
OrgID: 1,
Title: "test_title",
UID: "test_alert_rule_uid_2",
NamespaceUID: "test_namespace_uid",
Annotations: map[string]string{"annotation": "test"},
Labels: map[string]string{"label": "test"},
IntervalSeconds: 10,
For: 30 * time.Second,
},
evalResults: []eval.Results{
{
eval.Result{
Instance: data.Labels{"instance_label": "test"},
State: eval.Normal,
EvaluatedAt: evaluationTime,
EvaluationDuration: evaluationDuration,
},
},
{
eval.Result{
Instance: data.Labels{"instance_label": "test"},
State: eval.Alerting,
EvaluatedAt: evaluationTime.Add(30 * time.Second),
EvaluationDuration: evaluationDuration,
},
},
{
eval.Result{
Instance: data.Labels{"instance_label": "test"},
State: eval.Error,
Error: fmt.Errorf("Failed to query data"),
EvaluatedAt: evaluationTime.Add(40 * time.Second),
EvaluationDuration: evaluationDuration,
},
},
{
eval.Result{
Instance: data.Labels{"instance_label": "test"},
State: eval.Alerting,
EvaluatedAt: evaluationTime.Add(70 * time.Second),
EvaluationDuration: evaluationDuration,
},
},
},
expectedStates: map[string]*state.State{
`[["__alert_rule_namespace_uid__","test_namespace_uid"],["__alert_rule_uid__","test_alert_rule_uid_2"],["alertname","test_title"],["instance_label","test"],["label","test"]]`: {
AlertRuleUID: "test_alert_rule_uid_2",
OrgID: 1,
CacheId: `[["__alert_rule_namespace_uid__","test_namespace_uid"],["__alert_rule_uid__","test_alert_rule_uid_2"],["alertname","test_title"],["instance_label","test"],["label","test"]]`,
Labels: data.Labels{
"__alert_rule_namespace_uid__": "test_namespace_uid",
"__alert_rule_uid__": "test_alert_rule_uid_2",
"alertname": "test_title",
"label": "test",
"instance_label": "test",
},
State: eval.Alerting,
Results: []state.Evaluation{
{
EvaluationTime: evaluationTime,
EvaluationState: eval.Normal,
Values: make(map[string]state.EvaluationValue),
},
{
EvaluationTime: evaluationTime.Add(30 * time.Second),
EvaluationState: eval.Alerting,
Values: make(map[string]state.EvaluationValue),
},
{
EvaluationTime: evaluationTime.Add(40 * time.Second),
EvaluationState: eval.Error,
Values: make(map[string]state.EvaluationValue),
},
{
EvaluationTime: evaluationTime.Add(70 * time.Second),
EvaluationState: eval.Alerting,
Values: make(map[string]state.EvaluationValue),
},
},
StartsAt: evaluationTime.Add(70 * time.Second),
EndsAt: evaluationTime.Add(70 * time.Second).Add(state.ResendDelay * 3),
LastEvaluationTime: evaluationTime.Add(70 * time.Second),
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},
},
},
},
{
desc: "normal -> alerting -> error -> no data - it should clear the error",
alertRule: &models.AlertRule{
OrgID: 1,
Title: "test_title",
UID: "test_alert_rule_uid_2",
NamespaceUID: "test_namespace_uid",
Annotations: map[string]string{"annotation": "test"},
Labels: map[string]string{"label": "test"},
IntervalSeconds: 10,
For: 30 * time.Second,
NoDataState: models.NoData,
},
evalResults: []eval.Results{
{
eval.Result{
Instance: data.Labels{"instance_label": "test"},
State: eval.Normal,
EvaluatedAt: evaluationTime,
EvaluationDuration: evaluationDuration,
},
},
{
eval.Result{
Instance: data.Labels{"instance_label": "test"},
State: eval.Alerting,
EvaluatedAt: evaluationTime.Add(30 * time.Second),
EvaluationDuration: evaluationDuration,
},
},
{
eval.Result{
Instance: data.Labels{"instance_label": "test"},
State: eval.Error,
Error: fmt.Errorf("Failed to query data"),
EvaluatedAt: evaluationTime.Add(40 * time.Second),
EvaluationDuration: evaluationDuration,
},
},
{
eval.Result{
Instance: data.Labels{"instance_label": "test"},
State: eval.NoData,
EvaluatedAt: evaluationTime.Add(50 * time.Second),
EvaluationDuration: evaluationDuration,
},
},
},
expectedStates: map[string]*state.State{
`[["__alert_rule_namespace_uid__","test_namespace_uid"],["__alert_rule_uid__","test_alert_rule_uid_2"],["alertname","test_title"],["instance_label","test"],["label","test"]]`: {
AlertRuleUID: "test_alert_rule_uid_2",
OrgID: 1,
CacheId: `[["__alert_rule_namespace_uid__","test_namespace_uid"],["__alert_rule_uid__","test_alert_rule_uid_2"],["alertname","test_title"],["instance_label","test"],["label","test"]]`,
Labels: data.Labels{
"__alert_rule_namespace_uid__": "test_namespace_uid",
"__alert_rule_uid__": "test_alert_rule_uid_2",
"alertname": "test_title",
"label": "test",
"instance_label": "test",
},
State: eval.NoData,
Results: []state.Evaluation{
{
EvaluationTime: evaluationTime,
EvaluationState: eval.Normal,
Values: make(map[string]state.EvaluationValue),
},
{
EvaluationTime: evaluationTime.Add(30 * time.Second),
EvaluationState: eval.Alerting,
Values: make(map[string]state.EvaluationValue),
},
{
EvaluationTime: evaluationTime.Add(40 * time.Second),
EvaluationState: eval.Error,
Values: make(map[string]state.EvaluationValue),
},
{
EvaluationTime: evaluationTime.Add(50 * time.Second),
EvaluationState: eval.NoData,
Values: make(map[string]state.EvaluationValue),
},
},
StartsAt: evaluationTime.Add(30 * time.Second),
EndsAt: evaluationTime.Add(50 * time.Second).Add(state.ResendDelay * 3),
LastEvaluationTime: evaluationTime.Add(50 * time.Second),
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},
},
},
},
{
desc: "template is correctly expanded",
alertRule: &models.AlertRule{

View File

@ -57,15 +57,18 @@ func NewEvaluationValues(m map[string]eval.NumberValueCapture) map[string]Evalua
}
// resultNormal applies a Normal evaluation result to the state.
//
// The error is copied from the result before anything else, so that an error
// recorded by a previous evaluation does not linger on the state (result.Error
// is expected to be nil here, since the result is not an error). When the
// state was anything other than Normal, both StartsAt and EndsAt are reset to
// the time of this evaluation. Finally the state is set to Normal.
//
// alertRule is not used by this transition.
func (a *State) resultNormal(alertRule *ngModels.AlertRule, result eval.Result) {
	a.Error = result.Error // should be nil since state is not error

	// Leaving a non-Normal state: restart the state's time window at this evaluation.
	if a.State != eval.Normal {
		a.EndsAt = result.EvaluatedAt
		a.StartsAt = result.EvaluatedAt
	}
	a.State = eval.Normal
}
func (a *State) resultAlerting(alertRule *ngModels.AlertRule, result eval.Result) {
a.Error = result.Error // should be nil since the state is not an error
switch a.State {
case eval.Alerting:
a.setEndsAt(alertRule, result)
@ -118,6 +121,8 @@ func (a *State) resultError(alertRule *ngModels.AlertRule, result eval.Result) {
}
func (a *State) resultNoData(alertRule *ngModels.AlertRule, result eval.Result) {
a.Error = result.Error
if a.StartsAt.IsZero() {
a.StartsAt = result.EvaluatedAt
}