Alerting: Respect "For" Duration for NoData alerts (#65574)

* Alerting: Respect "For" Duration for NoData alerts

This change modifies `resultNoData` to be more in line with the logic of the other state handlers.

The main effects of this are:

1) NoData states with NoDataState config set to Alerting will respect "For" duration.
2) Prevents zero value in StartsAt and EndsAt for alerts that have only ever been in normal state. This includes state transitions from NoDataState=OK and ExecErrState=OK.
3) Better state transition logging.
This commit is contained in:
Matthew Jacobson 2023-03-31 12:05:15 -04:00 committed by GitHub
parent ed8628e39d
commit b9dc04139a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 152 additions and 104 deletions

View File

@ -84,11 +84,11 @@ Configure alerting behavior in the absence of data using information in the foll
| No Data Option | Description |
| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
| No Data | Create a new alert `DatasourceNoData` with the name and UID of the alert rule, and UID of the datasource that returned no data as labels. |
| Alerting | Set alert rule state to `Alerting`. |
| Alerting | Set alert rule state to `Alerting`. This option will respect the configured **Evaluate for** pending period. |
| Ok | Set alert rule state to `Normal`. |
| Error or timeout option | Description |
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
| Alerting | Set alert rule state to `Alerting`. From Grafana 8.5, the alert rule waits for the entire duration for which the condition is true before firing. |
| OK | Set alert rule state to `Normal` |
| re | Error | Create a new alert `DatasourceError` with the name and UID of the alert rule, and UID of the datasource that returned no data as labels. |
| Error or timeout option | Description |
| ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
| Error | Create a new alert `DatasourceError` with the name and UID of the alert rule, and UID of the datasource that returned no data as labels. |
| Alerting | Set alert rule state to `Alerting`. This option will respect the configured **Evaluate for** pending period. |
| OK | Set alert rule state to `Normal` |

View File

@ -126,8 +126,8 @@ func (rs *ruleStates) getOrCreate(ctx context.Context, log log.Logger, alertRule
return state
}
// If the first result we get is alerting, set StartsAt to EvaluatedAt because we
// do not have data for determining StartsAt otherwise
// For new states, we set StartsAt & EndsAt to EvaluatedAt as this is the
// expected value for a Normal state during state transition.
newState := &State{
AlertRuleUID: alertRule.UID,
OrgID: alertRule.OrgID,
@ -136,9 +136,8 @@ func (rs *ruleStates) getOrCreate(ctx context.Context, log log.Logger, alertRule
Annotations: annotations,
EvaluationDuration: result.EvaluationDuration,
Values: values,
}
if result.State == eval.Alerting {
newState.StartsAt = result.EvaluatedAt
StartsAt: result.EvaluatedAt,
EndsAt: result.EvaluatedAt,
}
rs.states[id] = newState
return newState

View File

@ -330,6 +330,8 @@ func TestProcessEvalResults(t *testing.T) {
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime,
EndsAt: evaluationTime,
LastEvaluationTime: evaluationTime,
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},
@ -385,6 +387,8 @@ func TestProcessEvalResults(t *testing.T) {
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime,
EndsAt: evaluationTime,
LastEvaluationTime: evaluationTime,
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},
@ -472,6 +476,8 @@ func TestProcessEvalResults(t *testing.T) {
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime,
EndsAt: evaluationTime,
LastEvaluationTime: evaluationTime.Add(1 * time.Minute),
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},
@ -784,7 +790,7 @@ func TestProcessEvalResults(t *testing.T) {
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime.Add(20 * time.Second),
StartsAt: evaluationTime.Add(30 * time.Second),
EndsAt: evaluationTime.Add(30 * time.Second).Add(state.ResendDelay * 3),
LastEvaluationTime: evaluationTime.Add(30 * time.Second),
EvaluationDuration: evaluationDuration,
@ -923,7 +929,7 @@ func TestProcessEvalResults(t *testing.T) {
},
},
{
desc: "normal -> alerting when result is NoData and NoDataState is alerting",
desc: "normal -> pending when For is set but not exceeded, result is NoData and NoDataState is alerting",
alertRule: &models.AlertRule{
OrgID: 1,
Title: "test_title",
@ -932,6 +938,7 @@ func TestProcessEvalResults(t *testing.T) {
Annotations: map[string]string{"annotation": "test"},
Labels: map[string]string{"label": "test"},
IntervalSeconds: 10,
For: 1 * time.Minute,
NoDataState: models.Alerting,
},
evalResults: []eval.Results{
@ -966,7 +973,7 @@ func TestProcessEvalResults(t *testing.T) {
"instance_label": "test",
},
Values: make(map[string]float64),
State: eval.Alerting,
State: eval.Pending,
StateReason: eval.NoData.String(),
Results: []state.Evaluation{
{
@ -988,6 +995,102 @@ func TestProcessEvalResults(t *testing.T) {
},
},
},
{
desc: "normal -> alerting when For is exceeded, result is NoData and NoDataState is alerting",
alertRule: &models.AlertRule{
OrgID: 1,
Title: "test_title",
UID: "test_alert_rule_uid_2",
NamespaceUID: "test_namespace_uid",
Annotations: map[string]string{"annotation": "test"},
Labels: map[string]string{"label": "test"},
IntervalSeconds: 10,
For: 30 * time.Second,
NoDataState: models.Alerting,
},
evalResults: []eval.Results{
{
eval.Result{
Instance: data.Labels{"instance_label": "test"},
State: eval.Normal,
EvaluatedAt: evaluationTime,
EvaluationDuration: evaluationDuration,
},
},
{
eval.Result{
Instance: data.Labels{"instance_label": "test"},
State: eval.NoData,
EvaluatedAt: evaluationTime.Add(10 * time.Second),
EvaluationDuration: evaluationDuration,
},
},
{
eval.Result{
Instance: data.Labels{"instance_label": "test"},
State: eval.NoData,
EvaluatedAt: evaluationTime.Add(20 * time.Second),
EvaluationDuration: evaluationDuration,
},
},
{
eval.Result{
Instance: data.Labels{"instance_label": "test"},
State: eval.NoData,
EvaluatedAt: evaluationTime.Add(30 * time.Second),
EvaluationDuration: evaluationDuration,
},
},
{
eval.Result{
Instance: data.Labels{"instance_label": "test"},
State: eval.NoData,
EvaluatedAt: evaluationTime.Add(40 * time.Second),
EvaluationDuration: evaluationDuration,
},
},
},
expectedAnnotations: 2,
expectedStates: map[string]*state.State{
`[["__alert_rule_namespace_uid__","test_namespace_uid"],["__alert_rule_uid__","test_alert_rule_uid_2"],["alertname","test_title"],["instance_label","test"],["label","test"]]`: {
AlertRuleUID: "test_alert_rule_uid_2",
OrgID: 1,
CacheID: `[["__alert_rule_namespace_uid__","test_namespace_uid"],["__alert_rule_uid__","test_alert_rule_uid_2"],["alertname","test_title"],["instance_label","test"],["label","test"]]`,
Labels: data.Labels{
"__alert_rule_namespace_uid__": "test_namespace_uid",
"__alert_rule_uid__": "test_alert_rule_uid_2",
"alertname": "test_title",
"label": "test",
"instance_label": "test",
},
Values: make(map[string]float64),
State: eval.Alerting,
StateReason: eval.NoData.String(),
Results: []state.Evaluation{
{
EvaluationTime: evaluationTime.Add(20 * time.Second),
EvaluationState: eval.NoData,
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(30 * time.Second),
EvaluationState: eval.NoData,
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(40 * time.Second),
EvaluationState: eval.NoData,
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime.Add(40 * time.Second),
EndsAt: evaluationTime.Add(40 * time.Second).Add(state.ResendDelay * 3),
LastEvaluationTime: evaluationTime.Add(40 * time.Second),
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},
},
},
},
{
desc: "normal -> nodata when result is NoData and NoDataState is nodata",
alertRule: &models.AlertRule{
@ -1105,8 +1208,8 @@ func TestProcessEvalResults(t *testing.T) {
Values: make(map[string]*float64),
},
},
StartsAt: time.Time{},
EndsAt: time.Time{},
StartsAt: evaluationTime,
EndsAt: evaluationTime,
LastEvaluationTime: evaluationTime,
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},
@ -1196,8 +1299,8 @@ func TestProcessEvalResults(t *testing.T) {
Values: make(map[string]*float64),
},
},
StartsAt: time.Time{},
EndsAt: time.Time{},
StartsAt: evaluationTime,
EndsAt: evaluationTime,
LastEvaluationTime: evaluationTime,
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},
@ -1222,8 +1325,8 @@ func TestProcessEvalResults(t *testing.T) {
Values: make(map[string]*float64),
},
},
StartsAt: time.Time{},
EndsAt: time.Time{},
StartsAt: evaluationTime,
EndsAt: evaluationTime,
LastEvaluationTime: evaluationTime,
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},
@ -1320,8 +1423,8 @@ func TestProcessEvalResults(t *testing.T) {
Values: make(map[string]*float64),
},
},
StartsAt: time.Time{},
EndsAt: time.Time{},
StartsAt: evaluationTime,
EndsAt: evaluationTime,
LastEvaluationTime: evaluationTime.Add(20 * time.Second),
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},
@ -1411,76 +1514,8 @@ func TestProcessEvalResults(t *testing.T) {
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime.Add(10 * time.Second),
EndsAt: evaluationTime.Add(10 * time.Second).Add(state.ResendDelay * 3),
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},
},
},
},
{
desc: "EndsAt set correctly. normal -> alerting when result is NoData and NoDataState is alerting and For is set and For is breached",
alertRule: &models.AlertRule{
OrgID: 1,
Title: "test_title",
UID: "test_alert_rule_uid_2",
NamespaceUID: "test_namespace_uid",
Annotations: map[string]string{"annotation": "test"},
Labels: map[string]string{"label": "test"},
IntervalSeconds: 10,
For: 1 * time.Minute,
NoDataState: models.Alerting,
},
evalResults: []eval.Results{
{
eval.Result{
Instance: data.Labels{"instance_label": "test"},
State: eval.Normal,
EvaluatedAt: evaluationTime,
EvaluationDuration: evaluationDuration,
},
},
{
eval.Result{
Instance: data.Labels{"instance_label": "test"},
State: eval.NoData,
EvaluatedAt: evaluationTime.Add(10 * time.Second),
EvaluationDuration: evaluationDuration,
},
},
},
expectedAnnotations: 1,
expectedStates: map[string]*state.State{
`[["__alert_rule_namespace_uid__","test_namespace_uid"],["__alert_rule_uid__","test_alert_rule_uid_2"],["alertname","test_title"],["instance_label","test"],["label","test"]]`: {
AlertRuleUID: "test_alert_rule_uid_2",
OrgID: 1,
CacheID: `[["__alert_rule_namespace_uid__","test_namespace_uid"],["__alert_rule_uid__","test_alert_rule_uid_2"],["alertname","test_title"],["instance_label","test"],["label","test"]]`,
Labels: data.Labels{
"__alert_rule_namespace_uid__": "test_namespace_uid",
"__alert_rule_uid__": "test_alert_rule_uid_2",
"alertname": "test_title",
"label": "test",
"instance_label": "test",
},
Values: make(map[string]float64),
State: eval.Alerting,
StateReason: eval.NoData.String(),
Results: []state.Evaluation{
{
EvaluationTime: evaluationTime,
EvaluationState: eval.Normal,
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationState: eval.NoData,
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime.Add(10 * time.Second),
EndsAt: evaluationTime.Add(10 * time.Second).Add(state.ResendDelay * 3),
StartsAt: evaluationTime,
EndsAt: evaluationTime,
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},
@ -1800,6 +1835,8 @@ func TestProcessEvalResults(t *testing.T) {
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime,
EndsAt: evaluationTime,
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},
@ -2147,7 +2184,7 @@ func TestProcessEvalResults(t *testing.T) {
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime.Add(30 * time.Second),
StartsAt: evaluationTime.Add(50 * time.Second),
EndsAt: evaluationTime.Add(50 * time.Second).Add(state.ResendDelay * 3),
LastEvaluationTime: evaluationTime.Add(50 * time.Second),
EvaluationDuration: evaluationDuration,
@ -2200,6 +2237,8 @@ func TestProcessEvalResults(t *testing.T) {
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime,
EndsAt: evaluationTime,
LastEvaluationTime: evaluationTime,
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"summary": "grafana is down in us-central-1 cluster -> prod namespace"},
@ -2372,6 +2411,8 @@ func TestStaleResultsHandler(t *testing.T) {
Condition: "A",
},
},
StartsAt: evaluationTime,
EndsAt: evaluationTime,
LastEvaluationTime: evaluationTime,
EvaluationDuration: 0,
Annotations: map[string]string{"testAnnoKey": "testAnnoValue"},

View File

@ -259,21 +259,29 @@ func resultError(state *State, rule *models.AlertRule, result eval.Result, logge
}
}
func resultNoData(state *State, rule *models.AlertRule, result eval.Result, _ log.Logger) {
state.Error = result.Error
if state.StartsAt.IsZero() {
state.StartsAt = result.EvaluatedAt
}
state.EndsAt = nextEndsTime(rule.IntervalSeconds, result.EvaluatedAt)
// resultNoData updates state for an evaluation whose result was NoData,
// according to the rule's configured NoDataState:
//
//   - models.Alerting: delegates to resultAlerting, then records NoData as the
//     state reason.
//   - models.NoData: keeps an existing NoData state alive via Maintain, or
//     transitions into NoData via SetNoData on the first occurrence.
//   - models.OK: delegates to resultNormal, then records NoData as the state
//     reason.
//   - anything else: records an "unsupported no data state" error on the state.
//
// state.State must NOT be assigned before delegating or before the NoData
// comparison: the delegated handlers and the comparison below both need to
// observe the previous state to decide between keeping and changing it.
func resultNoData(state *State, rule *models.AlertRule, result eval.Result, logger log.Logger) {
	switch rule.NoDataState {
	case models.Alerting:
		logger.Debug("Execution no data state is Alerting", "handler", "resultAlerting", "previous_handler", "resultNoData")
		// Let resultAlerting see the previous state so that its own transition
		// handling (including any pending period it applies) is not bypassed.
		resultAlerting(state, rule, result, logger)
		state.StateReason = models.NoData.String()
	case models.NoData:
		if state.State == eval.NoData {
			logger.Debug("Keeping state", "state", state.State)
			state.Maintain(rule.IntervalSeconds, result.EvaluatedAt)
		} else {
			// This is the first occurrence of no data
			logger.Debug("Changing state", "previous_state", state.State, "next_state", eval.NoData)
			state.SetNoData("", result.EvaluatedAt, nextEndsTime(rule.IntervalSeconds, result.EvaluatedAt))
		}
	case models.OK:
		logger.Debug("Execution no data state is Normal", "handler", "resultNormal", "previous_handler", "resultNoData")
		// As above: resultNormal decides the transition from the previous state.
		resultNormal(state, rule, result, logger)
		state.StateReason = models.NoData.String()
	default:
		err := fmt.Errorf("unsupported no data state: %s", rule.NoDataState)
		state.SetError(err, state.StartsAt, nextEndsTime(rule.IntervalSeconds, result.EvaluatedAt))
		state.Annotations["Error"] = err.Error()
	}
}